1 package org.unicode.cldr.tool; 2 3 import java.io.IOException; 4 import java.io.PrintWriter; 5 import java.io.Writer; 6 import java.util.ArrayList; 7 import java.util.Arrays; 8 import java.util.Collection; 9 import java.util.Collections; 10 import java.util.HashSet; 11 import java.util.LinkedHashSet; 12 import java.util.List; 13 import java.util.Map; 14 import java.util.Map.Entry; 15 import java.util.Set; 16 import java.util.TreeMap; 17 import java.util.TreeSet; 18 import java.util.function.Consumer; 19 import java.util.function.Function; 20 21 import org.apache.jena.query.QuerySolution; 22 import org.apache.jena.query.ResultSet; 23 import org.unicode.cldr.draft.FileUtilities; 24 import org.unicode.cldr.rdf.QueryClient; 25 import org.unicode.cldr.rdf.TsvWriter; 26 import org.unicode.cldr.util.CLDRConfig; 27 import org.unicode.cldr.util.CLDRFile; 28 import org.unicode.cldr.util.CLDRPaths; 29 import org.unicode.cldr.util.Containment; 30 import org.unicode.cldr.util.DtdType; 31 import org.unicode.cldr.util.Iso639Data; 32 import org.unicode.cldr.util.Iso639Data.Type; 33 import org.unicode.cldr.util.SimpleXMLSource; 34 import org.unicode.cldr.util.StandardCodes; 35 import org.unicode.cldr.util.StandardCodes.LstrField; 36 import org.unicode.cldr.util.StandardCodes.LstrType; 37 import org.unicode.cldr.util.Validity; 38 import org.unicode.cldr.util.Validity.Status; 39 40 import com.google.common.base.Joiner; 41 import com.google.common.base.Splitter; 42 import com.google.common.collect.ImmutableMap; 43 import com.google.common.collect.ImmutableMultimap; 44 import com.google.common.collect.ImmutableSet; 45 import com.google.common.collect.ImmutableSet.Builder; 46 import com.google.common.collect.LinkedHashMultimap; 47 import com.google.common.collect.Multimap; 48 import com.google.common.collect.Multimaps; 49 import com.google.common.collect.SortedSetMultimap; 50 import com.google.common.collect.TreeMultimap; 51 import com.ibm.icu.impl.Row.R2; 52 import com.ibm.icu.util.ICUUncheckedIOException; 53 54 /** 55 * <p>This code generates language group containment based on Wikidata. For example, it finds: 56 * root > Indo-European [Other] (ine) > Germanic [Other] (gem) > West Germanic languages (gmw) > English (en) 57 * </p><p> 58 * To do this, it reads three tables from Wikidata, and combines them. 59 * The combination is not trivial, because wikidata offers multiple "parents" for the same language, and many of the parents do not have ISO codes. 60 * For the first problem, the software computes the possible parent chains and picks among them. 61 * For the second problem, any parents without ISO codes are skipped (after forming the chains, so the ultimate ancestors are still found). 62 * <br>A number of debugging files are written to the external directory. 63 * </p><p> 64 * Some failures will be exposed by running this tool. Examples: 65 * <br><b>wikidata-entityToCode Multiple values:</b> Cebaara [Q1097512] [sef, sev]. 66 * <br>If these are not CLDR languages then they do not need to be fixed. 67 * <br><b>wikidata-childToParent Multiple values:</b> Q118712 [Q118712] [German [de, Q18], English [en, Q186]] 68 * <br>Normally these don't need to be fixed; the generation code works around them. 69 * <br><b>Cycle in [dng, zhx]</b> from [[http://www.wikidata.org/entity/Q33050, 70 * <br>These indicate that the Wikidata has a cycle in it. A => B => C => A. Ignore these unless the cases are worth investigating. 71 * </p><p> 72 * Others are exposed by running TestLanguageGroup.java 73 * <br> Error: (TestLanguageGroup.java:55) Single ancestor but not in ISOLATES: ce [Chechen] [ce] 74 * <br> Check to see if the language has a language group (in this case not, so add to TestLanguageGroup.ISOLATEs). 75 * <br> For kea [Kabuverdianu] [kea], you can add cpp as the parent, as follows. 76 * <br><b>Missing.</b> If a child-parent relation is missing, you can add it to EXTRA_PARENT_CHILDREN so that it shows up. For example, 77 * .put("gmw", "lb") says that West Germanic is the parent of Luxembourgish. 78 * <br><b>Extra.</b> Sometimes wikidata has conflicting or erroneous entries. Those can be fixed by adding to REMOVE_PARENT_CHILDREN. 79 * Use * to remove all children, such as .put("crp", "*") 80 * <br>Sometimes the tool fails with JsonParseExceptions, but works if you rerun. 81 * 82 * <br>Cycle in [dng, zhx] from ... Will be fixed by giving the language 'no parent' (mul) 83 * <p> 84 */ 85 public class GenerateLanguageContainment { 86 static { 87 System.out.println("See the class description for GenerateLanguageContainment.java about fixing problems."); 88 } 89 private static final boolean ONLY_LIVING = false; 90 private static final CLDRConfig CONFIG = CLDRConfig.getInstance(); 91 private static final QueryClient queryClient = QueryClient.getInstance(); 92 93 static final Splitter TAB = Splitter.on('\t').trimResults(); 94 static final CLDRFile ENGLISH = CONFIG.getEnglish(); 95 static final String relDir = "../util/data/languages/"; 96 static final Map<String, R2<List<String>, String>> ALIAS_MAP = CONFIG 97 .getSupplementalDataInfo() 98 .getLocaleAliasInfo() 99 .get("language"); 100 101 /** 102 * We load the SparQL queries using this helper object, to be able to catch exceptions… 103 */ 104 final static class QueryHelper { 105 final public Map<String, String> entityToLabel; 106 final public Map<String, String> entityToCode; 107 final public ImmutableMultimap<String, String> codeToEntity; 108 final public Multimap<String, String> childToParent; 109 QueryHelper()110 QueryHelper() { 111 try { 112 entityToLabel = loadQueryPairsUnique(GenerateLanguageContainment.class, "wikidata-entityToLabel", 113 null, null, null); 114 115 entityToCode = loadQueryPairsUnique(GenerateLanguageContainment.class, "wikidata-entityToCode", 116 code -> { 117 code = code.replace("\"", ""); 118 R2<List<String>, String> v = ALIAS_MAP.get(code); 119 String result = v == null 120 ? code : v.get0().get(0); 121 result = result.contains("_") 122 ? code 123 : result; 124 return result; 125 }, 126 code -> showNameAndCode(code), NAME); 127 128 codeToEntity = ImmutableMultimap.copyOf( 129 Multimaps.invertFrom(Multimaps.forMap(entityToCode), LinkedHashMultimap.create())); 130 131 childToParent = loadQueryPairs(GenerateLanguageContainment.class, "wikidata-childToParent", 132 code -> showNameAndCode(code), code -> showNameAndCode(code)); 133 134 } catch(Throwable t) { 135 t.printStackTrace(); 136 throw new RuntimeException(t); 137 } 138 } 139 getEntityName(String key)140 String getEntityName(String key) { 141 String code = getEntityCode(key); 142 if (code != null) { 143 try { 144 String name = NAME.apply(code); 145 if (name != null) { 146 return name; 147 } 148 } catch (Exception e) { 149 // TODO: Why would NAME.apply throw? 150 // TODO: Need better handling here? 151 } 152 } 153 String name = entityToLabel.get(key); 154 if (name != null) { 155 return name; 156 } 157 return afterLastSlash(key); 158 } 159 getEntityCode(String key)160 private String getEntityCode(String key) { 161 return entityToCode == null ? null : entityToCode.get(key); 162 } 163 afterLastSlash(String key)164 private String afterLastSlash(String key) { 165 return key.substring(key.lastIndexOf('/') + 1, key.length() - 1); 166 } 167 writeTsvs()168 public void writeTsvs() throws IOException { 169 TsvWriter.writeTsv("childToParent.tsv", childToParent, "child", "parent"); 170 TsvWriter.writeTsv("entityToCode.tsv", entityToCode, "lang", "langCode"); 171 TsvWriter.writeTsv("entityToLabel.tsv", entityToLabel, "lang", "langLabel"); 172 SortedSetMultimap<String,String> childToParentWithCodes = TreeMultimap.create(); 173 for (Entry<String, String> entry : childToParent.entries()) { 174 String child = entry.getKey(); 175 String parent = entry.getValue(); 176 childToParentWithCodes.put(showNameAndCode(child), showNameAndCode(parent)); 177 } 178 TsvWriter.writeTsv("childToParentWithCodes.tsv", childToParentWithCodes, "childCode\tLabel", "parentCode\tLabel"); 179 } 180 showNameAndCode(String qid)181 public String showNameAndCode(String qid) { 182 return getEntityName(qid) + " (" + (getEntityCode(qid) == null ? "" : getEntityCode(qid) + ", ") + afterLastSlash(qid) + ")"; 183 } 184 showNameAndCode(T qids)185 public <T extends Iterable<String>> String showNameAndCode(T qids) { 186 StringBuilder b = new StringBuilder(); 187 qids.forEach(qid -> { 188 if (b.length() != 0) b.append(", "); 189 b.append(showNameAndCode(qid));}); 190 return b.toString(); 191 } 192 showNameAndCode2(U qids)193 public <T extends Iterable<String>, U extends Iterable<T>> String showNameAndCode2(U qids) { 194 StringBuilder b = new StringBuilder(); 195 qids.forEach(qid -> { 196 if (b.length() != 0) b.append("; "); 197 b.append(showNameAndCode(qid));}); 198 return b.toString(); 199 } 200 201 } 202 static final QueryHelper QUERY_HELPER = new QueryHelper(); 203 204 static final Function<String, String> NAME = code -> code.equals("mul") ? "root" : ENGLISH.getName(code) + " (" + code + ")"; 205 206 static final Set<String> COLLECTIONS; 207 static { 208 Map<String, Map<LstrField, String>> languages = StandardCodes.getEnumLstreg().get(LstrType.language); 209 Builder<String> _collections = ImmutableSet.<String> builder(); 210 for (Entry<String, Map<LstrField, String>> e : languages.entrySet()) { 211 String scope = e.getValue().get(LstrField.Scope); 212 if (scope != null 213 && "Collection".equalsIgnoreCase(scope)) { e.getKey()214 _collections.add(e.getKey()); 215 } 216 } 217 COLLECTIONS = _collections.build(); 218 } 219 220 static class Tree { 221 Set<String> leaves = new LinkedHashSet<>(); 222 add(List<String> chain)223 void add(List<String> chain) { 224 Collections.reverse(chain); 225 } 226 } 227 228 /** 229 * To add parent-child relations to Wikidata 230 */ 231 static final Multimap<String, String> EXTRA_PARENT_CHILDREN = ImmutableMultimap.<String, String> builder() 232 .put("alv", "agq") 233 .put("alv", "cch") // Atlantic–Congo <= cch [Atsam] 234 .put("alv", "kcg") // Atlantic–Congo <= kcg [Tyap] 235 .put("alv", "ken") // Atlantic–Congo <= ken [Kenyang] 236 .put("alv", "ngb") 237 .put("alv", "yav") 238 .put("ber", "zgh") 239 .put("bnt", "asa") 240 .put("bnt", "bez") 241 .put("bnt", "cgg") 242 .put("bnt", "ebu") 243 .put("bnt", "jmc") 244 .put("bnt", "ksb") 245 .put("bnt", "lag") 246 .put("bnt", "mer") 247 .put("bnt", "mgh") 248 .put("bnt", "nmg") 249 .put("bnt", "rof") 250 .put("bnt", "rwk") 251 .put("bnt", "sbp") 252 .put("bnt", "seh") 253 .put("bnt", "vun") 254 .put("bnt", "xog") 255 .put("cpp", "kea") 256 .put("euq", "eu") 257 // gmw = West Germanic 258 .put("gmw", "ksh") 259 .put("gmw", "lb") 260 .put("gmw", "wae") 261 .put("grk", "el") 262 .put("grk", "gmy") 263 .put("grk", "grc") 264 .put("ira", "lrc") 265 .put("ira", "bgn") // Iranian <= Western Balochi 266 .put("inc", "trw") // Indo-Aryan <= Torwali 267 .put("jpx", "ja") 268 .put("mul", "art") 269 .put("mul", "euq") 270 .put("mul", "jpx") 271 .put("mul", "tai") 272 .put("ngb", "sg") 273 .put("roa", "cpf") 274 .put("roa", "cpp") 275 .put("roa", "cpp") 276 .put("sdv", "saq") 277 .put("son", "khq") 278 .put("sw", "swc") 279 .put("tai", "blt") // tai [Tai] <= blt [Tai Dam] 280 .put("tai", "lo") 281 .put("tai", "th") 282 .put("zlw", "szl") // West Slavic <= Silesian 283 .build(); 284 285 /** 286 * To remove parent-child relations from Wikidata, eg if a child has two parents (where that causes problems) 287 */ 288 static final Multimap<String, String> REMOVE_PARENT_CHILDREN = ImmutableMultimap.<String, String> builder() 289 .put("alv", "ukg") // ngf [Trans-New Guinea languages] <= ukg [Ukuriguma] 290 .put("crp", "*") // general Creole group interferes with French/Spanish/... language grouping 291 .put("cus", "mhd") // bnt [Bantu] <= mhd [Mbugu] (not cus [Cushitic]) 292 .put("gmw", "pih") // cpe [Creoles and pidgins, English based] <= pih [Pitcairn-Norfolk] 293 .put("inc", "rmg") 294 // Indo-European 295 .put("ine", "el") 296 .put("ine", "gmy") 297 .put("ine", "grc") 298 .put("ine", "trw") // inc [Indic] <= trw [Torwali] 299 .put("mul", "crp") 300 .put("mul", "cpp") // Creoles and pidgins, Portuguese-based 301 .put("mul", "und") // anomaly 302 .put("nic", "kcp") // ssa [Nilo-Saharan] <= kcp [Kanga] 303 .put("nic", "kec") // ssa [Nilo-Saharan] <= kec [Keiga] 304 .put("nic", "kgo") // ssa [Nilo-Saharan] <= kgo [Krongo] 305 .put("nic", "rof") // ssa [Nilo-Saharan] <= rof [Rombo] 306 .put("nic", "tbr") // ssa [Nilo-Saharan] <= tbr [Tumtum] 307 .put("nic", "tey") // ssa [Nilo-Saharan] <= tey [Tulishi] 308 .put("sit", "th") // sit <= tbq <= th 309 .put("sit", "dz") // sit <= tbq <= dz 310 .put("sit", "zh") 311 .put("sla", "cu") 312 .put("tbq", "psq") // paa [Papuan]; for psq [Pasi] - not tbq [Tibeto-Burman languages]; (There is also a variety of the Sino-Tibetan Adi language called Pasi. 313 .build(); 314 main(String[] args)315 public static void main(String[] args) throws IOException { 316 new GenerateLanguageContainment().run(args); 317 } 318 run(String[] args)319 void run(String[] args) throws IOException { 320 if (true) { 321 // check on items 322 for (String check : Arrays.asList("sw", "km", "ksh", "wae", "kea", "mfe", "th", "lo")) { 323 System.out.println("Checking " + ENGLISH.getName(check) + "[" + check + "]"); 324 Collection<String> entities = QUERY_HELPER.codeToEntity.get(check); 325 if (entities.isEmpty()) { 326 System.out.println("no code for " + check + ": " + entities); 327 continue; 328 } 329 for (String entity : entities) { 330 Set<List<String>> ancestors = getAllAncestors(entity); 331 showEntityLists(entity + " parents ", ancestors); 332 System.out.println(); 333 } 334 } 335 } 336 337 Map<Status, Set<String>> table = Validity.getInstance().getStatusToCodes(LstrType.language); 338 TreeMultimap<String, String> _parentToChild = TreeMultimap.create(); 339 TreeSet<String> missing = new TreeSet<>(table.get(Status.regular)); 340 _parentToChild.put("mul", "und"); 341 Set<String> skipping = new LinkedHashSet<>(); 342 for (String code : table.get(Status.regular)) { 343 if (ONLY_LIVING) { 344 Type type = Iso639Data.getType(code); 345 if (type != Type.Living) { 346 continue; 347 } 348 } 349 if (code.compareTo("hdz") > 0) { 350 int debug = 0; 351 } 352 // if (COLLECTIONS.contains(code)) { 353 // continue; 354 // } 355 Collection<String> entities = QUERY_HELPER.codeToEntity.get(code); 356 if (entities.isEmpty()) { 357 continue; 358 } 359 for (String entity : entities) { 360 if (QUERY_HELPER.childToParent.get(entity).isEmpty()) { 361 continue; 362 } 363 Set<Set<String>> chains = getAncestors(entity, skipping); 364 if (chains.size() > 1) { 365 int debug = 0; 366 } 367 for (Set<String> chain : chains) { 368 String last = null; 369 for (String link : chain) { 370 if (last != null) { 371 _parentToChild.put(link, last); 372 } 373 last = link; 374 } 375 } 376 } 377 } 378 System.out.println("Writing " + "skippingCodes.tsv"); 379 try(PrintWriter w = FileUtilities.openUTF8Writer(TsvWriter.getTsvDir(), "skippingCodes.tsv")) { 380 //TsvWriter.writeRow(w, "childCode\tLabel", "parentCode\tLabel"); // header 381 skipping.forEach(e -> w.println(e)); 382 } 383 384 385 for (Entry<String, Collection<String>> entity : REMOVE_PARENT_CHILDREN.asMap().entrySet()) { 386 String key = entity.getKey(); 387 for (String value : entity.getValue()) { 388 if (value.equals("*")) { 389 _parentToChild.removeAll(key); 390 } else { 391 _parentToChild.remove(key, value); 392 } 393 } 394 } 395 396 _parentToChild.putAll(EXTRA_PARENT_CHILDREN); 397 398 // special code for artificial 399 for (String code : Iso639Data.getAvailable()) { 400 Type type = Iso639Data.getType(code); 401 if (type == Type.Constructed) { 402 _parentToChild.put("art", code); 403 } 404 } 405 406 Multimap<String, String> parentToChild = ImmutableMultimap.copyOf(_parentToChild); 407 Multimap<String, String> childToParent = ImmutableMultimap.copyOf(Multimaps.invertFrom(parentToChild, TreeMultimap.create())); 408 System.out.println("Checking " + "he" + "\t" + Containment.getAllDirected(childToParent, "he")); 409 410 try(PrintWriter w = FileUtilities.openUTF8Writer(TsvWriter.getTsvDir(), "RawLanguageContainment.txt")) { 411 print(w, parentToChild, new ArrayList<>(Arrays.asList("mul"))); 412 } 413 SimpleXMLSource xmlSource = new SimpleXMLSource("languageGroup"); 414 xmlSource.setNonInheriting(true); // should be gotten from DtdType... 415 CLDRFile newFile = new CLDRFile(xmlSource); 416 newFile.setDtdType(DtdType.supplementalData); 417 newFile.add("//" + DtdType.supplementalData + "/version[@number='$Revision$']", ""); 418 printXML(newFile, parentToChild); 419 420 try (PrintWriter outFile = FileUtilities.openUTF8Writer(CLDRPaths.SUPPLEMENTAL_DIRECTORY, "languageGroup.xml")) { 421 newFile.write(outFile); 422 } catch (IOException e1) { 423 throw new ICUUncheckedIOException("Can't write to languageGroup.xml", e1); 424 } 425 426 // for (Entry<String,String> entry : childToParent.entries()) { 427 // String childNames = getName(entityToCode, entityToLabel, entry.getKey()); 428 // String parentNames = getName(entityToCode, entityToLabel, entry.getValue()); 429 // System.out.println(entry.getKey() + "\t" + entry.getValue() + "\t" + childNames + "\t" + parentNames); 430 // } 431 QUERY_HELPER.writeTsvs(); 432 } 433 showEntityLists(String title, Set<List<String>> ancestors)434 private static void showEntityLists(String title, Set<List<String>> ancestors) { 435 ancestors.forEach(new Consumer<List<String>>() { 436 @Override 437 public void accept(List<String> item) { 438 item.forEach(new Consumer<String>() { 439 @Override 440 public void accept(String t) { 441 System.out.println(t + "\t" + QUERY_HELPER.entityToCode.get(t) + "\t" + QUERY_HELPER.entityToLabel.get(t)); 442 } 443 }); 444 System.out.println(); 445 } 446 }); 447 } 448 printXML(CLDRFile newFile, Multimap<String, String> parentToChild)449 private static void printXML(CLDRFile newFile, Multimap<String, String> parentToChild) { 450 printXML(newFile, parentToChild, "mul"); 451 } 452 printXML(CLDRFile newFile, Multimap<String, String> parentToChild, String base)453 private static void printXML(CLDRFile newFile, Multimap<String, String> parentToChild, String base) { 454 Collection<String> children = parentToChild.get(base); 455 if (children.isEmpty()) { 456 return; 457 } 458 if (base.equals("und")) { 459 // skip, no good info 460 } else { 461 newFile.add("//" + DtdType.supplementalData + "/languageGroups/languageGroup[@parent=\"" + base + "\"]", 462 Joiner.on(" ").join(children)); 463 } 464 for (String child : children) { 465 printXML(newFile, parentToChild, child); 466 } 467 } 468 print(Writer out, Multimap<String, String> parentToChild, List<String> line)469 private static void print(Writer out, Multimap<String, String> parentToChild, List<String> line) { 470 String current = line.get(line.size() - 1); 471 Collection<String> children = parentToChild.get(current); 472 if (children.isEmpty()) { 473 try { 474 String sep = ""; 475 for (String item : line) { 476 out.append(sep).append(NAME.apply(item)); 477 sep = " > "; 478 } 479 out.append('\n'); 480 out.flush(); 481 } catch (IOException e) { 482 } 483 } else { 484 for (String child : children) { 485 line.add(child); 486 print(out, parentToChild, line); 487 line.remove(line.size() - 1); 488 } 489 } 490 } 491 getAncestors(String leaf, Set<String> skipping)492 private static Set<Set<String>> getAncestors(String leaf, Set<String> skipping) { 493 Set<List<String>> items = Containment.getAllDirected(QUERY_HELPER.childToParent, leaf); 494 Set<Set<String>> itemsFixed = new LinkedHashSet<>(); 495 main: for (List<String> item : items) { 496 Set<String> chain = new LinkedHashSet<>(); 497 for (String id : item) { 498 String code = QUERY_HELPER.entityToCode.get(id); 499 if (code == null) { 500 continue; 501 } 502 503 // skip leaf nodes after the first 504 505 if (!chain.isEmpty() && !COLLECTIONS.contains(code)) { 506 if (code.equals("zh")) { 507 code = "zhx"; // rewrite collections usage 508 } else { 509 skipping.add("Skipping inheritance from\t" + chain + "\t" + code + "\tfrom\t" + QUERY_HELPER.showNameAndCode2(items)); 510 continue; 511 } 512 } 513 514 // check for cycle, and skip if we have one 515 516 boolean changed = chain.add(code); 517 if (!changed) { 518 log("Cycle in\t" + chain + "\tfrom\t" + QUERY_HELPER.showNameAndCode2(items)); 519 continue main; 520 } 521 } 522 if (chain.size() > 1) { 523 chain.add("mul"); // root 524 itemsFixed.add(chain); 525 } 526 } 527 // remove subsets 528 // eg [[smp, he, mul], [smp, he, sem, afa, mul]] 529 // => [[smp, he, sem, afa, mul]] 530 if (itemsFixed.size() > 1) { 531 Set<Set<String>> removals = new HashSet<>(); 532 for (Set<String> chain1 : itemsFixed) { 533 for (Set<String> chain2 : itemsFixed) { 534 if (chain1.containsAll(chain2) && !chain2.containsAll(chain1)) { 535 removals.add(chain2); 536 } 537 } 538 } 539 itemsFixed.removeAll(removals); 540 } 541 return itemsFixed; 542 // TODO: delete this commented-out code? 543 // while (true) { 544 // String code = entityToCode.get(leaf); 545 // if (code != null) { 546 // chain.add(code); 547 // } 548 // Collection<String> parents = childToParent.get(leaf); 549 // if (parents.isEmpty()) { 550 // // clean up duplicates 551 // chain = new ArrayList<>(new LinkedHashSet<>(chain)); 552 // // wikipedia has non-collections as parents. Remove those if they are not first. 553 // break; 554 // } 555 // leaf = getBest(parents); 556 // } 557 // String last = chain.get(0); 558 // for (int i = 1; i < chain.size(); ++i) { 559 // String item = chain.get(i); 560 // if (!COLLECTIONS.contains(item)) { 561 // chain.set(i, item.equals("zh") ? "zhx" : ""); 562 // DROPPED_PARENTS_TO_CHILDREN.put(item, last); 563 // } else { 564 // last = item; 565 // } 566 // } 567 // chain.removeIf(x -> x.isEmpty()); 568 // if ("zh".equals(chain.get(0))) { 569 // chain.add(1,"zhx"); 570 // } 571 // last = chain.get(chain.size()-1); 572 // if (!"mul".equals(last)) { 573 // chain.add("mul"); // make sure we have root. 574 // } 575 // if (chain.size() == 2) { 576 // chain.add(1,"und"); 577 // } 578 // return chain; 579 } 580 log(String string)581 private static void log(String string) { 582 System.out.println(string); 583 // for (Entry<String, String> e : DROPPED_PARENTS_TO_CHILDREN.entries()) { 584 // System.out.println(NAME.apply(e.getKey()) + "\t" + NAME.apply(e.getValue()) 585 // ); 586 // } 587 } 588 589 // TODO: This function is only called by other commented-out code above. 590 // private static String getBest(Collection<String> parents) { 591 // for (String parent : parents) { 592 // String code = QUERY_HELPER.entityToCode.get(parent); 593 // if (code == null) continue; 594 // Type type = Iso639Data.getType(code); 595 // if (type != Type.Living) { 596 // continue; 597 // } 598 // return parent; 599 // } 600 // // failed 601 // return parents.iterator().next(); 602 // } 603 loadQueryPairs(Class<?> class1, String file, Function<String, String> keyMapper, Function<String, String> valueMapper)604 private static Multimap<String, String> loadQueryPairs(Class<?> class1, String file, 605 Function<String, String> keyMapper, Function<String, String> valueMapper) throws IOException { 606 System.out.println("QUERY: " + file); 607 ResultSet rs = queryClient.execSelectFromSparql(file, QueryClient.WIKIDATA_SPARQL_SERVER); 608 // the query must return exactly two variables. 609 List<String> resultVars = rs.getResultVars(); 610 assertTwoVars(resultVars); 611 final String keyName = resultVars.get(0); 612 final String valueName = resultVars.get(1); 613 614 ImmutableMultimap.Builder<String, String> _keyToValues = ImmutableMultimap.builder(); 615 for (;rs.hasNext();) { 616 final QuerySolution qs = rs.next(); 617 String key = QueryClient.getStringOrNull(qs, keyName); 618 String value = QueryClient.getStringOrNull(qs, valueName); 619 _keyToValues.put(key, value); 620 } 621 ImmutableMultimap<String, String> result = _keyToValues.build(); 622 showDups(file, result, keyMapper, valueMapper); 623 System.out.println("LOADED: " + file + " with rows " + rs.getRowNumber()); 624 return result; 625 } 626 627 /** 628 * Assuming that the SPARQL query returns exactly 2 results, treat them as Key=Value. 629 * @param class1 630 * @param file name of a sparql query, such as 'wikidata-childToParent' 631 * @param fixValue 632 * @param keyMapper 633 * @param valueMapper 634 * @return 635 * @throws IOException 636 */ loadQueryPairsUnique(Class<?> class1, String file, Function<String, String> fixValue, Function<String, String> keyMapper, Function<String, String> valueMapper)637 private static Map<String, String> loadQueryPairsUnique(Class<?> class1, String file, 638 Function<String, String> fixValue, 639 Function<String, String> keyMapper, Function<String, String> valueMapper) throws IOException { 640 641 System.out.println("QUERY: " + file); 642 ResultSet rs = queryClient.execSelectFromSparql(file, QueryClient.WIKIDATA_SPARQL_SERVER); 643 644 // the query must return exactly two variables. 645 List<String> resultVars = rs.getResultVars(); 646 assertTwoVars(resultVars); 647 final String keyName = resultVars.get(0); 648 final String valueName = resultVars.get(1); 649 650 Map<String, String> _keyToValue = new TreeMap<>(); 651 Multimap<String, String> _keyToValues = TreeMultimap.create(); 652 for (;rs.hasNext();) { 653 final QuerySolution qs = rs.next(); 654 String key = QueryClient.getStringOrNull(qs, keyName); 655 String value = QueryClient.getStringOrNull(qs, valueName); 656 if (fixValue != null) { 657 value = fixValue.apply(value); 658 } 659 _keyToValues.put(key, value); 660 String oldValue = _keyToValue.get(key); 661 if (oldValue == null || oldValue.equals("kxm")) { 662 _keyToValue.put(key, value); 663 } 664 } 665 _keyToValue = ImmutableMap.copyOf(_keyToValue); 666 showDups(file, _keyToValues, keyMapper, valueMapper); 667 System.out.println("LOADED: " + file + " with rows " + rs.getRowNumber()); 668 return _keyToValue; 669 } assertTwoVars(List<String> resultVars)670 private static void assertTwoVars(List<String> resultVars) { 671 if(resultVars.size() != 2) { 672 throw new IllegalArgumentException("expected 2 result vars but got " + resultVars.size() + ": " + resultVars); 673 } 674 } 675 showDups(String file, Multimap<String, String> _keyToValues, Function<String, String> keyMapper, Function<String, String> valueMapper)676 private static void showDups(String file, Multimap<String, String> _keyToValues, 677 Function<String, String> keyMapper, Function<String, String> valueMapper) { 678 for (Entry<String, Collection<String>> entry : _keyToValues.asMap().entrySet()) { 679 Collection<String> valueSet = entry.getValue(); 680 if (valueSet.size() > 1) { 681 String key = entry.getKey(); 682 key = keyMapper == null ? key : keyMapper.apply(key); 683 if (valueMapper != null) { 684 Set<String> result = new LinkedHashSet<>(); 685 valueSet.stream().map(valueMapper).forEach(x -> result.add(x)); 686 valueSet = result; 687 } 688 log(file + "\tMultiple values: " + key + "\t" + valueSet); 689 } 690 } 691 } 692 getAllAncestors(String lang)693 static Set<List<String>> getAllAncestors(String lang) { 694 return Containment.getAllDirected(QUERY_HELPER.childToParent, lang); 695 } 696 } 697