package org.unicode.cldr.tool;

import java.io.IOException;
import java.io.PrintWriter;
import java.io.Writer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.function.Consumer;
import java.util.function.Function;

import org.apache.jena.query.QuerySolution;
import org.apache.jena.query.ResultSet;
import org.unicode.cldr.draft.FileUtilities;
import org.unicode.cldr.rdf.QueryClient;
import org.unicode.cldr.rdf.TsvWriter;
import org.unicode.cldr.util.CLDRConfig;
import org.unicode.cldr.util.CLDRFile;
import org.unicode.cldr.util.CLDRPaths;
import org.unicode.cldr.util.Containment;
import org.unicode.cldr.util.DtdType;
import org.unicode.cldr.util.Iso639Data;
import org.unicode.cldr.util.Iso639Data.Type;
import org.unicode.cldr.util.SimpleXMLSource;
import org.unicode.cldr.util.StandardCodes;
import org.unicode.cldr.util.StandardCodes.LstrField;
import org.unicode.cldr.util.StandardCodes.LstrType;
import org.unicode.cldr.util.Validity;
import org.unicode.cldr.util.Validity.Status;

import com.google.common.base.Joiner;
import com.google.common.base.Splitter;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableMultimap;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.ImmutableSet.Builder;
import com.google.common.collect.LinkedHashMultimap;
import com.google.common.collect.Multimap;
import com.google.common.collect.Multimaps;
import com.google.common.collect.SortedSetMultimap;
import com.google.common.collect.TreeMultimap;
import com.ibm.icu.impl.Row.R2;
import com.ibm.icu.util.ICUUncheckedIOException;

/**
 * <p>This code generates language group containment based on Wikidata. For example, it finds:
 * root > Indo-European [Other] (ine) > Germanic [Other] (gem) > West Germanic languages (gmw) > English (en)
 * </p><p>
 * To do this, it reads three tables from Wikidata and combines them.
 * The combination is not trivial, because Wikidata offers multiple "parents" for the same language, and many of the parents do not have ISO codes.
 * For the first problem, the software computes the possible parent chains and picks among them.
 * For the second problem, any parents without ISO codes are skipped (after forming the chains, so the ultimate ancestors are still found).
 * <br>A number of debugging files are written to the external directory.
 * </p><p>
 * Some failures will be exposed by running this tool. Examples:
 * <br><b>wikidata-entityToCode    Multiple values:</b> Cebaara [Q1097512]    [sef, sev]
 * <br>If these are not CLDR languages then they do not need to be fixed.
 * <br><b>wikidata-childToParent    Multiple values:</b> Q118712 [Q118712]    [German [de, Q18], English [en, Q186]]
 * <br>Normally these don't need to be fixed; the generation code works around them.
 * <br><b>Cycle in    [dng, zhx]</b>    from    [[http://www.wikidata.org/entity/Q33050,
 * <br>These indicate that Wikidata has a cycle in it (A => B => C => A). Ignore these unless the cases are worth investigating.
 * </p><p>
 * Others are exposed by running TestLanguageGroup.java:
 * <br>Error: (TestLanguageGroup.java:55) Single ancestor but not in ISOLATES: ce [Chechen]    [ce]
 * <br>Check whether the language has a language group (in this case it does not, so add it to TestLanguageGroup.ISOLATES).
 * <br>For kea [Kabuverdianu]    [kea], you can add cpp as the parent, as follows.
 * <br><b>Missing.</b> If a child-parent relation is missing, you can add it to EXTRA_PARENT_CHILDREN so that it shows up. For example,
 * .put("gmw", "lb") says that West Germanic is the parent of Luxembourgish.
 * <br><b>Extra.</b> Sometimes Wikidata has conflicting or erroneous entries. Those can be fixed by adding to REMOVE_PARENT_CHILDREN.
 * Use * to remove all children, such as .put("crp", "*").
 * <br>Sometimes the tool fails with JsonParseExceptions, but works if you rerun it.
 * <br>A "Cycle in    [dng, zhx]    from ..." message will be fixed by giving the language 'no parent' (mul).
 * </p>
 */
public class GenerateLanguageContainment {
    static {
        System.out.println("See the class description of GenerateLanguageContainment for how to fix problems.");
    }
    private static final boolean ONLY_LIVING = false;
    private static final CLDRConfig CONFIG = CLDRConfig.getInstance();
    private static final QueryClient queryClient = QueryClient.getInstance();

    static final Splitter TAB = Splitter.on('\t').trimResults();
    static final CLDRFile ENGLISH = CONFIG.getEnglish();
    static final String relDir = "../util/data/languages/";
    static final Map<String, R2<List<String>, String>> ALIAS_MAP = CONFIG
            .getSupplementalDataInfo()
            .getLocaleAliasInfo()
            .get("language");

    /**
     * We load the SPARQL queries using this helper object, to be able to catch exceptions.
     */
    static final class QueryHelper {
        public final Map<String, String> entityToLabel;
        public final Map<String, String> entityToCode;
        public final ImmutableMultimap<String, String> codeToEntity;
        public final Multimap<String, String> childToParent;

        QueryHelper() {
            try {
                entityToLabel = loadQueryPairsUnique(GenerateLanguageContainment.class, "wikidata-entityToLabel",
                        null, null, null);

                entityToCode = loadQueryPairsUnique(GenerateLanguageContainment.class, "wikidata-entityToCode",
                        code -> {
                            code = code.replace("\"", "");
                            R2<List<String>, String> v = ALIAS_MAP.get(code);
                            String result = v == null ? code : v.get0().get(0);
                            result = result.contains("_") ? code : result;
                            return result;
                        },
                        code -> showNameAndCode(code), NAME);

                codeToEntity = ImmutableMultimap.copyOf(
                        Multimaps.invertFrom(Multimaps.forMap(entityToCode), LinkedHashMultimap.create()));

                childToParent = loadQueryPairs(GenerateLanguageContainment.class, "wikidata-childToParent",
                        code -> showNameAndCode(code), code -> showNameAndCode(code));

            } catch (Throwable t) {
                t.printStackTrace();
                throw new RuntimeException(t);
            }
        }

        String getEntityName(String key) {
            String code = getEntityCode(key);
            if (code != null) {
                try {
                    String name = NAME.apply(code);
                    if (name != null) {
                        return name;
                    }
                } catch (Exception e) {
                    // TODO: Why would NAME.apply throw?
                    // TODO: Need better handling here?
                }
            }
            String name = entityToLabel.get(key);
            if (name != null) {
                return name;
            }
            return afterLastSlash(key);
        }

        private String getEntityCode(String key) {
            return entityToCode == null ? null : entityToCode.get(key);
        }

        private String afterLastSlash(String key) {
            return key.substring(key.lastIndexOf('/') + 1, key.length() - 1);
        }

        public void writeTsvs() throws IOException {
            TsvWriter.writeTsv("childToParent.tsv", childToParent, "child", "parent");
            TsvWriter.writeTsv("entityToCode.tsv", entityToCode, "lang", "langCode");
            TsvWriter.writeTsv("entityToLabel.tsv", entityToLabel, "lang", "langLabel");
            SortedSetMultimap<String, String> childToParentWithCodes = TreeMultimap.create();
            for (Entry<String, String> entry : childToParent.entries()) {
                String child = entry.getKey();
                String parent = entry.getValue();
                childToParentWithCodes.put(showNameAndCode(child), showNameAndCode(parent));
            }
            TsvWriter.writeTsv("childToParentWithCodes.tsv", childToParentWithCodes, "childCode\tLabel", "parentCode\tLabel");
        }

        public String showNameAndCode(String qid) {
            return getEntityName(qid) + " (" + (getEntityCode(qid) == null ? "" : getEntityCode(qid) + ", ") + afterLastSlash(qid) + ")";
        }

        public <T extends Iterable<String>> String showNameAndCode(T qids) {
            StringBuilder b = new StringBuilder();
            qids.forEach(qid -> {
                if (b.length() != 0) b.append(", ");
                b.append(showNameAndCode(qid));
            });
            return b.toString();
        }

        public <T extends Iterable<String>, U extends Iterable<T>> String showNameAndCode2(U qids) {
            StringBuilder b = new StringBuilder();
            qids.forEach(qid -> {
                if (b.length() != 0) b.append("; ");
                b.append(showNameAndCode(qid));
            });
            return b.toString();
        }
    }

    static final QueryHelper QUERY_HELPER = new QueryHelper();

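    // Formats a language code for display: "mul" is shown as "root"; everything else as the
    // English name from CLDR followed by the code, e.g. "English (en)".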
    static final Function<String, String> NAME = code -> code.equals("mul") ? "root" : ENGLISH.getName(code) + " (" + code + ")";

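    // The ISO 639 codes whose LSTR scope is "Collection" (language groups/families). In
    // getAncestors, only these codes are kept as intermediate links of a chain (apart from the
    // zh -> zhx rewrite).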
    static final Set<String> COLLECTIONS;
    static {
        Map<String, Map<LstrField, String>> languages = StandardCodes.getEnumLstreg().get(LstrType.language);
        Builder<String> _collections = ImmutableSet.<String> builder();
        for (Entry<String, Map<LstrField, String>> e : languages.entrySet()) {
            String scope = e.getValue().get(LstrField.Scope);
            if (scope != null && "Collection".equalsIgnoreCase(scope)) {
                _collections.add(e.getKey());
            }
        }
        COLLECTIONS = _collections.build();
    }

    static class Tree {
        Set<String> leaves = new LinkedHashSet<>();

        void add(List<String> chain) {
            Collections.reverse(chain);
        }
    }

    /**
     * Parent-child relations to add to the Wikidata data.
     */
    static final Multimap<String, String> EXTRA_PARENT_CHILDREN = ImmutableMultimap.<String, String> builder()
            .put("alv", "agq")
            .put("alv", "cch") // Atlantic–Congo <= cch [Atsam]
            .put("alv", "kcg") // Atlantic–Congo <= kcg [Tyap]
            .put("alv", "ken") // Atlantic–Congo <= ken [Kenyang]
            .put("alv", "ngb")
            .put("alv", "yav")
            .put("ber", "zgh")
            .put("bnt", "asa")
            .put("bnt", "bez")
            .put("bnt", "cgg")
            .put("bnt", "ebu")
            .put("bnt", "jmc")
            .put("bnt", "ksb")
            .put("bnt", "lag")
            .put("bnt", "mer")
            .put("bnt", "mgh")
            .put("bnt", "nmg")
            .put("bnt", "rof")
            .put("bnt", "rwk")
            .put("bnt", "sbp")
            .put("bnt", "seh")
            .put("bnt", "vun")
            .put("bnt", "xog")
            .put("cpp", "kea")
            .put("euq", "eu")
            // gmw = West Germanic
            .put("gmw", "ksh")
            .put("gmw", "lb")
            .put("gmw", "wae")
            .put("grk", "el")
            .put("grk", "gmy")
            .put("grk", "grc")
            .put("ira", "lrc")
            .put("ira", "bgn") // Iranian <= Western Balochi
            .put("inc", "trw") // Indo-Aryan <= Torwali
            .put("jpx", "ja")
            .put("mul", "art")
            .put("mul", "euq")
            .put("mul", "jpx")
            .put("mul", "tai")
            .put("ngb", "sg")
            .put("roa", "cpf")
            .put("roa", "cpp")
            .put("roa", "cpp")
            .put("sdv", "saq")
            .put("son", "khq")
            .put("sw", "swc")
            .put("tai", "blt") // tai [Tai] <= blt [Tai Dam]
            .put("tai", "lo")
            .put("tai", "th")
            .put("zlw", "szl") // West Slavic <= Silesian
            .build();

    /**
     * Parent-child relations to remove from the Wikidata data, e.g. if a child has two parents (where that causes problems).
     */
    static final Multimap<String, String> REMOVE_PARENT_CHILDREN = ImmutableMultimap.<String, String> builder()
            .put("alv", "ukg") // ngf [Trans-New Guinea languages] <= ukg [Ukuriguma]
            .put("crp", "*") // general Creole group interferes with French/Spanish/... language grouping
            .put("cus", "mhd") // bnt [Bantu] <= mhd [Mbugu] (not cus [Cushitic])
            .put("gmw", "pih") // cpe [Creoles and pidgins, English based] <= pih [Pitcairn-Norfolk]
            .put("inc", "rmg")
            // Indo-European
            .put("ine", "el")
            .put("ine", "gmy")
            .put("ine", "grc")
            .put("ine", "trw") // inc [Indic] <= trw [Torwali]
            .put("mul", "crp")
            .put("mul", "cpp") // Creoles and pidgins, Portuguese-based
            .put("mul", "und") // anomaly
            .put("nic", "kcp") // ssa [Nilo-Saharan] <= kcp [Kanga]
            .put("nic", "kec") // ssa [Nilo-Saharan] <= kec [Keiga]
            .put("nic", "kgo") // ssa [Nilo-Saharan] <= kgo [Krongo]
            .put("nic", "rof") // ssa [Nilo-Saharan] <= rof [Rombo]
            .put("nic", "tbr") // ssa [Nilo-Saharan] <= tbr [Tumtum]
            .put("nic", "tey") // ssa [Nilo-Saharan] <= tey [Tulishi]
            .put("sit", "th") // sit <= tbq <= th
            .put("sit", "dz") // sit <= tbq <= dz
            .put("sit", "zh")
            .put("sla", "cu")
            .put("tbq", "psq") // paa [Papuan] for psq [Pasi], not tbq [Tibeto-Burman languages]. (There is also a variety of the Sino-Tibetan Adi language called Pasi.)
            .build();

    public static void main(String[] args) throws IOException {
        new GenerateLanguageContainment().run(args);
    }

    void run(String[] args) throws IOException {
        if (true) {
            // check on items
            for (String check : Arrays.asList("sw", "km", "ksh", "wae", "kea", "mfe", "th", "lo")) {
                System.out.println("Checking " + ENGLISH.getName(check) + "[" + check + "]");
                Collection<String> entities = QUERY_HELPER.codeToEntity.get(check);
                if (entities.isEmpty()) {
                    System.out.println("no code for " + check + ": " + entities);
                    continue;
                }
                for (String entity : entities) {
                    Set<List<String>> ancestors = getAllAncestors(entity);
                    showEntityLists(entity + " parents ", ancestors);
                    System.out.println();
                }
            }
        }

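        // Build the parent-to-child graph: for each regular language code, look up its Wikidata
        // entities and record every parent-child link of every ancestor chain found for them.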
        Map<Status, Set<String>> table = Validity.getInstance().getStatusToCodes(LstrType.language);
        TreeMultimap<String, String> _parentToChild = TreeMultimap.create();
        TreeSet<String> missing = new TreeSet<>(table.get(Status.regular));
        _parentToChild.put("mul", "und");
        Set<String> skipping = new LinkedHashSet<>();
        for (String code : table.get(Status.regular)) {
            if (ONLY_LIVING) {
                Type type = Iso639Data.getType(code);
                if (type != Type.Living) {
                    continue;
                }
            }
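            // Debugging hook: the otherwise-unused assignment below gives a place to set a
            // breakpoint once the loop has moved past "hdz".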
            if (code.compareTo("hdz") > 0) {
                int debug = 0;
            }
            //            if (COLLECTIONS.contains(code)) {
            //                continue;
            //            }
            Collection<String> entities = QUERY_HELPER.codeToEntity.get(code);
            if (entities.isEmpty()) {
                continue;
            }
            for (String entity : entities) {
                if (QUERY_HELPER.childToParent.get(entity).isEmpty()) {
                    continue;
                }
                Set<Set<String>> chains = getAncestors(entity, skipping);
                if (chains.size() > 1) {
                    int debug = 0;
                }
                for (Set<String> chain : chains) {
                    String last = null;
                    for (String link : chain) {
                        if (last != null) {
                            _parentToChild.put(link, last);
                        }
                        last = link;
                    }
                }
            }
        }
        System.out.println("Writing " + "skippingCodes.tsv");
        try (PrintWriter w = FileUtilities.openUTF8Writer(TsvWriter.getTsvDir(), "skippingCodes.tsv")) {
            //TsvWriter.writeRow(w, "childCode\tLabel", "parentCode\tLabel"); // header
            skipping.forEach(e -> w.println(e));
        }

        for (Entry<String, Collection<String>> entity : REMOVE_PARENT_CHILDREN.asMap().entrySet()) {
            String key = entity.getKey();
            for (String value : entity.getValue()) {
                if (value.equals("*")) {
                    _parentToChild.removeAll(key);
                } else {
                    _parentToChild.remove(key, value);
                }
            }
        }

        _parentToChild.putAll(EXTRA_PARENT_CHILDREN);

        // special case: constructed (artificial) languages all go under art
        for (String code : Iso639Data.getAvailable()) {
            Type type = Iso639Data.getType(code);
            if (type == Type.Constructed) {
                _parentToChild.put("art", code);
            }
        }

        Multimap<String, String> parentToChild = ImmutableMultimap.copyOf(_parentToChild);
        Multimap<String, String> childToParent = ImmutableMultimap.copyOf(Multimaps.invertFrom(parentToChild, TreeMultimap.create()));
        System.out.println("Checking " + "he" + "\t" + Containment.getAllDirected(childToParent, "he"));

        try (PrintWriter w = FileUtilities.openUTF8Writer(TsvWriter.getTsvDir(), "RawLanguageContainment.txt")) {
            print(w, parentToChild, new ArrayList<>(Arrays.asList("mul")));
        }
        SimpleXMLSource xmlSource = new SimpleXMLSource("languageGroup");
        xmlSource.setNonInheriting(true); // should be gotten from DtdType...
        CLDRFile newFile = new CLDRFile(xmlSource);
        newFile.setDtdType(DtdType.supplementalData);
        newFile.add("//" + DtdType.supplementalData + "/version[@number='$Revision$']", "");
        printXML(newFile, parentToChild);

        try (PrintWriter outFile = FileUtilities.openUTF8Writer(CLDRPaths.SUPPLEMENTAL_DIRECTORY, "languageGroup.xml")) {
            newFile.write(outFile);
        } catch (IOException e1) {
            throw new ICUUncheckedIOException("Can't write to languageGroup.xml", e1);
        }

        //        for (Entry<String,String> entry : childToParent.entries()) {
        //            String childNames = getName(entityToCode, entityToLabel, entry.getKey());
        //            String parentNames = getName(entityToCode, entityToLabel, entry.getValue());
        //            System.out.println(entry.getKey() + "\t" + entry.getValue() + "\t" + childNames + "\t" + parentNames);
        //        }
        QUERY_HELPER.writeTsvs();
    }

    private static void showEntityLists(String title, Set<List<String>> ancestors) {
        ancestors.forEach(new Consumer<List<String>>() {
            @Override
            public void accept(List<String> item) {
                item.forEach(new Consumer<String>() {
                    @Override
                    public void accept(String t) {
                        System.out.println(t + "\t" + QUERY_HELPER.entityToCode.get(t) + "\t" + QUERY_HELPER.entityToLabel.get(t));
                    }
                });
                System.out.println();
            }
        });
    }

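    // Writes the languageGroups data: starting from "mul", adds one languageGroup element per
    // parent listing its children (skipping "und"), then recurses into each child.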
    private static void printXML(CLDRFile newFile, Multimap<String, String> parentToChild) {
        printXML(newFile, parentToChild, "mul");
    }

    private static void printXML(CLDRFile newFile, Multimap<String, String> parentToChild, String base) {
        Collection<String> children = parentToChild.get(base);
        if (children.isEmpty()) {
            return;
        }
        if (base.equals("und")) {
            // skip, no good info
        } else {
            newFile.add("//" + DtdType.supplementalData + "/languageGroups/languageGroup[@parent=\"" + base + "\"]",
                    Joiner.on(" ").join(children));
        }
        for (String child : children) {
            printXML(newFile, parentToChild, child);
        }
    }

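    // Writes the RawLanguageContainment.txt debug lines: one line per leaf, showing the full
    // chain from the root, e.g. "root > Indo-European [Other] (ine) > ... > English (en)".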
    private static void print(Writer out, Multimap<String, String> parentToChild, List<String> line) {
        String current = line.get(line.size() - 1);
        Collection<String> children = parentToChild.get(current);
        if (children.isEmpty()) {
            try {
                String sep = "";
                for (String item : line) {
                    out.append(sep).append(NAME.apply(item));
                    sep = " > ";
                }
                out.append('\n');
                out.flush();
            } catch (IOException e) {
            }
        } else {
            for (String child : children) {
                line.add(child);
                print(out, parentToChild, line);
                line.remove(line.size() - 1);
            }
        }
    }

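    /**
     * Returns the ancestor chains for a Wikidata entity, as sets of language codes ordered from
     * the leaf up to the root ("mul"). Entities without ISO codes are dropped; non-collection
     * codes other than the leaf are skipped (except that "zh" is rewritten to "zhx"); chains
     * containing a cycle are discarded; and chains that are subsets of longer chains are removed.
     */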
    private static Set<Set<String>> getAncestors(String leaf, Set<String> skipping) {
        Set<List<String>> items = Containment.getAllDirected(QUERY_HELPER.childToParent, leaf);
        Set<Set<String>> itemsFixed = new LinkedHashSet<>();
        main: for (List<String> item : items) {
            Set<String> chain = new LinkedHashSet<>();
            for (String id : item) {
                String code = QUERY_HELPER.entityToCode.get(id);
                if (code == null) {
                    continue;
                }

                // skip leaf nodes after the first

                if (!chain.isEmpty() && !COLLECTIONS.contains(code)) {
                    if (code.equals("zh")) {
                        code = "zhx"; // rewrite collections usage
                    } else {
                        skipping.add("Skipping inheritance from\t" + chain + "\t" + code + "\tfrom\t" + QUERY_HELPER.showNameAndCode2(items));
                        continue;
                    }
                }

                // check for cycle, and skip if we have one

                boolean changed = chain.add(code);
                if (!changed) {
                    log("Cycle in\t" + chain + "\tfrom\t" + QUERY_HELPER.showNameAndCode2(items));
                    continue main;
                }
            }
            if (chain.size() > 1) {
                chain.add("mul"); // root
                itemsFixed.add(chain);
            }
        }
        // remove subsets
        // eg [[smp, he, mul], [smp, he, sem, afa, mul]]
        // => [[smp, he, sem, afa, mul]]
        if (itemsFixed.size() > 1) {
            Set<Set<String>> removals = new HashSet<>();
            for (Set<String> chain1 : itemsFixed) {
                for (Set<String> chain2 : itemsFixed) {
                    if (chain1.containsAll(chain2) && !chain2.containsAll(chain1)) {
                        removals.add(chain2);
                    }
                }
            }
            itemsFixed.removeAll(removals);
        }
        return itemsFixed;
        // TODO: delete this commented-out code?
        //        while (true) {
        //            String code = entityToCode.get(leaf);
        //            if (code != null) {
        //                chain.add(code);
        //            }
        //            Collection<String> parents = childToParent.get(leaf);
        //            if (parents.isEmpty()) {
        //                // clean up duplicates
        //                chain = new ArrayList<>(new LinkedHashSet<>(chain));
        //                // wikipedia has non-collections as parents. Remove those if they are not first.
        //                break;
        //            }
        //            leaf = getBest(parents);
        //        }
        //        String last = chain.get(0);
        //        for (int i = 1; i < chain.size(); ++i) {
        //            String item = chain.get(i);
        //            if (!COLLECTIONS.contains(item)) {
        //                chain.set(i, item.equals("zh") ? "zhx" : "");
        //                DROPPED_PARENTS_TO_CHILDREN.put(item, last);
        //            } else {
        //                last = item;
        //            }
        //        }
        //        chain.removeIf(x -> x.isEmpty());
        //        if ("zh".equals(chain.get(0))) {
        //            chain.add(1,"zhx");
        //        }
        //        last = chain.get(chain.size()-1);
        //        if (!"mul".equals(last)) {
        //            chain.add("mul"); // make sure we have root.
        //        }
        //        if (chain.size() == 2) {
        //            chain.add(1,"und");
        //        }
        //        return chain;
    }

    private static void log(String string) {
        System.out.println(string);
        //        for (Entry<String, String> e : DROPPED_PARENTS_TO_CHILDREN.entries()) {
        //            System.out.println(NAME.apply(e.getKey()) + "\t" + NAME.apply(e.getValue())
        //                );
        //        }
    }

    // TODO: This function is only called by other commented-out code above.
    //    private static String getBest(Collection<String> parents) {
    //        for (String parent : parents) {
    //            String code = QUERY_HELPER.entityToCode.get(parent);
    //            if (code == null) continue;
    //            Type type = Iso639Data.getType(code);
    //            if (type != Type.Living) {
    //                continue;
    //            }
    //            return parent;
    //        }
    //        // failed
    //        return parents.iterator().next();
    //    }

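    /**
     * Runs the named SPARQL query (which must return exactly two variables) and returns the rows
     * as a key-to-values multimap, logging any key that has more than one value.
     */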
    private static Multimap<String, String> loadQueryPairs(Class<?> class1, String file,
            Function<String, String> keyMapper, Function<String, String> valueMapper) throws IOException {
        System.out.println("QUERY: " + file);
        ResultSet rs = queryClient.execSelectFromSparql(file, QueryClient.WIKIDATA_SPARQL_SERVER);
        // the query must return exactly two variables.
        List<String> resultVars = rs.getResultVars();
        assertTwoVars(resultVars);
        final String keyName = resultVars.get(0);
        final String valueName = resultVars.get(1);

        ImmutableMultimap.Builder<String, String> _keyToValues = ImmutableMultimap.builder();
        while (rs.hasNext()) {
            final QuerySolution qs = rs.next();
            String key = QueryClient.getStringOrNull(qs, keyName);
            String value = QueryClient.getStringOrNull(qs, valueName);
            _keyToValues.put(key, value);
        }
        ImmutableMultimap<String, String> result = _keyToValues.build();
        showDups(file, result, keyMapper, valueMapper);
        System.out.println("LOADED: " + file + " with rows " + rs.getRowNumber());
        return result;
    }

    /**
     * Runs a SPARQL query that must return exactly two variables, and treats each row as a key=value pair,
     * keeping a single value per key.
     * @param class1 currently unused
     * @param file name of a SPARQL query, such as 'wikidata-childToParent'
     * @param fixValue optional transform applied to each value before it is recorded
     * @param keyMapper used by showDups to make duplicate keys readable
     * @param valueMapper used by showDups to make duplicate values readable
     * @return map from key to the chosen value
     * @throws IOException
     */
    private static Map<String, String> loadQueryPairsUnique(Class<?> class1, String file,
            Function<String, String> fixValue,
            Function<String, String> keyMapper, Function<String, String> valueMapper) throws IOException {

        System.out.println("QUERY: " + file);
        ResultSet rs = queryClient.execSelectFromSparql(file, QueryClient.WIKIDATA_SPARQL_SERVER);

        // the query must return exactly two variables.
        List<String> resultVars = rs.getResultVars();
        assertTwoVars(resultVars);
        final String keyName = resultVars.get(0);
        final String valueName = resultVars.get(1);

        Map<String, String> _keyToValue = new TreeMap<>();
        Multimap<String, String> _keyToValues = TreeMultimap.create();
        while (rs.hasNext()) {
            final QuerySolution qs = rs.next();
            String key = QueryClient.getStringOrNull(qs, keyName);
            String value = QueryClient.getStringOrNull(qs, valueName);
            if (fixValue != null) {
                value = fixValue.apply(value);
            }
            _keyToValues.put(key, value);
            String oldValue = _keyToValue.get(key);
            if (oldValue == null || oldValue.equals("kxm")) {
                _keyToValue.put(key, value);
            }
        }
        _keyToValue = ImmutableMap.copyOf(_keyToValue);
        showDups(file, _keyToValues, keyMapper, valueMapper);
        System.out.println("LOADED: " + file + " with rows " + rs.getRowNumber());
        return _keyToValue;
    }

    private static void assertTwoVars(List<String> resultVars) {
        if (resultVars.size() != 2) {
            throw new IllegalArgumentException("expected 2 result vars but got " + resultVars.size() + ": " + resultVars);
        }
    }

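    // Logs every key that maps to more than one value (the "Multiple values:" messages described
    // in the class documentation), using the optional mappers to make keys and values readable.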
    private static void showDups(String file, Multimap<String, String> _keyToValues,
            Function<String, String> keyMapper, Function<String, String> valueMapper) {
        for (Entry<String, Collection<String>> entry : _keyToValues.asMap().entrySet()) {
            Collection<String> valueSet = entry.getValue();
            if (valueSet.size() > 1) {
                String key = entry.getKey();
                key = keyMapper == null ? key : keyMapper.apply(key);
                if (valueMapper != null) {
                    Set<String> result = new LinkedHashSet<>();
                    valueSet.stream().map(valueMapper).forEach(x -> result.add(x));
                    valueSet = result;
                }
                log(file + "\tMultiple values: " + key + "\t" + valueSet);
            }
        }
    }

    static Set<List<String>> getAllAncestors(String lang) {
        return Containment.getAllDirected(QUERY_HELPER.childToParent, lang);
    }
}