• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 package org.unicode.cldr.tool;
2 
3 import com.google.common.base.Joiner;
4 import com.google.common.base.Splitter;
5 import com.google.common.collect.ImmutableMap;
6 import com.google.common.collect.ImmutableMultimap;
7 import com.google.common.collect.ImmutableSet;
8 import com.google.common.collect.ImmutableSet.Builder;
9 import com.google.common.collect.LinkedHashMultimap;
10 import com.google.common.collect.Multimap;
11 import com.google.common.collect.Multimaps;
12 import com.google.common.collect.Sets;
13 import com.google.common.collect.Sets.SetView;
14 import com.google.common.collect.SortedSetMultimap;
15 import com.google.common.collect.TreeMultimap;
16 import com.ibm.icu.impl.Row.R2;
17 import com.ibm.icu.util.ICUUncheckedIOException;
18 import java.io.IOException;
19 import java.io.PrintWriter;
20 import java.io.Writer;
21 import java.util.ArrayList;
22 import java.util.Arrays;
23 import java.util.Collection;
24 import java.util.Collections;
25 import java.util.HashSet;
26 import java.util.LinkedHashSet;
27 import java.util.List;
28 import java.util.Map;
29 import java.util.Map.Entry;
30 import java.util.Set;
31 import java.util.TreeMap;
32 import java.util.TreeSet;
33 import java.util.function.Consumer;
34 import java.util.function.Function;
35 import org.apache.jena.query.QuerySolution;
36 import org.apache.jena.query.ResultSet;
37 import org.unicode.cldr.draft.FileUtilities;
38 import org.unicode.cldr.rdf.QueryClient;
39 import org.unicode.cldr.rdf.TsvWriter;
40 import org.unicode.cldr.util.CLDRConfig;
41 import org.unicode.cldr.util.CLDRFile;
42 import org.unicode.cldr.util.CLDRPaths;
43 import org.unicode.cldr.util.Containment;
44 import org.unicode.cldr.util.DiffLanguageGroups;
45 import org.unicode.cldr.util.DtdType;
46 import org.unicode.cldr.util.Iso639Data;
47 import org.unicode.cldr.util.Iso639Data.Type;
48 import org.unicode.cldr.util.LocaleNames;
49 import org.unicode.cldr.util.SimpleXMLSource;
50 import org.unicode.cldr.util.StandardCodes;
51 import org.unicode.cldr.util.StandardCodes.LstrField;
52 import org.unicode.cldr.util.StandardCodes.LstrType;
53 import org.unicode.cldr.util.Validity;
54 import org.unicode.cldr.util.Validity.Status;
55 
56 /**
57  * This code generates language group containment based on Wikidata. For example, it finds: root >
58  * Indo-European [Other] (ine) > Germanic [Other] (gem) > West Germanic languages (gmw) > English
59  * (en)
60  *
61  * <p>To do this, it reads three tables from Wikidata, and combines them. The combination is not
62  * trivial, because wikidata offers multiple "parents" for the same language, and many of the
63  * parents do not have ISO codes. For the first problem, the software computes the possible parent
64  * chains and picks among them. For the second problem, any parents without ISO codes are skipped
65  * (after forming the chains, so the ultimate ancestors are still found). <br>
66  * A number of debugging files are written to the external directory.
67  *
68  * <p>Some failures will be exposed by running this tool. Examples: <br>
69  * <b>wikidata-entityToCode Multiple values:</b> Cebaara [Q1097512] [sef, sev]. <br>
70  * If these are not CLDR languages then they do not need to be fixed. <br>
71  * <b>wikidata-childToParent Multiple values:</b> Q118712 [Q118712] [German [de, Q18], English [en,
72  * Q186]] <br>
73  * Normally these don't need to be fixed; the generation code works around them. <br>
74  * <b>Cycle in [dng, zhx]</b> from [[http://www.wikidata.org/entity/Q33050, <br>
75  * These indicate that the Wikidata has a cycle in it. A => B => C => A. Ignore these unless the
76  * cases are worth investigating.
77  *
78  * <p>Others are exposed by running TestLanguageGroup.java <br>
79  * Error: (TestLanguageGroup.java:55) Single ancestor but not in ISOLATES: ce [Chechen] [ce] <br>
80  * Check to see if the language has a language group (in this case not, so add to
81  * TestLanguageGroup.ISOLATES). <br>
82  * For kea [Kabuverdianu] [kea], you can add cpp as the parent, as follows. <br>
83  * <b>Missing.</b> If a child-parent relation is missing, you can add it to RESET_PARENT_CHILDREN so
84  * that it shows up. For example, .put("gmw", "lb") says that West Germanic is the parent of
85  * Luxembourgish. <br>
86  * <b>Extra.</b> Sometimes wikidata has conflicting or erroneous entries. Those can be fixed by
87  * adding to REMOVE_PARENT_CHILDREN. Use * to remove all children, such as .put("crp", "*") <br>
88  * Sometimes the tool fails with JsonParseExceptions, but works if you rerun. <br>
89  * Cycle in [dng, zhx] from ... Will be fixed by giving the language 'no parent' (mul)
90  *
91  * <p>
92  */
93 public class GenerateLanguageContainment {
    // Printed once, when this class is initialized, to point the user at the class description.
    static {
        System.out.println(
                "See the class description for GenerateLanguageContainment.java about fixing problems.");
    }

    /** When true, run() skips every code whose Iso639Data type is not Living. */
    private static final boolean ONLY_LIVING = false;
    private static final CLDRConfig CONFIG = CLDRConfig.getInstance();
    private static final QueryClient queryClient = QueryClient.getInstance();

    /** Splits on tab characters and trims each resulting piece. */
    static final Splitter TAB = Splitter.on('\t').trimResults();
    /** Joins values with tab characters; used for the OVERRIDE report lines printed by run(). */
    private static final Joiner JOIN_TAB = Joiner.on('\t');
    /** The English CLDR file; used to look up display names for language codes. */
    static final CLDRFile ENGLISH = CONFIG.getEnglish();
    static final String relDir = "../util/data/languages/";
    /**
     * Language alias information from the supplemental data. QueryHelper uses it to replace a
     * code returned by Wikidata with the first replacement listed for that code.
     */
    static final Map<String, R2<List<String>, String>> ALIAS_MAP =
            CONFIG.getSupplementalDataInfo().getLocaleAliasInfo().get("language");
109 
110     /** We load the SparQL queries using this helper object, to be able to catch exceptions… */
111     static final class QueryHelper {
112         public final Map<String, String> entityToLabel;
113         public final Map<String, String> entityToCode;
114         public final ImmutableMultimap<String, String> codeToEntity;
115         public final Multimap<String, String> childToParent;
116 
QueryHelper()117         QueryHelper() {
118             try {
119                 entityToLabel =
120                         loadQueryPairsUnique(
121                                 GenerateLanguageContainment.class,
122                                 "wikidata-entityToLabel",
123                                 null,
124                                 null,
125                                 null);
126 
127                 entityToCode =
128                         loadQueryPairsUnique(
129                                 GenerateLanguageContainment.class,
130                                 "wikidata-entityToCode",
131                                 code -> {
132                                     code = code.replace("\"", "");
133                                     R2<List<String>, String> v = ALIAS_MAP.get(code);
134                                     String result = v == null ? code : v.get0().get(0);
135                                     result = result.contains("_") ? code : result;
136                                     return result;
137                                 },
138                                 code -> showNameAndCode(code),
139                                 NAME);
140 
141                 codeToEntity =
142                         ImmutableMultimap.copyOf(
143                                 Multimaps.invertFrom(
144                                         Multimaps.forMap(entityToCode),
145                                         LinkedHashMultimap.create()));
146 
147                 childToParent =
148                         loadQueryPairs(
149                                 GenerateLanguageContainment.class,
150                                 "wikidata-childToParent",
151                                 code -> showNameAndCode(code),
152                                 code -> showNameAndCode(code));
153 
154             } catch (Throwable t) {
155                 t.printStackTrace();
156                 throw new RuntimeException(t);
157             }
158         }
159 
getEntityName(String key)160         String getEntityName(String key) {
161             String code = getEntityCode(key);
162             if (code != null) {
163                 try {
164                     String name = NAME.apply(code);
165                     if (name != null) {
166                         return name;
167                     }
168                 } catch (Exception e) {
169                     // TODO: Why would NAME.apply throw?
170                     // TODO: Need better handling here?
171                 }
172             }
173             String name = entityToLabel.get(key);
174             if (name != null) {
175                 return name;
176             }
177             return afterLastSlash(key);
178         }
179 
getEntityCode(String key)180         private String getEntityCode(String key) {
181             return entityToCode == null ? null : entityToCode.get(key);
182         }
183 
afterLastSlash(String key)184         private String afterLastSlash(String key) {
185             return key.substring(key.lastIndexOf('/') + 1, key.length() - 1);
186         }
187 
writeTsvs()188         public void writeTsvs() throws IOException {
189             TsvWriter.writeTsv("childToParent.tsv", childToParent, "child", "parent");
190             TsvWriter.writeTsv("entityToCode.tsv", entityToCode, "lang", "langCode");
191             TsvWriter.writeTsv("entityToLabel.tsv", entityToLabel, "lang", "langLabel");
192             SortedSetMultimap<String, String> childToParentWithCodes = TreeMultimap.create();
193             for (Entry<String, String> entry : childToParent.entries()) {
194                 String child = entry.getKey();
195                 String parent = entry.getValue();
196                 childToParentWithCodes.put(showNameAndCode(child), showNameAndCode(parent));
197             }
198             TsvWriter.writeTsv(
199                     "childToParentWithCodes.tsv",
200                     childToParentWithCodes,
201                     "childCode\tLabel",
202                     "parentCode\tLabel");
203         }
204 
showNameAndCode(String qid)205         public String showNameAndCode(String qid) {
206             return getEntityName(qid)
207                     + " ("
208                     + (getEntityCode(qid) == null ? "" : getEntityCode(qid) + ", ")
209                     + afterLastSlash(qid)
210                     + ")";
211         }
212 
showNameAndCode(T qids)213         public <T extends Iterable<String>> String showNameAndCode(T qids) {
214             StringBuilder b = new StringBuilder();
215             qids.forEach(
216                     qid -> {
217                         if (b.length() != 0) b.append(", ");
218                         b.append(showNameAndCode(qid));
219                     });
220             return b.toString();
221         }
222 
showNameAndCode2(U qids)223         public <T extends Iterable<String>, U extends Iterable<T>> String showNameAndCode2(U qids) {
224             StringBuilder b = new StringBuilder();
225             qids.forEach(
226                     qid -> {
227                         if (b.length() != 0) b.append("; ");
228                         b.append(showNameAndCode(qid));
229                     });
230             return b.toString();
231         }
232     }
233 
234     static final QueryHelper QUERY_HELPER = new QueryHelper();
235 
236     static final Function<String, String> NAME =
237             code ->
238                     code.equals(LocaleNames.MUL)
239                             ? LocaleNames.ROOT
240                             : ENGLISH.getName(code) + " (" + code + ")";
241 
242     static final Set<String> COLLECTIONS;
243 
244     static {
245         Map<String, Map<LstrField, String>> languages =
246                 StandardCodes.getEnumLstreg().get(LstrType.language);
247         Builder<String> _collections = ImmutableSet.<String>builder();
248         for (Entry<String, Map<LstrField, String>> e : languages.entrySet()) {
249             String scope = e.getValue().get(LstrField.Scope);
250             if (scope != null && "Collection".equalsIgnoreCase(scope)) {
e.getKey()251                 _collections.add(e.getKey());
252             }
253         }
254         COLLECTIONS = _collections.build();
255     }
256 
    /**
     * Placeholder tree structure. NOTE(review): in the code visible here, {@code leaves} is never
     * written to and {@code add} stores nothing - confirm whether this class is still needed.
     */
    static class Tree {
        Set<String> leaves = new LinkedHashSet<>();

        /** Reverses the given chain in place; the caller's list is modified. */
        void add(List<String> chain) {
            Collections.reverse(chain);
        }
    }
264 
    /**
     * Parent-to-child relations (key = parent code, value = child code) that override what is
     * derived from Wikidata. In run(), each entry that is not already present is added to the
     * containment data, and every other parent of that child is then removed. {@code
     * LocaleNames.MUL} is used as the parent for codes that have no containing language family.
     */
    static final Multimap<String, String> RESET_PARENT_CHILDREN =
            ImmutableMultimap.<String, String>builder()
                    .put(LocaleNames.MUL, LocaleNames.UND) // anomaly
                    .put(LocaleNames.MUL, "art") // no containing language family
                    .put(LocaleNames.MUL, "euq") // no containing language family
                    .put(LocaleNames.MUL, "jpx") // no containing language family
                    .put(LocaleNames.MUL, "tai") // no containing language family
                    .put(
                            LocaleNames.MUL,
                            "ko") // no containing language family (Altaic is too controversial)
                    .put(LocaleNames.MUL, "crp") // no containing language family
                    .put(LocaleNames.MUL, "kgp") // no containing language family
                    .put("alv", "agq")
                    .put("alv", "cch") // Atlantic–Congo <= cch [Atsam]
                    .put("alv", "kcg") // Atlantic–Congo <= kcg [Tyap]
                    .put("alv", "ken") // Atlantic–Congo <= ken [Kenyang]
                    .put("alv", "ngb")
                    .put("alv", "yav")
                    .put("ber", "zgh")
                    .put("bnt", "asa")
                    .put("bnt", "bez")
                    .put("bnt", "cgg")
                    .put("bnt", "ebu")
                    .put("bnt", "jmc")
                    .put("bnt", "ksb")
                    .put("bnt", "lag")
                    .put("bnt", "mer")
                    .put("bnt", "mgh")
                    .put("bnt", "nmg")
                    .put("bnt", "rof")
                    .put("bnt", "rwk")
                    .put("bnt", "sbp")
                    .put("bnt", "seh")
                    .put("bnt", "vun")
                    .put("bnt", "xog")
                    .put("cpp", "kea")
                    .put("euq", "eu")
                    .put("gmw", "ksh") // gmw = West Germanic
                    .put("gmw", "lb")
                    .put("gmw", "wae")
                    .put("grk", "el")
                    .put("grk", "gmy")
                    .put("grk", "grc")
                    .put("ira", "lrc")
                    .put("ira", "bgn") // Iranian <= Western Balochi
                    .put("inc", "trw") // Indo-Aryan <= Torwali
                    .put("jpx", "ja")
                    .put("ngb", "sg")
                    .put("roa", "cpf")
                    .put("roa", "cpp")
                    .put("sdv", "saq")
                    .put("son", "khq")
                    .put("sw", "swc")
                    .put("tai", "blt") // tai [Tai] <= blt [Tai Dam]
                    .put("tai", "lo")
                    .put("tai", "th")
                    .put("zlw", "szl") // West Slavic <= Silesian

                    // Restoring languages removed in 2024-08 wikidata
                    .put("inc", "ur") // Urdu is indic
                    .put("inc", "pa") // Punjabi is indic
                    .put("inc", "skr") // Saraiki is indic
                    .put("zls", "bs") // South Slavic (sh has problems)
                    .put("zls", "hr") // South Slavic (sh has problems)
                    .put("zls", "sr") // South Slavic (sh has problems)
                    .put("inc", "hi") // Indic
                    .put("inc", "kok") // Indic
                    .put("inc", "ks") // Indic
                    .put("inc", "mr") // Indic
                    .put("inc", "sd") // Indic
                    .put("cr", "csw") // Cree
                    .put("tai", "za") // Tai
                    .put("fiu", "hu") // Finno-Ugric
                    .put("alg", "cr") // Algonquin
                    .put("sit", "bo") // Sino-Tibetan
                    .put("poz", "mg") // Malayo-Polynesian languages
                    .put("esx", "iu") // Eskimo-Aleut languages
                    .put("esx", "kl") // Eskimo-Aleut languages
                    .build();
345 
    /**
     * To remove parent-child relations from Wikidata, eg if a child has two parents (where that
     * causes problems). Don't do it if there is an explicit parent above.
     *
     * <p>Key = parent code, value = child code. A child value of "*" removes all children of that
     * parent. The trailing comments name the parent that should apply instead.
     */
    static final Multimap<String, String> REMOVE_PARENT_CHILDREN =
            ImmutableMultimap.<String, String>builder()
                    .put("alv", "ukg") // ngf [Trans-New Guinea languages] <= ukg [Ukuriguma]
                    .put(
                            "crp",
                            "*") // general Creole group interferes with French/Spanish/... language
                    // grouping
                    .put("cus", "mhd") // bnt [Bantu] <= mhd [Mbugu] (not cus [Cushitic])
                    .put("gmw", "pih") // cpe [Creoles and pidgins, English based] <= pih
                    // [Pitcairn-Norfolk]
                    .put("inc", "rmg")
                    // Indo-European
                    .put("nic", "kcp") // ssa [Nilo-Saharan] <= kcp [Kanga]
                    .put("nic", "kec") // ssa [Nilo-Saharan] <= kec [Keiga]
                    .put("nic", "kgo") // ssa [Nilo-Saharan] <= kgo [Krongo]
                    .put("nic", "tbr") // ssa [Nilo-Saharan] <= tbr [Tumtum]
                    .put("nic", "tey") // ssa [Nilo-Saharan] <= tey [Tulishi]
                    .put("sit", "dz") // sit <= tbq <= dz
                    .put("sit", "zh")
                    .put("sla", "cu")
                    .put("tbq", "psq") // paa [Papuan]; for psq [Pasi] - not tbq [Tibeto-Burman
                    // languages]; (There is also a variety of the Sino-Tibetan Adi
                    // language called Pasi.
                    .build();
374 
375     static {
376         // If a child is in RESET_PARENT_CHILDREN, it should not be in
377         // REMOVE_PARENT_CHILDREN
378         // That is because the RESET_PARENT_CHILDREN will cause the removal of any other
379         // parents anyway.
380         SetView<String> bad =
381                 Sets.intersection(
382                         Set.copyOf(RESET_PARENT_CHILDREN.values()),
383                         Set.copyOf(REMOVE_PARENT_CHILDREN.values()));
384         if (!bad.isEmpty())
385             System.err.println(
386                     "Remove from REMOVE_PARENT_CHILDREN, child values: \""
387                             + Joiner.on("\",\"").join(bad)
388                             + "\"");
389     }
390 
main(String[] args)391     public static void main(String[] args) throws IOException {
392         new GenerateLanguageContainment().run(args);
393         if (Containment.hadErrors) {
394             System.err.println("ERROR: Containment Errors detected, see errors above.");
395             System.exit(1);
396         }
397     }
398 
run(String[] args)399     void run(String[] args) throws IOException {
400         if (true) {
401             // check on items
402             for (String check : Arrays.asList("sw", "km", "ksh", "wae", "kea", "mfe", "th", "lo")) {
403                 System.out.println("Checking " + ENGLISH.getName(check) + "[" + check + "]");
404                 Collection<String> entities = QUERY_HELPER.codeToEntity.get(check);
405                 if (entities.isEmpty()) {
406                     System.out.println("no code for " + check + ": " + entities);
407                     continue;
408                 }
409                 for (String entity : entities) {
410                     Set<List<String>> ancestors = getAllAncestors(entity);
411                     showEntityLists(entity + " parents ", ancestors);
412                     System.out.println();
413                 }
414             }
415         }
416 
417         Map<Status, Set<String>> table = Validity.getInstance().getStatusToCodes(LstrType.language);
418         TreeMultimap<String, String> _parentToChild = TreeMultimap.create();
419         TreeSet<String> missing = new TreeSet<>(table.get(Status.regular));
420         _parentToChild.put(LocaleNames.MUL, LocaleNames.UND);
421         Set<String> skipping = new LinkedHashSet<>();
422         for (String code : table.get(Status.regular)) {
423             if (ONLY_LIVING) {
424                 Type type = Iso639Data.getType(code);
425                 if (type != Type.Living) {
426                     continue;
427                 }
428             }
429             if (code.compareTo("hdz") > 0) {
430                 int debug = 0;
431             }
432             // if (COLLECTIONS.contains(code)) {
433             // continue;
434             // }
435             Collection<String> entities = QUERY_HELPER.codeToEntity.get(code);
436             if (entities.isEmpty()) {
437                 continue;
438             }
439             for (String entity : entities) {
440                 if (QUERY_HELPER.childToParent.get(entity).isEmpty()) {
441                     continue;
442                 }
443                 Set<Set<String>> chains = getAncestors(entity, skipping);
444                 if (chains.size() > 1) {
445                     int debug = 0;
446                 }
447                 for (Set<String> chain : chains) {
448                     String last = null;
449                     for (String link : chain) {
450                         if (last != null) {
451                             _parentToChild.put(link, last);
452                         }
453                         last = link;
454                     }
455                 }
456             }
457         }
458         System.out.println("Writing " + "skippingCodes.tsv");
459         try (PrintWriter w =
460                 FileUtilities.openUTF8Writer(TsvWriter.getTsvDir(), "skippingCodes.tsv")) {
461             // TsvWriter.writeRow(w, "childCode\tLabel", "parentCode\tLabel"); // header
462             skipping.forEach(e -> w.println(e));
463         }
464 
465         // preflight
466         DiffLanguageGroups.show("en");
467 
468         Multimap<String, String> _childToParents =
469                 Multimaps.invertFrom(_parentToChild, TreeMultimap.create());
470 
471         System.out.println("\nOVERRIDE Remove parent");
472         System.out.println("OVERRIDE\tParent\tChild\tNew Parents");
473         for (Entry<String, String> entry : REMOVE_PARENT_CHILDREN.entries()) {
474             final String parent = entry.getKey();
475             final String child = entry.getValue();
476             Set<String> oldChildren = _parentToChild.get(parent);
477             String type;
478             if (child.equals("*")) {
479                 if (oldChildren == null) {
480                     type = "No remove";
481                 } else {
482                     type = "Removing parent";
483                     _parentToChild.removeAll(parent);
484                     _childToParents = Multimaps.invertFrom(_parentToChild, TreeMultimap.create());
485                 }
486             } else {
487                 if (oldChildren != null && oldChildren.contains(child)) {
488                     _parentToChild.remove(parent, child);
489                     _childToParents = Multimaps.invertFrom(_parentToChild, TreeMultimap.create());
490                     type = "Removing parent";
491                 } else {
492                     type = "No remove";
493                 }
494             }
495             System.out.println(
496                     JOIN_TAB.join(
497                             type,
498                             DiffLanguageGroups.show(parent),
499                             DiffLanguageGroups.show(child),
500                             _childToParents.get(child)));
501         }
502 
503         System.out.println("\nOVERRIDE Replace Parent");
504         System.out.println("OVERRIDE\tParent\tChild");
505         for (Entry<String, String> entry : RESET_PARENT_CHILDREN.entries()) {
506             final String parent = entry.getKey();
507             final String child = entry.getValue();
508             Set<String> oldValues = _parentToChild.get(parent);
509             Set<String> removals = new LinkedHashSet<>();
510 
511             String type;
512             if (oldValues != null && oldValues.contains(child)) {
513                 type = "Redundant add";
514             } else {
515                 type = "Changing";
516                 _parentToChild.put(parent, child);
517                 _childToParents = Multimaps.invertFrom(_parentToChild, TreeMultimap.create());
518                 Collection<String> newParents = _childToParents.get(child);
519                 if (newParents.size() > 1) {
520                     for (String parent2 : newParents) {
521                         if (!parent2.equals(parent)) {
522                             _parentToChild.remove(parent2, child);
523                             removals.add(parent2);
524                         }
525                         // rebuild
526                         _childToParents =
527                                 Multimaps.invertFrom(_parentToChild, TreeMultimap.create());
528                     }
529                 }
530             }
531             System.out.println(
532                     JOIN_TAB.join(
533                             type,
534                             DiffLanguageGroups.show(parent),
535                             DiffLanguageGroups.show(child),
536                             _childToParents.get(child),
537                             removals));
538         }
539 
540         // special code for artificial
541         for (String code : Iso639Data.getAvailable()) {
542             Type type = Iso639Data.getType(code);
543             if (type == Type.Constructed) {
544                 _parentToChild.put("art", code);
545             }
546         }
547 
548         Multimap<String, String> parentToChild = ImmutableMultimap.copyOf(_parentToChild);
549         Multimap<String, String> childToParent =
550                 ImmutableMultimap.copyOf(
551                         Multimaps.invertFrom(parentToChild, TreeMultimap.create()));
552         System.out.println(
553                 "Checking " + "he" + "\t" + Containment.getAllDirected(childToParent, "he"));
554 
555         try (PrintWriter w =
556                 FileUtilities.openUTF8Writer(TsvWriter.getTsvDir(), "RawLanguageContainment.txt")) {
557             print(w, parentToChild, new ArrayList<>(Arrays.asList(LocaleNames.MUL)));
558         }
559         SimpleXMLSource xmlSource = new SimpleXMLSource("languageGroup");
560         xmlSource.setNonInheriting(true); // should be gotten from DtdType...
561         CLDRFile newFile = new CLDRFile(xmlSource);
562         newFile.setDtdType(DtdType.supplementalData);
563         newFile.add("//" + DtdType.supplementalData + "/version[@number='$Revision$']", "");
564         printXML(newFile, parentToChild);
565 
566         try (PrintWriter outFile =
567                 FileUtilities.openUTF8Writer(
568                         CLDRPaths.SUPPLEMENTAL_DIRECTORY, "languageGroup.xml")) {
569             newFile.write(outFile);
570         } catch (IOException e1) {
571             throw new ICUUncheckedIOException("Can't write to languageGroup.xml", e1);
572         }
573 
574         // for (Entry<String,String> entry : childToParent.entries()) {
575         // String childNames = getName(entityToCode, entityToLabel, entry.getKey());
576         // String parentNames = getName(entityToCode, entityToLabel, entry.getValue());
577         // System.out.println(entry.getKey() + "\t" + entry.getValue() + "\t" +
578         // childNames + "\t" + parentNames);
579         // }
580         QUERY_HELPER.writeTsvs();
581         DiffLanguageGroups.main(new String[] {});
582     }
583 
showEntityLists(String title, Set<List<String>> ancestors)584     private static void showEntityLists(String title, Set<List<String>> ancestors) {
585         ancestors.forEach(
586                 new Consumer<List<String>>() {
587                     @Override
588                     public void accept(List<String> item) {
589                         item.forEach(
590                                 new Consumer<String>() {
591                                     @Override
592                                     public void accept(String t) {
593                                         System.out.println(
594                                                 t
595                                                         + "\t"
596                                                         + QUERY_HELPER.entityToCode.get(t)
597                                                         + "\t"
598                                                         + QUERY_HELPER.entityToLabel.get(t));
599                                     }
600                                 });
601                         System.out.println();
602                     }
603                 });
604     }
605 
printXML(CLDRFile newFile, Multimap<String, String> parentToChild)606     private static void printXML(CLDRFile newFile, Multimap<String, String> parentToChild) {
607         printXML(newFile, parentToChild, LocaleNames.MUL);
608     }
609 
printXML( CLDRFile newFile, Multimap<String, String> parentToChild, String base)610     private static void printXML(
611             CLDRFile newFile, Multimap<String, String> parentToChild, String base) {
612         Collection<String> children = parentToChild.get(base);
613         if (children.isEmpty()) {
614             return;
615         }
616         if (base.equals(LocaleNames.UND)) {
617             // skip, no good info
618         } else {
619             newFile.add(
620                     "//"
621                             + DtdType.supplementalData
622                             + "/languageGroups/languageGroup[@parent=\""
623                             + base
624                             + "\"]",
625                     Joiner.on(" ").join(children));
626         }
627         for (String child : children) {
628             printXML(newFile, parentToChild, child);
629         }
630     }
631 
print( Writer out, Multimap<String, String> parentToChild, List<String> line)632     private static void print(
633             Writer out, Multimap<String, String> parentToChild, List<String> line) {
634         String current = line.get(line.size() - 1);
635         Collection<String> children = parentToChild.get(current);
636         if (children.isEmpty()) {
637             try {
638                 String sep = "";
639                 for (String item : line) {
640                     out.append(sep).append(NAME.apply(item));
641                     sep = " > ";
642                 }
643                 out.append('\n');
644                 out.flush();
645             } catch (IOException e) {
646             }
647         } else {
648             for (String child : children) {
649                 line.add(child);
650                 print(out, parentToChild, line);
651                 line.remove(line.size() - 1);
652             }
653         }
654     }
655 
getAncestors(String leaf, Set<String> skipping)656     private static Set<Set<String>> getAncestors(String leaf, Set<String> skipping) {
657         Set<List<String>> items = Containment.getAllDirected(QUERY_HELPER.childToParent, leaf);
658         Set<Set<String>> itemsFixed = new LinkedHashSet<>();
659         main:
660         for (List<String> item : items) {
661             Set<String> chain = new LinkedHashSet<>();
662             for (String id : item) {
663                 String code = QUERY_HELPER.entityToCode.get(id);
664                 if (code == null) {
665                     continue;
666                 }
667 
668                 // skip leaf nodes after the first
669 
670                 if (!chain.isEmpty() && !COLLECTIONS.contains(code)) {
671                     if (code.equals("zh")) {
672                         code = "zhx"; // rewrite collections usage
673                     } else {
674                         skipping.add(
675                                 "Skipping inheritance from\t"
676                                         + chain
677                                         + "\t"
678                                         + code
679                                         + "\tfrom\t"
680                                         + QUERY_HELPER.showNameAndCode2(items));
681                         continue;
682                     }
683                 }
684 
685                 // check for cycle, and skip if we have one
686 
687                 boolean changed = chain.add(code);
688                 if (!changed) {
689                     log("Cycle in\t" + chain + "\tfrom\t" + QUERY_HELPER.showNameAndCode2(items));
690                     continue main;
691                 }
692             }
693             if (chain.size() > 1) {
694                 chain.add(LocaleNames.MUL); // root
695                 itemsFixed.add(chain);
696             }
697         }
698         // remove subsets
699         // eg [[smp, he, mul], [smp, he, sem, afa, mul]]
700         // => [[smp, he, sem, afa, mul]]
701         if (itemsFixed.size() > 1) {
702             Set<Set<String>> removals = new HashSet<>();
703             for (Set<String> chain1 : itemsFixed) {
704                 for (Set<String> chain2 : itemsFixed) {
705                     if (chain1.containsAll(chain2) && !chain2.containsAll(chain1)) {
706                         removals.add(chain2);
707                     }
708                 }
709             }
710             itemsFixed.removeAll(removals);
711         }
712         return itemsFixed;
713         // TODO: delete this commented-out code?
714         // while (true) {
715         // String code = entityToCode.get(leaf);
716         // if (code != null) {
717         // chain.add(code);
718         // }
719         // Collection<String> parents = childToParent.get(leaf);
720         // if (parents.isEmpty()) {
721         // // clean up duplicates
722         // chain = new ArrayList<>(new LinkedHashSet<>(chain));
723         // // wikipedia has non-collections as parents. Remove those if they are not
724         // first.
725         // break;
726         // }
727         // leaf = getBest(parents);
728         // }
729         // String last = chain.get(0);
730         // for (int i = 1; i < chain.size(); ++i) {
731         // String item = chain.get(i);
732         // if (!COLLECTIONS.contains(item)) {
733         // chain.set(i, item.equals("zh") ? "zhx" : "");
734         // DROPPED_PARENTS_TO_CHILDREN.put(item, last);
735         // } else {
736         // last = item;
737         // }
738         // }
739         // chain.removeIf(x -> x.isEmpty());
740         // if ("zh".equals(chain.get(0))) {
741         // chain.add(1,"zhx");
742         // }
743         // last = chain.get(chain.size()-1);
744         // if (!LocaleNames.MUL.equals(last)) {
745         // chain.add(LocaleNames.MUL); // make sure we have root.
746         // }
747         // if (chain.size() == 2) {
748         // chain.add(1,LocaleNames.UND);
749         // }
750         // return chain;
751     }
752 
log(String string)753     private static void log(String string) {
754         System.out.println(string);
755         // for (Entry<String, String> e : DROPPED_PARENTS_TO_CHILDREN.entries()) {
756         // System.out.println(NAME.apply(e.getKey()) + "\t" + NAME.apply(e.getValue())
757         // );
758         // }
759     }
760 
761     // TODO: This function is only called by other commented-out code above.
762     // private static String getBest(Collection<String> parents) {
763     // for (String parent : parents) {
764     // String code = QUERY_HELPER.entityToCode.get(parent);
765     // if (code == null) continue;
766     // Type type = Iso639Data.getType(code);
767     // if (type != Type.Living) {
768     // continue;
769     // }
770     // return parent;
771     // }
772     // // failed
773     // return parents.iterator().next();
774     // }
775 
loadQueryPairs( Class<?> class1, String file, Function<String, String> keyMapper, Function<String, String> valueMapper)776     private static Multimap<String, String> loadQueryPairs(
777             Class<?> class1,
778             String file,
779             Function<String, String> keyMapper,
780             Function<String, String> valueMapper)
781             throws IOException {
782         System.out.println("QUERY: " + file);
783         ResultSet rs = queryClient.execSelectFromSparql(file, QueryClient.WIKIDATA_SPARQL_SERVER);
784         // the query must return exactly two variables.
785         List<String> resultVars = rs.getResultVars();
786         assertTwoVars(resultVars);
787         final String keyName = resultVars.get(0);
788         final String valueName = resultVars.get(1);
789 
790         ImmutableMultimap.Builder<String, String> _keyToValues = ImmutableMultimap.builder();
791         for (; rs.hasNext(); ) {
792             final QuerySolution qs = rs.next();
793             String key = QueryClient.getStringOrNull(qs, keyName);
794             String value = QueryClient.getStringOrNull(qs, valueName);
795             _keyToValues.put(key, value);
796         }
797         ImmutableMultimap<String, String> result = _keyToValues.build();
798         showDups(file, result, keyMapper, valueMapper);
799         System.out.println("LOADED: " + file + " with rows " + rs.getRowNumber());
800         return result;
801     }
802 
803     /**
804      * Assuming that the SPARQL query returns exactly 2 results, treat them as Key=Value.
805      *
806      * @param class1
807      * @param file name of a sparql query, such as 'wikidata-childToParent'
808      * @param fixValue
809      * @param keyMapper
810      * @param valueMapper
811      * @return
812      * @throws IOException
813      */
loadQueryPairsUnique( Class<?> class1, String file, Function<String, String> fixValue, Function<String, String> keyMapper, Function<String, String> valueMapper)814     private static Map<String, String> loadQueryPairsUnique(
815             Class<?> class1,
816             String file,
817             Function<String, String> fixValue,
818             Function<String, String> keyMapper,
819             Function<String, String> valueMapper)
820             throws IOException {
821 
822         System.out.println("QUERY: " + file);
823         ResultSet rs = queryClient.execSelectFromSparql(file, QueryClient.WIKIDATA_SPARQL_SERVER);
824 
825         // the query must return exactly two variables.
826         List<String> resultVars = rs.getResultVars();
827         assertTwoVars(resultVars);
828         final String keyName = resultVars.get(0);
829         final String valueName = resultVars.get(1);
830 
831         Map<String, String> _keyToValue = new TreeMap<>();
832         Multimap<String, String> _keyToValues = TreeMultimap.create();
833         for (; rs.hasNext(); ) {
834             final QuerySolution qs = rs.next();
835             String key = QueryClient.getStringOrNull(qs, keyName);
836             String value = QueryClient.getStringOrNull(qs, valueName);
837             if (fixValue != null) {
838                 value = fixValue.apply(value);
839             }
840             _keyToValues.put(key, value);
841             String oldValue = _keyToValue.get(key);
842             if (oldValue == null || oldValue.equals("kxm")) {
843                 _keyToValue.put(key, value);
844             }
845         }
846         _keyToValue = ImmutableMap.copyOf(_keyToValue);
847         showDups(file, _keyToValues, keyMapper, valueMapper);
848         System.out.println("LOADED: " + file + " with rows " + rs.getRowNumber());
849         return _keyToValue;
850     }
851 
assertTwoVars(List<String> resultVars)852     private static void assertTwoVars(List<String> resultVars) {
853         if (resultVars.size() != 2) {
854             throw new IllegalArgumentException(
855                     "expected 2 result vars but got " + resultVars.size() + ": " + resultVars);
856         }
857     }
858 
showDups( String file, Multimap<String, String> _keyToValues, Function<String, String> keyMapper, Function<String, String> valueMapper)859     private static void showDups(
860             String file,
861             Multimap<String, String> _keyToValues,
862             Function<String, String> keyMapper,
863             Function<String, String> valueMapper) {
864         for (Entry<String, Collection<String>> entry : _keyToValues.asMap().entrySet()) {
865             Collection<String> valueSet = entry.getValue();
866             if (valueSet.size() > 1) {
867                 String key = entry.getKey();
868                 key = keyMapper == null ? key : keyMapper.apply(key);
869                 if (valueMapper != null) {
870                     Set<String> result = new LinkedHashSet<>();
871                     valueSet.stream().map(valueMapper).forEach(x -> result.add(x));
872                     valueSet = result;
873                 }
874                 log(file + "\tMultiple values: " + key + "\t" + valueSet);
875             }
876         }
877     }
878 
getAllAncestors(String lang)879     static Set<List<String>> getAllAncestors(String lang) {
880         return Containment.getAllDirected(QUERY_HELPER.childToParent, lang);
881     }
882 }
883