• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 package org.unicode.cldr.tool;
2 
3 import java.io.IOException;
4 import java.io.PrintWriter;
5 import java.io.Writer;
6 import java.util.ArrayList;
7 import java.util.Arrays;
8 import java.util.Collection;
9 import java.util.Collections;
10 import java.util.HashSet;
11 import java.util.LinkedHashSet;
12 import java.util.List;
13 import java.util.Map;
14 import java.util.Map.Entry;
15 import java.util.Set;
16 import java.util.TreeMap;
17 import java.util.TreeSet;
18 import java.util.function.Consumer;
19 import java.util.function.Function;
20 
21 import org.unicode.cldr.draft.FileUtilities;
22 import org.unicode.cldr.util.CLDRConfig;
23 import org.unicode.cldr.util.CLDRFile;
24 import org.unicode.cldr.util.CLDRPaths;
25 import org.unicode.cldr.util.Containment;
26 import org.unicode.cldr.util.DtdType;
27 import org.unicode.cldr.util.Iso639Data;
28 import org.unicode.cldr.util.Iso639Data.Type;
29 import org.unicode.cldr.util.SimpleXMLSource;
30 import org.unicode.cldr.util.StandardCodes;
31 import org.unicode.cldr.util.StandardCodes.LstrField;
32 import org.unicode.cldr.util.StandardCodes.LstrType;
33 import org.unicode.cldr.util.Validity;
34 import org.unicode.cldr.util.Validity.Status;
35 
36 import com.google.common.base.Joiner;
37 import com.google.common.base.Splitter;
38 import com.google.common.collect.ImmutableMap;
39 import com.google.common.collect.ImmutableMultimap;
40 import com.google.common.collect.ImmutableSet;
41 import com.google.common.collect.ImmutableSet.Builder;
42 import com.google.common.collect.LinkedHashMultimap;
43 import com.google.common.collect.Multimap;
44 import com.google.common.collect.Multimaps;
45 import com.google.common.collect.TreeMultimap;
46 import com.ibm.icu.impl.Row.R2;
47 import com.ibm.icu.util.ICUUncheckedIOException;
48 
49 public class GenerateLanguageContainment {
50     private static final boolean ONLY_LIVING = false;
51     private static final CLDRConfig CONFIG = CLDRConfig.getInstance();
52     static final Splitter TAB = Splitter.on('\t').trimResults();
53     static final CLDRFile ENGLISH = CONFIG.getEnglish();
54     static final String relDir = "../util/data/languages/";
55     static final Map<String, R2<List<String>, String>> ALIAS_MAP = CONFIG
56         .getSupplementalDataInfo()
57         .getLocaleAliasInfo()
58         .get("language");
59     static final Map<String, String> entityToLabel = loadTsvPairsUnique(GenerateLanguageContainment.class, relDir + "entityToLabel.tsv",
60         null, null, null);
61 
62     static final Function<String, String> NAME = code -> code.equals("mul") ? "root" : ENGLISH.getName(code) + " (" + code + ")";
63 
64     static final Map<String, String> entityToCode = loadTsvPairsUnique(GenerateLanguageContainment.class, relDir + "entityToCode.tsv",
65         code -> {
66             code = code.replace("\"", "");
67             R2<List<String>, String> v = ALIAS_MAP.get(code);
68             String result = v == null
69                 ? code : v.get0().get(0);
70             result = result.contains("_")
71                 ? code
72                 : result;
73             return result;
74         },
75         null, NAME);
76 
77     static final Multimap<String, String> codeToEntity = ImmutableMultimap.copyOf(
78         Multimaps.invertFrom(Multimaps.forMap(entityToCode), LinkedHashMultimap.create()));
79 
80     static final Multimap<String, String> childToParent = loadTsvPairs(GenerateLanguageContainment.class, relDir + "childToParent.tsv",
81         code -> getEntityName(code), code -> getEntityName(code));
82 
83     static final Set<String> COLLECTIONS;
84     static {
85         Map<String, Map<LstrField, String>> languages = StandardCodes.getEnumLstreg().get(LstrType.language);
86         Builder<String> _collections = ImmutableSet.<String> builder();
87         for (Entry<String, Map<LstrField, String>> e : languages.entrySet()) {
88             String scope = e.getValue().get(LstrField.Scope);
89             if (scope != null
90                 && "Collection".equalsIgnoreCase(scope)) {
e.getKey()91                 _collections.add(e.getKey());
92             }
93         }
94         COLLECTIONS = _collections.build();
95     }
96 
    /**
     * Placeholder for a containment tree. Nothing in the visible source instantiates or references it.
     */
    static class Tree {
        // Declared but never read or written by the visible code.
        Set<String> leaves = new LinkedHashSet<>();

        /**
         * Reverses {@code chain} in place and does nothing else.
         * NOTE(review): the caller's list is mutated and {@link #leaves} is never updated;
         * this looks unfinished. Confirm the intent before relying on it.
         *
         * @param chain list to reverse; must be modifiable
         */
        void add(List<String> chain) {
            Collections.reverse(chain);
        }
    }
104 
    /**
     * Hand-maintained parent to child pairs. {@code main} adds all of them to the computed
     * containment with {@code putAll}, after the {@code REMOVE_PARENT_CHILDREN} removals have been applied.
     */
    static final Multimap<String, String> EXTRA_PARENT_CHILDREN = ImmutableMultimap.<String, String> builder()
        .put("mul", "art") // the children of art are added programmatically in main
        .put("gmw", "ksh")
        .put("gmw", "wae")
        .put("mul", "tai")
        .put("tai", "th")
        .put("tai", "lo")
        .put("roa", "cpf")
        .put("roa", "cpp")
        .put("ber", "zgh")
        .put("sdv", "saq")
        .put("sw", "swc")
        .put("alv", "agq")
        .put("bnt", "asa")
        .put("bnt", "bez")
        .put("bnt", "cgg")
        .put("bnt", "ebu")
        .put("bnt", "ksb")
        .put("bnt", "lag")
        .put("bnt", "rof")
        .put("bnt", "sbp")
        .put("ngb", "sg")
        .put("alv", "ngb")
        .put("bnt", "jmc")
        .put("bnt", "mer")
        .put("bnt", "mgh")
        .put("bnt", "nmg")
        .put("bnt", "rwk")
        .put("bnt", "seh")
        .put("bnt", "vun")
        .put("bnt", "xog")
        .put("alv", "yav")
        .put("son", "khq")
        .put("euq", "eu")
        .put("mul", "euq")
        .put("mul", "jpx")
        .put("jpx", "ja")
        .put("ira", "lrc")
        .put("grk", "el")
        .put("grk", "grc")
        .put("grk", "gmy")
        .build();
147 
    /**
     * Parent to child pairs that {@code main} removes from the computed containment before the
     * {@code EXTRA_PARENT_CHILDREN} pairs are added. A child of {@code "*"} removes every child of that parent.
     */
    static final Multimap<String, String> REMOVE_PARENT_CHILDREN = ImmutableMultimap.<String, String> builder()
        .put("mul", "und") // anomaly
        .put("mul", "crp")
        .put("crp", "*") // the general Creole group interferes with the French/Spanish/... language grouping
        .put("sit", "zh") // the remaining entries remove items that are added in a different place above
        .put("inc", "rmg")
        .put("sla", "cu")
        .put("ine", "gmy")
        .put("ine", "el")
        .put("ine", "grc")
        .build();
159 
main(String[] args)160     public static void main(String[] args) {
161         if (true) {
162             // check on items
163             for (String check : Arrays.asList("sw", "km", "ksh", "wae", "kea", "mfe", "th", "lo")) {
164                 System.out.println("Checking " + ENGLISH.getName(check) + "[" + check + "]");
165                 Collection<String> entities = codeToEntity.get(check);
166                 if (entities.isEmpty()) {
167                     System.out.println("no code for " + check + ": " + entities);
168                     continue;
169                 }
170                 for (String entity : entities) {
171                     Set<List<String>> ancestors = getAllAncestors(entity);
172                     showEntityLists(entity + " parents ", ancestors);
173                     System.out.println();
174                 }
175             }
176         }
177 
178         Map<Status, Set<String>> table = Validity.getInstance().getStatusToCodes(LstrType.language);
179         TreeMultimap<String, String> _parentToChild = TreeMultimap.create();
180         TreeSet<String> missing = new TreeSet<>(table.get(Status.regular));
181         _parentToChild.put("mul", "und");
182         for (String code : table.get(Status.regular)) {
183             if (ONLY_LIVING) {
184                 Type type = Iso639Data.getType(code);
185                 if (type != Type.Living) {
186                     continue;
187                 }
188             }
189             if (code.compareTo("hdz") > 0) {
190                 int debug = 0;
191             }
192 //            if (COLLECTIONS.contains(code)) {
193 //                continue;
194 //            }
195             Collection<String> entities = codeToEntity.get(code);
196             if (entities.isEmpty()) {
197                 continue;
198             }
199             for (String entity : entities) {
200                 if (childToParent.get(entity).isEmpty()) {
201                     continue;
202                 }
203                 Set<Set<String>> chains = getAncestors(entity);
204                 if (chains.size() > 1) {
205                     int debug = 0;
206                 }
207                 for (Set<String> chain : chains) {
208                     String last = null;
209                     for (String link : chain) {
210                         if (last != null) {
211                             _parentToChild.put(link, last);
212                         }
213                         last = link;
214                     }
215                 }
216             }
217         }
218 
219         for (Entry<String, Collection<String>> entity : REMOVE_PARENT_CHILDREN.asMap().entrySet()) {
220             String key = entity.getKey();
221             for (String value : entity.getValue()) {
222                 if (value.equals("*")) {
223                     _parentToChild.removeAll(key);
224                 } else {
225                     _parentToChild.remove(key, value);
226                 }
227             }
228         }
229 
230         _parentToChild.putAll(EXTRA_PARENT_CHILDREN);
231 
232         // special code for artificial
233         for (String code : Iso639Data.getAvailable()) {
234             Type type = Iso639Data.getType(code);
235             if (type == Type.Constructed) {
236                 _parentToChild.put("art", code);
237             }
238         }
239 
240         Multimap<String, String> parentToChild = ImmutableMultimap.copyOf(_parentToChild);
241         Multimap<String, String> childToParent = ImmutableMultimap.copyOf(Multimaps.invertFrom(parentToChild, TreeMultimap.create()));
242         System.out.println("Checking " + "he" + "\t" + Containment.getAllDirected(childToParent, "he"));
243 
244         PrintWriter out = new PrintWriter(System.out);
245         print(out, parentToChild, new ArrayList<>(Arrays.asList("mul")));
246         System.out.println(out);
247         SimpleXMLSource xmlSource = new SimpleXMLSource("languageGroup");
248         xmlSource.setNonInheriting(true); // should be gotten from DtdType...
249         CLDRFile newFile = new CLDRFile(xmlSource);
250         newFile.setDtdType(DtdType.supplementalData);
251         newFile.add("//" + DtdType.supplementalData + "/version[@number='$Revision$']", "");
252         printXML(newFile, parentToChild);
253 
254         try (PrintWriter outFile = FileUtilities.openUTF8Writer(CLDRPaths.SUPPLEMENTAL_DIRECTORY, "languageGroup.xml")) {
255             newFile.write(outFile);
256         } catch (IOException e1) {
257             throw new ICUUncheckedIOException("Can't write to languageGroup.xml", e1);
258         }
259 
260 //        for (Entry<String,String> entry : childToParent.entries()) {
261 //            String childNames = getName(entityToCode, entityToLabel, entry.getKey());
262 //            String parentNames = getName(entityToCode, entityToLabel, entry.getValue());
263 //            System.out.println(entry.getKey() + "\t" + entry.getValue() + "\t" + childNames + "\t" + parentNames);
264 //        }
265     }
266 
showEntityLists(String title, Set<List<String>> ancestors)267     private static void showEntityLists(String title, Set<List<String>> ancestors) {
268         ancestors.forEach(new Consumer<List<String>>() {
269             @Override
270             public void accept(List<String> item) {
271                 item.forEach(new Consumer<String>() {
272                     @Override
273                     public void accept(String t) {
274                         System.out.println(t + "\t" + entityToCode.get(t) + "\t" + entityToLabel.get(t));
275                     }
276                 });
277                 System.out.println();
278             }
279         });
280     }
281 
    /**
     * Adds the language groups reachable from {@code "mul"} to {@code newFile}.
     *
     * @param newFile file that receives the {@code languageGroup} paths
     * @param parentToChild parent code to child codes
     */
    private static void printXML(CLDRFile newFile, Multimap<String, String> parentToChild) {
        printXML(newFile, parentToChild, "mul");
    }
285 
printXML(CLDRFile newFile, Multimap<String, String> parentToChild, String base)286     private static void printXML(CLDRFile newFile, Multimap<String, String> parentToChild, String base) {
287         Collection<String> children = parentToChild.get(base);
288         if (children.isEmpty()) {
289             return;
290         }
291         if (base.equals("und")) {
292             // skip, no good info
293         } else {
294             newFile.add("//" + DtdType.supplementalData + "/languageGroups/languageGroup[@parent=\"" + base + "\"]",
295                 Joiner.on(" ").join(children));
296         }
297         for (String child : children) {
298             printXML(newFile, parentToChild, child);
299         }
300     }
301 
print(Writer out, Multimap<String, String> parentToChild, List<String> line)302     private static void print(Writer out, Multimap<String, String> parentToChild, List<String> line) {
303         String current = line.get(line.size() - 1);
304         Collection<String> children = parentToChild.get(current);
305         if (children.isEmpty()) {
306             try {
307                 String sep = "";
308                 for (String item : line) {
309                     out.append(sep).append(NAME.apply(item));
310                     sep = " > ";
311                 }
312                 out.append('\n');
313                 out.flush();
314             } catch (IOException e) {
315             }
316         } else {
317             for (String child : children) {
318                 line.add(child);
319                 print(out, parentToChild, line);
320                 line.remove(line.size() - 1);
321             }
322         }
323     }
324 
getAncestors(String leaf)325     private static Set<Set<String>> getAncestors(String leaf) {
326         Set<List<String>> items = Containment.getAllDirected(childToParent, leaf);
327         Set<Set<String>> itemsFixed = new LinkedHashSet<>();
328         main: for (List<String> item : items) {
329             Set<String> chain = new LinkedHashSet<>();
330             for (String id : item) {
331                 String code = entityToCode.get(id);
332                 if (code == null) {
333                     continue;
334                 }
335 
336                 // skip leaf nodes after the first
337 
338                 if (!chain.isEmpty() && !COLLECTIONS.contains(code)) {
339                     if (code.equals("zh")) {
340                         code = "zhx"; // rewrite collections usage
341                     } else {
342                         log("Skipping inheritance from\t" + chain + "\t" + code + "\tfrom\t" + items);
343                         continue;
344                     }
345                 }
346 
347                 // check for cycle, and skip if we have one
348 
349                 boolean changed = chain.add(code);
350                 if (!changed) {
351                     log("Cycle in\t" + chain + "\tfrom\t" + items);
352                     continue main;
353                 }
354             }
355             if (chain.size() > 1) {
356                 chain.add("mul"); // root
357                 itemsFixed.add(chain);
358             }
359         }
360         // remove subsets
361         // eg [[smp, he, mul], [smp, he, sem, afa, mul]]
362         // => [[smp, he, sem, afa, mul]]
363         if (itemsFixed.size() > 1) {
364             Set<Set<String>> removals = new HashSet<>();
365             for (Set<String> chain1 : itemsFixed) {
366                 for (Set<String> chain2 : itemsFixed) {
367                     if (chain1.containsAll(chain2) && !chain2.containsAll(chain1)) {
368                         removals.add(chain2);
369                     }
370                 }
371             }
372             itemsFixed.removeAll(removals);
373         }
374         return itemsFixed;
375 //        while (true) {
376 //            String code = entityToCode.get(leaf);
377 //            if (code != null) {
378 //                chain.add(code);
379 //            }
380 //            Collection<String> parents = childToParent.get(leaf);
381 //            if (parents.isEmpty()) {
382 //                // clean up duplicates
383 //                chain = new ArrayList<>(new LinkedHashSet<>(chain));
384 //                // wikipedia has non-collections as parents. Remove those if they are not first.
385 //                break;
386 //            }
387 //            leaf = getBest(parents);
388 //        }
389 //        String last = chain.get(0);
390 //        for (int i = 1; i < chain.size(); ++i) {
391 //            String item = chain.get(i);
392 //            if (!COLLECTIONS.contains(item)) {
393 //                chain.set(i, item.equals("zh") ? "zhx" : "");
394 //                DROPPED_PARENTS_TO_CHILDREN.put(item, last);
395 //            } else {
396 //                last = item;
397 //            }
398 //        }
399 //        chain.removeIf(x -> x.isEmpty());
400 //        if ("zh".equals(chain.get(0))) {
401 //            chain.add(1,"zhx");
402 //        }
403 //        last = chain.get(chain.size()-1);
404 //        if (!"mul".equals(last)) {
405 //            chain.add("mul"); // make sure we have root.
406 //        }
407 //        if (chain.size() == 2) {
408 //            chain.add(1,"und");
409 //        }
410 //        return chain;
411     }
412 
    /**
     * Prints a diagnostic message to standard output.
     *
     * @param string the message
     */
    private static void log(String string) {
        System.out.println(string);
    }
420 
getBest(Collection<String> parents)421     private static String getBest(Collection<String> parents) {
422         for (String parent : parents) {
423             String code = entityToCode.get(parent);
424             if (code == null) continue;
425             Type type = Iso639Data.getType(code);
426             if (type != Type.Living) {
427                 continue;
428             }
429             return parent;
430         }
431         // failed
432         return parents.iterator().next();
433     }
434 
getEntityName(String key)435     private static String getEntityName(String key) {
436         String code = entityToCode.get(key);
437         if (code != null) {
438             try {
439                 String name = NAME.apply(code);
440                 if (name != null) {
441                     return name;
442                 }
443             } catch (Exception e) {
444             }
445         }
446         String name = entityToLabel.get(key);
447         if (name != null) {
448             return name;
449         }
450         int last = key.lastIndexOf('/');
451         return key.substring(last + 1, key.length() - 1);
452     }
453 
loadTsvPairs(Class<?> class1, String file, Function<String, String> keyMapper, Function<String, String> valueMapper)454     private static Multimap<String, String> loadTsvPairs(Class<?> class1, String file,
455         Function<String, String> keyMapper, Function<String, String> valueMapper) {
456         String rel = FileUtilities.getRelativeFileName(class1, file);
457         System.out.println(rel);
458         ImmutableMultimap.Builder<String, String> _keyToValues = ImmutableMultimap.builder();
459         for (String line : FileUtilities.in(class1, file)) {
460             if (line.startsWith("?") || line.isEmpty()) continue;
461             List<String> parts = TAB.splitToList(line);
462             String key = parts.get(0);
463             String value = parts.get(1);
464             _keyToValues.put(key, value);
465         }
466         ImmutableMultimap<String, String> result = _keyToValues.build();
467         showDups(file, result, keyMapper, valueMapper);
468         return result;
469     }
470 
loadTsvPairsUnique(Class<?> class1, String file, Function<String, String> fixValue, Function<String, String> keyMapper, Function<String, String> valueMapper)471     private static Map<String, String> loadTsvPairsUnique(Class<?> class1, String file,
472         Function<String, String> fixValue,
473         Function<String, String> keyMapper, Function<String, String> valueMapper) {
474         String rel = FileUtilities.getRelativeFileName(class1, file);
475         System.out.println(rel);
476         Map<String, String> _keyToValue = new TreeMap<>();
477         Multimap<String, String> _keyToValues = TreeMultimap.create();
478         for (String line : FileUtilities.in(class1, file)) {
479             if (line.startsWith("?") || line.isEmpty()) continue;
480             List<String> parts = TAB.splitToList(line);
481             String key = parts.get(0);
482             String value = parts.get(1);
483             if (fixValue != null) {
484                 value = fixValue.apply(value);
485             }
486             _keyToValues.put(key, value);
487             String oldValue = _keyToValue.get(key);
488             if (oldValue == null || oldValue.equals("kxm")) {
489                 _keyToValue.put(key, value);
490             }
491         }
492         _keyToValue = ImmutableMap.copyOf(_keyToValue);
493         showDups(file, _keyToValues, keyMapper, valueMapper);
494         return _keyToValue;
495     }
496 
showDups(String file, Multimap<String, String> _keyToValues, Function<String, String> keyMapper, Function<String, String> valueMapper)497     private static void showDups(String file, Multimap<String, String> _keyToValues,
498         Function<String, String> keyMapper, Function<String, String> valueMapper) {
499         for (Entry<String, Collection<String>> entry : _keyToValues.asMap().entrySet()) {
500             Collection<String> valueSet = entry.getValue();
501             if (valueSet.size() > 1) {
502                 String key = entry.getKey();
503                 key = keyMapper == null ? key : keyMapper.apply(key);
504                 if (valueMapper != null) {
505                     Set<String> result = new LinkedHashSet<>();
506                     valueSet.stream().map(valueMapper).forEach(x -> result.add(x));
507                     valueSet = result;
508                 }
509                 log(file + "\tMultiple values: " + key + "\t" + valueSet);
510             }
511         }
512     }
513 
    /**
     * Returns the result of {@code Containment.getAllDirected} for {@code lang} over the static
     * {@code childToParent} multimap.
     * NOTE(review): despite the parameter name, the only caller in this file passes an entity
     * (a key of {@code childToParent}), not a language code.
     *
     * @param lang the starting key
     * @return the paths returned by {@code Containment.getAllDirected}
     */
    static Set<List<String>> getAllAncestors(String lang) {
        return Containment.getAllDirected(childToParent, lang);
    }
517 }
518