• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 package org.unicode.cldr.util;
2 
3 import com.google.common.base.Joiner;
4 import com.google.common.base.MoreObjects;
5 import com.google.common.base.Objects;
6 import com.google.common.collect.ComparisonChain;
7 import com.google.common.collect.ImmutableMap;
8 import com.google.common.collect.ImmutableMultimap;
9 import com.google.common.collect.ImmutableSet;
10 import com.google.common.collect.Multimap;
11 import com.google.common.collect.TreeMultimap;
12 import com.ibm.icu.impl.Row.R2;
13 import java.util.ArrayList;
14 import java.util.Collection;
15 import java.util.EnumSet;
16 import java.util.HashSet;
17 import java.util.LinkedHashSet;
18 import java.util.List;
19 import java.util.Map;
20 import java.util.Map.Entry;
21 import java.util.Set;
22 import java.util.TreeMap;
23 import java.util.TreeSet;
24 import org.unicode.cldr.util.StandardCodes.LstrType;
25 
26 /**
27  * Provides Unicode Language Identifier canonicalization for use in testing. The implementation is
28  * designed to be simple, and is not at all optimized for production use. It is used to verify the
29  * correctness of the specification algorithm, sanity-check the supplementalMetadata.xml alias data,
30  * and generate test files for use by implementations.
31  */
32 public class LsrvCanonicalizer {
33 
    /** The subtag kinds this class works with: language, script, region, and variant. */
    public static final Set<LstrType> LSRV =
            ImmutableSet.of(LstrType.language, LstrType.script, LstrType.region, LstrType.variant);

    /** Joins subtags with an underscore ('_'). */
    public static final Joiner UNDERBAR_JOINER = Joiner.on('_');
37 
38     /**
39      * A representation of a Unicode Language Identifier in a format that makes it simple to
40      * process. The LSRV fields are represented as multimaps, though the LSR fields restricted to
41      * have only have 0 or 1 element.
42      */
43     public static class XLanguageTag {
44         final Multimap<LstrType, String> data;
45 
XLanguageTag(Multimap<LstrType, String> result)46         private XLanguageTag(Multimap<LstrType, String> result) {
47             data = ImmutableMultimap.copyOf(result);
48         }
49 
keys()50         public Set<LstrType> keys() {
51             return data.keySet();
52         }
53 
get(LstrType lstrType)54         public Collection<String> get(LstrType lstrType) {
55             return data.get(lstrType);
56         }
57 
toLocaleString()58         public String toLocaleString() {
59             StringBuilder buffer = new StringBuilder();
60             final Collection<String> region = data.get(LstrType.language);
61             if (!region.isEmpty()) {
62                 buffer.append(UNDERBAR_JOINER.join(region));
63             } else {
64                 buffer.append("und");
65             }
66             addItem(buffer, LstrType.script, "", "_", UNDERBAR_JOINER);
67             addItem(buffer, LstrType.region, "", "_", UNDERBAR_JOINER);
68             addItem(buffer, LstrType.variant, "", "_", UNDERBAR_JOINER);
69 
70             return buffer.toString();
71         }
72 
73         @Override
toString()74         public String toString() {
75             StringBuilder buffer = new StringBuilder();
76             addItem(buffer, LstrType.language, "", "L:", UNDERBAR_JOINER);
77             addItem(buffer, LstrType.script, ";", "S:", UNDERBAR_JOINER);
78             addItem(buffer, LstrType.region, ";", "R:", UNDERBAR_JOINER);
79             addItem(buffer, LstrType.variant, ";", "V:", UNDERBAR_JOINER);
80             return buffer.toString();
81         }
82 
addItem( StringBuilder buffer, LstrType lstrType, String separator, String prefix, final Joiner dashJoiner)83         public void addItem(
84                 StringBuilder buffer,
85                 LstrType lstrType,
86                 String separator,
87                 String prefix,
88                 final Joiner dashJoiner) {
89             final Collection<String> region = data.get(lstrType);
90             if (!region.isEmpty()) {
91                 if (buffer.length() > 0) {
92                     buffer.append(separator);
93                 }
94                 buffer.append(prefix).append(dashJoiner.join(region));
95             }
96         }
97 
fromTag(LstrType lstrType, String tag)98         public static XLanguageTag fromTag(LstrType lstrType, String tag) {
99             Multimap<LstrType, String> result = TreeMultimap.create();
100             LanguageTagParser source = new LanguageTagParser();
101             final boolean isLanguage = lstrType == LstrType.language;
102             String prefix = isLanguage ? "" : "und_";
103             try {
104                 source.set(prefix + tag);
105             } catch (Exception e) {
106                 return null; // skip ill-formed for now
107                 //                if (lstrType == LstrType.region && tag.length() == 3) {
108                 //                    //result.put(LstrType.language, "und");
109                 //                    result.put(LstrType.region, tag);
110                 //                } else {
111                 //                    result.put(LstrType.language, tag);
112                 //                }
113                 //                //System.out.println("ILLEGAL SOURCE\t" + lstrType + ":\t" + tag +
114                 // " ⇒ " + result); // for debugging
115                 //                return new XLanguageTag(result);
116             }
117             if (!source.getLanguage().isEmpty() && !source.getLanguage().contains("und")) {
118                 result.put(LstrType.language, source.getLanguage());
119             }
120             if (!source.getScript().isEmpty()) {
121                 result.put(LstrType.script, source.getScript());
122             }
123             if (!source.getRegion().isEmpty()) {
124                 result.put(LstrType.region, source.getRegion());
125             }
126             if (!source.getVariants().isEmpty()) {
127                 result.putAll(LstrType.variant, source.getVariants());
128             }
129             return new XLanguageTag(result);
130         }
131 
132         @Override
equals(Object obj)133         public boolean equals(Object obj) {
134             return data.equals(((XLanguageTag) obj).data);
135         }
136 
137         @Override
hashCode()138         public int hashCode() {
139             return data.hashCode();
140         }
141 
set(LstrType lstrType, String string)142         public XLanguageTag set(LstrType lstrType, String string) {
143             Multimap<LstrType, String> result = TreeMultimap.create(data);
144             if (lstrType != LstrType.variant) {
145                 result.removeAll(lstrType);
146             }
147             result.put(lstrType, string);
148             return new XLanguageTag(result);
149         }
150 
151         /**
152          * containsAll is used in matching a ReplacementRule.<br>
153          * It is here instead of on ReplacementRule so we can use in the denormalization utility
154          * used in testing.
155          */
containsAll(XLanguageTag type)156         public boolean containsAll(XLanguageTag type) {
157             for (LstrType lstrType : LSRV) {
158                 final Collection<String> sources = get(lstrType);
159                 final Collection<String> types = type.get(lstrType);
160                 if (!sources.containsAll(types)) {
161                     return false;
162                 }
163             }
164             return true;
165         }
166 
167         /**
168          * Once a rule matches, this actually does the replacement.<br>
169          * It is here instead of on ReplacementRule so we can use it in the denormalization utility
170          * used in testing.
171          */
replacePartsFrom( XLanguageTag typeParts, XLanguageTag replacementParts)172         public XLanguageTag replacePartsFrom(
173                 XLanguageTag typeParts, XLanguageTag replacementParts) {
174             Multimap<LstrType, String> result = TreeMultimap.create();
175             for (LstrType lstrType : LSRV) {
176                 Collection<String> sources = get(lstrType);
177                 Collection<String> types = typeParts.get(lstrType);
178                 Collection<String> replacements = replacementParts.get(lstrType);
179                 result.putAll(lstrType, sources);
180                 if (!types.isEmpty() && !replacements.isEmpty()) {
181                     removeAll(result, lstrType, types);
182                     result.putAll(lstrType, replacements);
183                 } else if (!types.isEmpty() && replacements.isEmpty()) {
184                     removeAll(result, lstrType, types);
185                 } else if (types.isEmpty() && !replacements.isEmpty()) {
186                     if (sources.isEmpty()) {
187                         result.putAll(lstrType, replacements);
188                     }
189                 } else {
190                     // otherwise both empty, skip
191                 }
192             }
193             return new XLanguageTag(result);
194         }
195     }
196 
197     /**
198      * A representation of the alias data for Unicode Language Identifiers in the
199      * supplementalMetadata.txt file.
200      */
201     public static class ReplacementRule implements Comparable<ReplacementRule> {
202         private final XLanguageTag typeParts;
203         final XLanguageTag replacementParts;
204         final List<XLanguageTag>
205                 secondaryReplacementSet; // TODO, using this information in special cases to impute
206         // the best language according to LDML
207         final String reason;
208         final boolean regular;
209 
ReplacementRule( LstrType lstrType, String type, XLanguageTag typeParts, XLanguageTag replacementParts, List<XLanguageTag> secondaryReplacementSet, String reason)210         private ReplacementRule(
211                 LstrType lstrType,
212                 String type,
213                 XLanguageTag typeParts,
214                 XLanguageTag replacementParts,
215                 List<XLanguageTag> secondaryReplacementSet,
216                 String reason) {
217             this.typeParts = typeParts;
218             this.replacementParts = replacementParts;
219             this.secondaryReplacementSet = secondaryReplacementSet;
220             this.reason = reason;
221             this.regular =
222                     typeParts.keys().equals(replacementParts.keys())
223                             && typeParts.get(LstrType.variant).size()
224                                     == replacementParts.get(LstrType.variant).size();
225         }
226 
from( LstrType lstrType, String type, List<String> replacement, String reason)227         static ReplacementRule from(
228                 LstrType lstrType, String type, List<String> replacement, String reason) {
229             XLanguageTag typeParts = XLanguageTag.fromTag(lstrType, type);
230             if (typeParts == null) {
231                 return null; // skip ill-formed for now
232             }
233             XLanguageTag replacementParts = XLanguageTag.fromTag(lstrType, replacement.get(0));
234             if (replacementParts == null) {
235                 return null; // skip ill-formed for now
236             }
237             List<XLanguageTag> secondaryReplacementSet = new ArrayList<>();
238             for (int i = 1; i < replacement.size(); ++i) {
239                 secondaryReplacementSet.add(XLanguageTag.fromTag(lstrType, replacement.get(i)));
240             }
241             return new ReplacementRule(
242                     lstrType, type, typeParts, replacementParts, secondaryReplacementSet, reason);
243         }
244 
245         @Override
compareTo(ReplacementRule o)246         public int compareTo(ReplacementRule o) {
247             return ComparisonChain.start()
248                     .compare(
249                             -getType().keys().size(),
250                             -o.getType().keys().size()) // sort most keys first
251                     .compare(getType().toString(), o.getType().toString())
252                     .result();
253         }
254 
255         @Override
equals(Object obj)256         public boolean equals(Object obj) {
257             return compareTo((ReplacementRule) obj) == 0;
258         }
259 
260         @Override
hashCode()261         public int hashCode() {
262             return Objects.hashCode(getType());
263         }
264 
265         @Override
toString()266         public String toString() {
267             return MoreObjects.toStringHelper(getClass())
268                     .add("type", getType())
269                     .add("replacement", replacementParts)
270                     .toString();
271         }
272 
getType()273         public XLanguageTag getType() {
274             return typeParts;
275         }
276 
getReplacement()277         public XLanguageTag getReplacement() {
278             return replacementParts;
279         }
280     }
281 
282     /** Utility to remove multiple items from Multimap */
removeAll(Multimap<K, V> result, K key, Iterable<V> value)283     public static <K, V> Multimap<K, V> removeAll(Multimap<K, V> result, K key, Iterable<V> value) {
284         for (V type : value) {
285             result.remove(key, type);
286         }
287         return result;
288     }
289 
    /** The replacement rules; load() swaps this for an immutable copy once loading is done. */
    private Set<ReplacementRule> rules = new TreeSet<>();
    /**
     * Per field, the subtags that occur in the type or replacement of any rule; filled in and
     * then made immutable by load().
     */
    private Multimap<LstrType, String> inType = TreeMultimap.create();
    /** Per field, one sample subtag chosen by load(); made immutable there. */
    private Map<LstrType, String> irrelevant = new TreeMap<>();
293 
add(ReplacementRule replacementRule)294     private void add(ReplacementRule replacementRule) {
295         getRules().add(replacementRule);
296     }
297 
298     /**
299      * Canonicalize a Unicode Language Identifier (LSRV - language, script, region, variants)
300      *
301      * @param lstrType This is a special flag used to indicate which supplementalMetadata alias type
302      *     the languageTag is from. That determines whether to extend the type and replacement to be
303      *     full LSRVs if they are partial, by adding "und_", for example.
304      * @param languageTag May be partial, if the lstrType is not LstrType.language.
305      */
canonicalize(LstrType lstrType, String languageTag)306     public String canonicalize(LstrType lstrType, String languageTag) {
307         XLanguageTag newTag = canonicalizeToX(XLanguageTag.fromTag(lstrType, languageTag), null);
308         return newTag.toString();
309     }
310 
311     /**
312      * Canonicalize a Unicode Language Identifier (LSRV - language, script, region, variants) in the
313      * XLanguageTag format. Also returns the rules used in the canonicalization.<br>
314      * NOT OPTIMIZED: just uses a linear search for simplicity; production code would use more
315      * efficient mechanisms
316      */
canonicalizeToX(XLanguageTag fromTag, List<ReplacementRule> rulesUsed)317     public XLanguageTag canonicalizeToX(XLanguageTag fromTag, List<ReplacementRule> rulesUsed) {
318         if (rulesUsed != null) {
319             rulesUsed.clear();
320         }
321         XLanguageTag newTag = fromTag;
322         startAtTheTop:
323         while (true) {
324             for (ReplacementRule rule : getRules()) {
325                 if (newTag.containsAll(rule.getType())) {
326                     XLanguageTag temp =
327                             newTag.replacePartsFrom(rule.getType(), rule.getReplacement());
328                     if (!temp.equals(newTag)) {
329                         newTag = temp;
330                         if (rulesUsed != null) {
331                             rulesUsed.add(rule);
332                         }
333                         continue startAtTheTop;
334                     }
335                 }
336             }
337             return newTag;
338         }
339     }
340 
341     /**
342      * Decanonicalize a Unicode Language Identifier (LSRV - language, script, region, variants) in
343      * the XLanguageTag format. Also returns the rules used in the canonicalization. Used in test
344      * case generation NOT OPTIMIZED: just for testing
345      */
decanonicalizeToX(XLanguageTag fromTag)346     public Set<XLanguageTag> decanonicalizeToX(XLanguageTag fromTag) {
347         Set<XLanguageTag> result = new HashSet<>();
348         result.add(fromTag);
349         Set<XLanguageTag> intermediate = new HashSet<>();
350         while (true) {
351             for (ReplacementRule rule : getRules()) {
352                 if (!rule.getType().get(LstrType.variant).isEmpty()) {
353                     continue;
354                 }
355                 for (XLanguageTag newTag : result) {
356                     if (newTag.containsAll(rule.getReplacement())) { // reverse normal order
357                         XLanguageTag changed =
358                                 newTag.replacePartsFrom(
359                                         rule.getReplacement(),
360                                         rule.getType()); // reverse normal order
361                         if (!intermediate.contains(changed) && !result.contains(changed)) {
362                             intermediate.add(changed);
363                         }
364                     }
365                 }
366             }
367             if (intermediate.isEmpty()) {
368                 result.remove(fromTag);
369                 return result;
370             }
371             result.addAll(intermediate);
372             intermediate.clear();
373         }
374     }
375 
376     /** Utility for getting a filtered list of rules, mostly useful in debugging. */
filter(LstrType lstrType, String value)377     public List<ReplacementRule> filter(LstrType lstrType, String value) {
378         List<ReplacementRule> result = new ArrayList<>();
379         for (ReplacementRule rule : getRules()) {
380             final Collection<String> items = rule.getType().get(lstrType);
381             if (value == null && !items.isEmpty() || value != null && items.contains(value)) {
382                 result.add(rule);
383             }
384         }
385         return result;
386     }
387 
    /** Returns the shared canonicalizer held in SINGLETON. */
    public static final LsrvCanonicalizer getInstance() {
        return SINGLETON;
    }

    /** The shared instance, produced by load() in this static initializer. */
    private static final LsrvCanonicalizer SINGLETON = load();
393 
load()394     private static LsrvCanonicalizer load() {
395         SupplementalDataInfo SDI = CLDRConfig.getInstance().getSupplementalDataInfo();
396         Map<String, Map<String, R2<List<String>, String>>> aliases = SDI.getLocaleAliasInfo();
397         // type -> tag -> , like "language" -> "sh" -> <{"sr_Latn"}, reason>
398 
399         LsrvCanonicalizer rrs = new LsrvCanonicalizer();
400         for (Entry<String, Map<String, R2<List<String>, String>>> typeTagReplacement :
401                 aliases.entrySet()) {
402             String type = typeTagReplacement.getKey();
403             if (type.contains("-")) {
404                 throw new IllegalArgumentException(
405                         "Bad format for alias: should have _ instead of -.");
406             }
407             LstrType lstrType = LstrType.fromString(type);
408             if (!LSRV.contains(lstrType)) {
409                 continue;
410             }
411             for (Entry<String, R2<List<String>, String>> tagReplacementReason :
412                     typeTagReplacement.getValue().entrySet()) {
413                 String tag = tagReplacementReason.getKey();
414                 if (tag.contains("-")) {
415                     throw new IllegalArgumentException(
416                             "Bad format for alias: should have _ instead of -.");
417                 }
418                 List<String> replacement = tagReplacementReason.getValue().get0();
419                 if (replacement == null) {
420                     System.out.println("No replacement: " + tagReplacementReason);
421                     continue;
422                 }
423                 String reason = tagReplacementReason.getValue().get1();
424                 final ReplacementRule replacementRule =
425                         ReplacementRule.from(lstrType, tag, replacement, reason);
426                 if (replacementRule == null) {
427                     // System.out.println("No rule: " + tagReplacementReason);
428                     continue;
429                 }
430                 rrs.add(replacementRule);
431             }
432         }
433         rrs.rules = ImmutableSet.copyOf(rrs.rules);
434         for (ReplacementRule rule : rrs.rules) {
435             XLanguageTag type = rule.getType();
436             XLanguageTag replacement = rule.getReplacement();
437             for (LstrType lstrType : LsrvCanonicalizer.LSRV) {
438                 rrs.inType.putAll(lstrType, type.get(lstrType));
439                 rrs.inType.putAll(lstrType, replacement.get(lstrType));
440             }
441         }
442         rrs.inType = ImmutableMultimap.copyOf(rrs.inType);
443 
444         for (LstrType lstrType : LsrvCanonicalizer.LSRV) {
445             Set<String> all =
446                     new LinkedHashSet<>(
447                             Validity.getInstance()
448                                     .getStatusToCodes(lstrType)
449                                     .get(Validity.Status.regular));
450             all.removeAll(rrs.inType.get(lstrType));
451             if (lstrType == LstrType.variant && all.contains("fonipa")) {
452                 rrs.irrelevant.put(lstrType, "fonipa");
453             } else {
454                 rrs.irrelevant.put(lstrType, all.iterator().next());
455             }
456         }
457         rrs.irrelevant = ImmutableMap.copyOf(rrs.irrelevant);
458         return rrs;
459     }
460 
461     /** Returns the set of all the Replacement rules in the canonicalizer. */
getRules()462     public Set<ReplacementRule> getRules() {
463         return rules;
464     }
465 
    /** Types of test data that getTestData can produce. */
    public enum TestDataTypes {
        /** The hard-coded source/expected pairs listed inside getTestData. */
        explicit,
        /** One case per rule: the rule's type mapped to its replacement. */
        fromAliases,
        /** Cases obtained by running decanonicalizeToX over the items gathered so far. */
        decanonicalized,
        /** Items gathered so far with the "irrelevant" sample subtags added by fluff. */
        withIrrelevants
    }
473 
474     /**
475      * Returns test data for the rules, used to generate test data files.
476      *
477      * @param testDataTypes if null, returns all the data; otherwise the specified set.
478      * @return
479      */
getTestData(Set<TestDataTypes> testDataTypes)480     public Map<TestDataTypes, Map<String, String>> getTestData(Set<TestDataTypes> testDataTypes) {
481         Map<TestDataTypes, Map<String, String>> result = new TreeMap<>();
482 
483         if (testDataTypes == null) {
484             testDataTypes = EnumSet.allOf(TestDataTypes.class);
485         }
486         Set<String> allToTest = new TreeSet<>();
487         if (testDataTypes.contains(TestDataTypes.explicit)) {
488             Map<String, String> testData2 = new TreeMap<>();
489             String[][] tests = {
490                 {"hye_arevmda", "hyw"},
491                 {"art_lojban", "jbo"},
492                 {"en_arevela", "en"},
493                 {"hy_arevela", "hy"},
494                 {"en_arevmda_arevela", "en"},
495                 {"hy_arevmda", "hyw"},
496                 {"hy_arevmda_arevela", "hyw"},
497                 {"en_lojban", "en"},
498                 {"en_US_polytoni", "en_US_polyton"},
499                 {"en_US_heploc", "en_US_alalc97"},
500                 {"en_US_aaland", "en_US"},
501                 {"en_aaland", "en_AX"},
502                 {"no_nynorsk_bokmal", "nb"},
503                 {"no_bokmal_nynorsk", "nb"},
504                 {"zh_guoyu_hakka_xiang", "hak"},
505                 {"zh_hakka_xiang", "hak"},
506             };
507             for (String row[] : tests) {
508                 String toTest = row[0];
509                 String expected = row[1];
510                 testData2.put(toTest, expected);
511             }
512             allToTest.addAll(testData2.keySet());
513             result.put(TestDataTypes.explicit, ImmutableMap.copyOf(testData2));
514         }
515 
516         if (testDataTypes.contains(TestDataTypes.fromAliases)) {
517             Map<String, String> testData2 = new TreeMap<>();
518             for (ReplacementRule rule : getRules()) {
519                 String toTest = rule.getType().toLocaleString();
520                 String expected = rule.getReplacement().toLocaleString();
521                 if (!allToTest.contains(toTest)) {
522                     testData2.put(toTest, expected);
523                 }
524             }
525             allToTest.addAll(testData2.keySet());
526             result.put(TestDataTypes.fromAliases, ImmutableMap.copyOf(testData2));
527         }
528 
529         if (testDataTypes.contains(TestDataTypes.decanonicalized)) {
530             Map<String, String> testData2 = new TreeMap<>();
531             for (String testItem : allToTest) {
532                 for (XLanguageTag decon :
533                         decanonicalizeToX(XLanguageTag.fromTag(LstrType.language, testItem))) {
534                     XLanguageTag newTag = canonicalizeToX(decon, null);
535                     final String toTest = decon.toLocaleString();
536                     if (!allToTest.contains(toTest)) {
537                         testData2.put(toTest, newTag.toLocaleString());
538                     }
539                 }
540             }
541             allToTest.addAll(testData2.keySet());
542             result.put(TestDataTypes.decanonicalized, ImmutableMap.copyOf(testData2));
543         }
544 
545         if (testDataTypes.contains(TestDataTypes.withIrrelevants)) {
546             Map<String, String> testData2 = new TreeMap<>();
547             for (String testItem : allToTest) {
548                 XLanguageTag fluffedUp =
549                         fluff(XLanguageTag.fromTag(LstrType.language, testItem), irrelevant);
550                 XLanguageTag newTag = canonicalizeToX(fluffedUp, null);
551                 final String toTest = fluffedUp.toLocaleString();
552                 if (!allToTest.contains(toTest)) {
553                     testData2.put(toTest, newTag.toLocaleString());
554                 }
555             }
556             allToTest.addAll(testData2.keySet());
557             result.put(TestDataTypes.withIrrelevants, ImmutableMap.copyOf(testData2));
558         }
559 
560         result = ImmutableMap.copyOf(result);
561         return result;
562     }
563 
fluff(XLanguageTag type, Map<LstrType, String> toAddIfMissing)564     private static XLanguageTag fluff(XLanguageTag type, Map<LstrType, String> toAddIfMissing) {
565         XLanguageTag newTag = type;
566         for (LstrType lstrType : LsrvCanonicalizer.LSRV) {
567             if (type.get(lstrType).isEmpty() || lstrType == LstrType.variant) {
568                 newTag = newTag.set(lstrType, toAddIfMissing.get(lstrType));
569             }
570         }
571         return newTag;
572     }
573 
574     /** Returns all the fields used in the type attribute of the alias rule. */
getInType(LstrType language)575     public Collection<String> getInType(LstrType language) {
576         return inType.get(language);
577     }
578 
579     /**
580      * Returns some sample fields that do not appear in the type attribute of the alias rule, used
581      * for testing.
582      */
getIrrelevantField(LstrType language)583     public String getIrrelevantField(LstrType language) {
584         return irrelevant.get(language);
585     }
586 }
587