• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 package org.unicode.cldr.test;
2 
3 import java.util.Collection;
4 import java.util.Collections;
5 import java.util.Map;
6 import java.util.Map.Entry;
7 import java.util.NavigableSet;
8 import java.util.Set;
9 import java.util.TreeMap;
10 
11 import org.unicode.cldr.util.CLDRConfig;
12 import org.unicode.cldr.util.CLDRFile;
13 import org.unicode.cldr.util.GrammarInfo;
14 import org.unicode.cldr.util.GrammarInfo.GrammaticalFeature;
15 import org.unicode.cldr.util.GrammarInfo.GrammaticalScope;
16 import org.unicode.cldr.util.GrammarInfo.GrammaticalTarget;
17 import org.unicode.cldr.util.ICUServiceBuilder;
18 import org.unicode.cldr.util.Pair;
19 import org.unicode.cldr.util.SupplementalDataInfo;
20 import org.unicode.cldr.util.SupplementalDataInfo.PluralInfo.Count;
21 import org.unicode.cldr.util.SupplementalDataInfo.PluralType;
22 import org.unicode.cldr.util.UnitConverter.UnitSystem;
23 import org.unicode.cldr.util.UnitPathType;
24 
25 import com.google.common.base.Joiner;
26 import com.google.common.collect.ImmutableMultimap;
27 import com.google.common.collect.ImmutableSet;
28 import com.google.common.collect.Maps;
29 import com.google.common.collect.Multimap;
30 import com.google.common.collect.Multimaps;
31 import com.google.common.collect.TreeMultimap;
32 import com.ibm.icu.impl.locale.XCldrStub.ImmutableMap;
33 import com.ibm.icu.impl.number.DecimalQuantity;
34 import com.ibm.icu.text.DecimalFormat;
35 import com.ibm.icu.text.PluralRules;
36 import com.ibm.icu.text.PluralRules.DecimalQuantitySamples;
37 import com.ibm.icu.text.PluralRules.DecimalQuantitySamplesRange;
38 import com.ibm.icu.text.PluralRules.Operand;
39 import com.ibm.icu.text.PluralRules.SampleType;
40 import com.ibm.icu.util.Output;
41 
42 /**
43  * Return the best samples for illustrating minimal pairs
44  * @author markdavis
45  *
46  */
47 public class BestMinimalPairSamples {
    /** Key used in {@link #getDistinctNominativeCaseToUnit()} for units whose every case form matches the nominative. */
    public static final String EQUALS_NOMINATIVE = "=nominative";
    /** Joins the set of cases that differ from the nominative into a single multimap key, e.g. "accusative+dative". */
    private static final Joiner PLUS_JOINER = Joiner.on("+");
    // NOTE(review): CONFIG is not referenced in this class; kept in case its initialization is relied upon elsewhere.
    private static final CLDRConfig CONFIG = CLDRConfig.getInstance();
    private static final SupplementalDataInfo supplementalDataInfo = SupplementalDataInfo.getInstance();

    /** Locale data file that unit genders and unit patterns are read from. */
    final private CLDRFile cldrFile;
    /** Grammar info for the file's locale; the getBestUnitWith* methods return null when this is null. */
    final private GrammarInfo grammarInfo;
    /** Cardinal plural rules for the file's locale. */
    final private PluralRules pluralInfo;
    /** Ordinal plural rules for the file's locale. */
    final private PluralRules ordinalInfo;
    /** Optional number-format source; when null, samples are rendered with toString(). */
    final private ICUServiceBuilder icuServiceBuilder;
    private CaseAndGenderSamples caseAndGenderSamples = null; // lazy evaluated
    /** Statistics: gender value to short unit IDs having that gender (filled by loadCaches when gatherStats is set). */
    private Multimap<String, String> genderToUnits;
    /** Statistics: number of distinct case/count patterns to short unit IDs (filled by loadCaches when gatherStats is set). */
    private Multimap<Integer, String> uniqueCaseAndCountToUnits;
    /** Joined list of cases differing from nominative (or EQUALS_NOMINATIVE) to short unit IDs; null until loadCaches runs. */
    private Multimap<String, String> distinctNominativeCaseToUnit;
    /** Whether loadCaches should also fill the statistics multimaps. */
    private final boolean gatherStats;
63 
BestMinimalPairSamples(CLDRFile cldrFile, ICUServiceBuilder icuServiceBuilder, boolean gatherStats)64     public BestMinimalPairSamples(CLDRFile cldrFile, ICUServiceBuilder icuServiceBuilder, boolean gatherStats) {
65         this.cldrFile = cldrFile;
66         grammarInfo = supplementalDataInfo.getGrammarInfo(cldrFile.getLocaleID());
67         pluralInfo = supplementalDataInfo.getPlurals(PluralType.cardinal, cldrFile.getLocaleID()).getPluralRules();
68         ordinalInfo = supplementalDataInfo.getPlurals(PluralType.ordinal, cldrFile.getLocaleID()).getPluralRules();
69         this.icuServiceBuilder = icuServiceBuilder;
70         genderToUnits = TreeMultimap.create();
71         uniqueCaseAndCountToUnits = TreeMultimap.create();
72         this.gatherStats = gatherStats;
73     }
74 
75 
76     static final class CaseAndGenderSamples {
77         private final Map<String, Pair<String, String>> genderCache;
78         private final Map<String, String> caseCache;
79         private final String caseUnitId;
80 
CaseAndGenderSamples(Map<String, String> caseCache2, String bestCaseUnitId, Map<String, Pair<String, String>> genderCache2)81         public CaseAndGenderSamples(Map<String, String> caseCache2, String bestCaseUnitId,  Map<String, Pair<String, String>> genderCache2) {
82             genderCache  = genderCache2;
83             caseCache = caseCache2;
84             caseUnitId = bestCaseUnitId;
85         }
86 
getGender(String gender, Output<String> shortUnitId)87         public String getGender(String gender, Output<String> shortUnitId) {
88             Pair<String, String> result = genderCache.get(gender);
89             if (result == null) {
90                 return null;
91             }
92             shortUnitId.value = result.getFirst();
93             return result.getSecond();
94         }
95 
getCase(String unitCase, Output<String> shortUnitId)96         public String getCase(String unitCase, Output<String> shortUnitId) {
97             shortUnitId.value = caseUnitId;
98             return caseCache.get(unitCase);
99         }
100     }
101 
102     /**
103      * Returns a "good" value for a unit. Favors metric units, and simple units
104      * @param shortUnitId
105      */
getBestUnitWithGender(String gender, Output<String> shortUnitId)106     public synchronized String getBestUnitWithGender(String gender, Output<String> shortUnitId) {
107         if (gender == null || grammarInfo == null) {
108             return null;
109         }
110         if (caseAndGenderSamples == null) {
111             caseAndGenderSamples = loadCaches();
112         }
113         return caseAndGenderSamples.getGender(gender, shortUnitId);
114     }
115 
116     /**
117      * Returns a "good" value for a unit. Favors metric units, and simple units
118      * @param shortUnitId
119      */
getBestUnitWithCase(String unitCase, Output<String> shortUnitId)120     public synchronized String getBestUnitWithCase(String unitCase, Output<String> shortUnitId) {
121         if (unitCase == null || grammarInfo == null) {
122             return null;
123         }
124         if (caseAndGenderSamples == null) {
125             caseAndGenderSamples = loadCaches();
126         }
127         return caseAndGenderSamples.getCase(unitCase, shortUnitId);
128     }
129 
    /**
     * Long unit IDs that loadCaches leaves out of the uniqueCaseAndCountToUnits statistics.
     * They are still considered when choosing the best case sample unit.
     */
    static final Set<String> SKIP_CASE = ImmutableSet.of(
        "concentr-ofglucose",
        "concentr-portion",
        "length-100-kilometer",
        "pressure-ofhg");
135 
loadCaches()136     public CaseAndGenderSamples loadCaches() {
137         Collection<String> unitCases = grammarInfo.get(GrammaticalTarget.nominal, GrammaticalFeature.grammaticalCase, GrammaticalScope.units);
138         Map<String,String> genderResults = Maps.newHashMap();
139         Multimap<String, Pair<String,String>> unitPatternToCaseAndCounts = TreeMultimap.create();
140         distinctNominativeCaseToUnit = TreeMultimap.create();
141 
142         int bestCaseFormCount = 0;
143         String bestCaseUnitId = null;
144         Multimap<String, Pair<String,String>> bestUnitPatternToCases = null;
145         Multimap<String, String> unitToDistinctNominativeCase = TreeMultimap.create();
146 
147         for (String longUnitId : GrammarInfo.getUnitsToAddGrammar()) {
148             String possibleGender = cldrFile.getStringValue("//ldml/units/unitLength[@type=\"long\"]/unit[@type=\"" + longUnitId + "\"]/gender");
149             String shortUnitId = ExampleGenerator.UNIT_CONVERTER.getShortId(longUnitId);
150             if (shortUnitId.equals("hour") && cldrFile.getLocaleID().equals("ta")) {
151                 int debug = 0;
152             }
153             if (possibleGender != null) {
154                 if (gatherStats) {
155                     genderToUnits.put(possibleGender, shortUnitId);
156                 }
157                 String formerLongUnitId = genderResults.get(possibleGender);
158                 if (formerLongUnitId == null || isBetterUnit(longUnitId, formerLongUnitId)) {
159                     genderResults.put(possibleGender, longUnitId);
160                 }
161             }
162             if (!unitCases.isEmpty()) {
163                 unitPatternToCaseAndCounts.clear();
164                 for (String count : pluralInfo.getKeywords()) {
165                     for (String unitCase : unitCases) {
166                         String grammarAttributes = GrammarInfo.getGrammaticalInfoAttributes(grammarInfo, UnitPathType.unit, count, null, unitCase);
167                         String unitPattern = cldrFile.getStringValue("//ldml/units/unitLength[@type=\"long\"]/unit[@type=\"" + longUnitId + "\"]/unitPattern" + grammarAttributes);
168                         if (unitPattern == null) {
169                             continue;
170                         }
171                         unitPattern = unitPattern.replace("\u00A0", "").trim();
172                         final Pair<String, String> caseAndCount = Pair.of(unitCase, count);
173                         unitPatternToCaseAndCounts.put(unitPattern, caseAndCount);
174                     }
175                 }
176                 int caseFormCount = unitPatternToCaseAndCounts.keySet().size();
177 
178                 boolean alwaysSameAsNominative = true;
179                 TreeMultimap<Pair<String, String>, String> caseAndCountToPattern = Multimaps.invertFrom(unitPatternToCaseAndCounts, TreeMultimap.create());
180                 for (Entry<Pair<String, String>, String> entry : caseAndCountToPattern.entries()) {
181                     Pair<String, String> caseAndCount = entry.getKey();
182                     String pattern = entry.getValue();
183                     String gCase = caseAndCount.getFirst();
184                     if (!gCase.equals("nominative")) {
185                         Pair<String, String> nomPair = Pair.of("nominative", caseAndCount.getSecond());
186                         NavigableSet<String> nomPatterns = caseAndCountToPattern.get(nomPair);
187                         if (!nomPatterns.contains(pattern)) {
188                             unitToDistinctNominativeCase.put(shortUnitId, gCase);
189                             alwaysSameAsNominative = false;
190                         }
191                     }
192                 }
193                 for (Entry<String, Collection<String>> entry : unitToDistinctNominativeCase.asMap().entrySet()) {
194                     distinctNominativeCaseToUnit.put(PLUS_JOINER.join(entry.getValue()), entry.getKey());
195                 }
196                 if (alwaysSameAsNominative) {
197                     distinctNominativeCaseToUnit.put(EQUALS_NOMINATIVE, shortUnitId);
198                 }
199 
200                 if (gatherStats
201                     && !SKIP_CASE.contains(longUnitId)) {
202                     uniqueCaseAndCountToUnits.put(caseFormCount, shortUnitId);
203                 }
204 
205                 // For case, we should do something fancier, but for now we pick the units with the largest number of distinct forms.
206                 int diff = caseFormCount - bestCaseFormCount;
207                 if (diff > 0
208                     || diff == 0
209                     && isBetterUnit(longUnitId, bestCaseUnitId)) {
210 //                    System.out.println(cldrFile.getLocaleID() + "\t" + longUnitId + " better than " + bestCaseUnitId);
211 //                 if (WORSE.contains(longUnitId)) {
212 //                        isBetterUnit(longUnitId, bestCaseUnitId);
213 //                    }
214                     bestCaseFormCount = caseFormCount;
215                     bestCaseUnitId = longUnitId;
216                     bestUnitPatternToCases = TreeMultimap.create(unitPatternToCaseAndCounts);
217                 }
218             }
219         }
220         // Fill the case cache with the most distinctive forms.
221         Map<String, String> caseCache = getBestCasePatterns(bestUnitPatternToCases);
222 
223         // Make the gender cache be translated units as well as unit IDs
224         Count count = pluralInfo.getKeywords().contains("one") ? Count.one : Count.other;
225         Map<String,Pair<String,String>> result2 = Maps.newHashMap();
226 
227         for (Entry<String, String> entry : genderResults.entrySet()) {
228             String longUnitId = entry.getValue();
229             String unitPattern = cldrFile.getStringValue("//ldml/units/unitLength[@type=\"long\"]/unit[@type=\"" + longUnitId + "\"]/unitPattern[@count=\"" + count + "\"]");
230             unitPattern = unitPattern.replace("{0}", "").replace("\u00A0", "").trim();
231             result2.put(entry.getKey(), Pair.of(ExampleGenerator.UNIT_CONVERTER.getShortId(longUnitId), unitPattern));
232         }
233         // it doesn't matter if we reset this due to multiple threads
234         Map<String, Pair<String, String>> genderCache = ImmutableMap.copyOf(result2);
235         CaseAndGenderSamples result = new CaseAndGenderSamples(caseCache, ExampleGenerator.UNIT_CONVERTER.getShortId(bestCaseUnitId), genderCache);
236 
237         genderToUnits = ImmutableMultimap.copyOf(genderToUnits);
238         uniqueCaseAndCountToUnits = ImmutableMultimap.copyOf(uniqueCaseAndCountToUnits);
239         distinctNominativeCaseToUnit = ImmutableMultimap.copyOf(distinctNominativeCaseToUnit);
240         return result;
241     }
242 
243     /**
244      * Get the a pattern that is most unique for each case.
245      * @param bestUnitPatternToCases
246      * @return
247      */
getBestCasePatterns(Multimap<String, Pair<String, String>> bestUnitPatternToCases)248     private Map<String, String> getBestCasePatterns(Multimap<String, Pair<String, String>> bestUnitPatternToCases) {
249         if (bestUnitPatternToCases == null || bestUnitPatternToCases.isEmpty()) {
250             return Collections.emptyMap();
251         }
252         Map<String,String> result = new TreeMap<>();
253         while (true) {
254             String bestPattern = getBestPattern(bestUnitPatternToCases);
255             Pair<String, String> bestCaseCount = bestUnitPatternToCases.get(bestPattern).iterator().next();
256             String bestCase = bestCaseCount.getFirst();
257             String bestCount = bestCaseCount.getSecond();
258             String sample = getPluralOrOrdinalSample(PluralType.cardinal, bestCount);
259             if (sample == null) { // debugging
260                 getPluralOrOrdinalSample(PluralType.cardinal, bestCount);
261             }
262             result.put(bestCaseCount.getFirst(), bestPattern.replace("{0}", sample));
263             TreeMultimap<Pair<String, String>, String> caseToPatterns = Multimaps.invertFrom(bestUnitPatternToCases, TreeMultimap.create());
264             for (String count : pluralInfo.getKeywords()) {
265                 caseToPatterns.removeAll(Pair.of(bestCase, count));
266             }
267             if (caseToPatterns.keySet().isEmpty()) {
268                 return result;
269             }
270             bestUnitPatternToCases = Multimaps.invertFrom(caseToPatterns, TreeMultimap.create());
271         }
272     }
273 
getBestPattern(Multimap<String, Pair<String, String>> bestUnitPatternToCases)274     private String getBestPattern(Multimap<String, Pair<String, String>> bestUnitPatternToCases) {
275         int bestCaseSize = 1000;
276         String bestPattern = null;
277         Collection<Pair<String, String>> bestCase = null;
278         for (Entry<String, Collection<Pair<String, String>>> entry : bestUnitPatternToCases.asMap().entrySet()) {
279             final Collection<Pair<String, String>> setOfCases = entry.getValue();
280             if (setOfCases.size() < bestCaseSize) {
281                 bestCaseSize = setOfCases.size();
282                 bestPattern = entry.getKey();
283                 bestCase = setOfCases;
284             }
285         }
286         return bestPattern;
287     }
288 
isBetterUnit(String longUnitId, String formerLongUnitId)289     public boolean isBetterUnit(String longUnitId, String formerLongUnitId) {
290         // replace if as good or better (where better is smaller). Metric is better. If both metric, choose alphabetical
291         boolean isBetter = false;
292         int diff = systemWeight(longUnitId) - systemWeight(formerLongUnitId);
293         if (diff < 0) {
294             isBetter = true;
295         } else if (diff == 0) {
296             diff = categoryWeight(longUnitId) - categoryWeight(formerLongUnitId);
297             if (diff < 0) {
298                 isBetter = true;
299             } else if (diff == 0 && longUnitId.compareTo(formerLongUnitId) < 0) {
300                 isBetter = true;
301             }
302         }
303         return isBetter;
304     }
305 
306     static final Set<String> WORSE = ImmutableSet.of("length-100-kilometer", "length-mile-scandinavian");
307     /**
308      * better result is smaller
309      * @param longUnitId
310      * @return
311      */
systemWeight(String longUnitId)312     public int systemWeight(String longUnitId) {
313         if (WORSE.contains(longUnitId)) {
314             return 1;
315         }
316         Set<UnitSystem> systems = ExampleGenerator.UNIT_CONVERTER.getSystemsEnum(ExampleGenerator.UNIT_CONVERTER.getShortId(longUnitId));
317         if (systems.contains(UnitSystem.metric)) {
318             return 0;
319         }
320         return 1;
321     }
322 
categoryWeight(String longUnitId)323     private int categoryWeight(String longUnitId) {
324         if (longUnitId.startsWith("length")) {
325             return 0;
326         } else if (longUnitId.startsWith("weight")) {
327             return 1;
328         } else if (longUnitId.startsWith("duration")) {
329             return 2;
330         }
331         return 999;
332     }
333 
getPluralOrOrdinalSample(PluralType pluralType, String code)334     public String getPluralOrOrdinalSample(PluralType pluralType, String code) {
335         PluralRules rules = pluralType == PluralType.cardinal ? pluralInfo : ordinalInfo;
336         DecimalQuantitySamples samples = rules.getDecimalSamples(code, SampleType.INTEGER);
337         if (samples == null) {
338             samples = rules.getDecimalSamples(code, SampleType.DECIMAL);
339         }
340         if (samples == null) {
341             return null;
342         }
343 
344         // get good sample. Avoid zero if possible
345         DecimalQuantity sample = null;
346         for (DecimalQuantitySamplesRange sampleRange : samples.getSamples()) {
347             sample = sampleRange.start;
348             if (sample.toDouble() != 0d) {
349                 break;
350             }
351         }
352 
353         if (icuServiceBuilder != null) {
354             int visibleDigits = (int) sample.getPluralOperand(Operand.v);
355             DecimalFormat nf;
356             if (visibleDigits == 0) {
357                 nf = icuServiceBuilder.getNumberFormat(0); // 0 is integer, 1 is decimal
358             } else {
359                 nf = icuServiceBuilder.getNumberFormat(1); // 0 is integer, 1 is decimal
360                 int minFracDigits = nf.getMinimumFractionDigits();
361                 int maxFracDigits = nf.getMaximumFractionDigits();
362                 if (minFracDigits != visibleDigits || maxFracDigits != visibleDigits) {
363                     nf = (DecimalFormat) nf.clone();
364                     nf.setMaximumFractionDigits(visibleDigits);
365                     nf.setMinimumFractionDigits(visibleDigits);
366                 }
367             }
368             return nf.format(sample.toBigDecimal());
369         }
370         return sample.toString();
371     }
372 
373     /**
374      * Get the best value to show, plus the shortUnitId if relevant (case/gender)
375      */
getBestValue(String header, String code, Output<String> shortUnitId)376     public String getBestValue(String header, String code, Output<String> shortUnitId) {
377         String result = null;
378         switch(header) {
379         case "Case":
380             result = getBestUnitWithCase(code, shortUnitId);
381             break;
382         case "Gender":
383             result = getBestUnitWithGender(code, shortUnitId);
384             break;
385         case "Ordinal":
386             result = getPluralOrOrdinalSample(PluralType.ordinal, code);
387             shortUnitId.value = "n/a";
388             break;
389         case "Plural":
390             result = getPluralOrOrdinalSample(PluralType.cardinal, code);
391             shortUnitId.value = "n/a";
392             break;
393         }
394         return result == null ? "X" : result;
395     }
396 
getGenderToUnits()397     public Multimap<String, String> getGenderToUnits() {
398         return genderToUnits;
399     }
400 
getUniqueCaseAndCountToUnits()401     public Multimap<Integer, String> getUniqueCaseAndCountToUnits() {
402         return uniqueCaseAndCountToUnits;
403     }
getDistinctNominativeCaseToUnit()404     public Multimap<String, String> getDistinctNominativeCaseToUnit() {
405         return distinctNominativeCaseToUnit;
406     }
407 }