• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 package org.unicode.cldr.test;
2 
3 import java.util.Collection;
4 import java.util.Collections;
5 import java.util.Map;
6 import java.util.Map.Entry;
7 import java.util.NavigableSet;
8 import java.util.Set;
9 import java.util.TreeMap;
10 
11 import org.unicode.cldr.util.CLDRConfig;
12 import org.unicode.cldr.util.CLDRFile;
13 import org.unicode.cldr.util.GrammarInfo;
14 import org.unicode.cldr.util.GrammarInfo.GrammaticalFeature;
15 import org.unicode.cldr.util.GrammarInfo.GrammaticalScope;
16 import org.unicode.cldr.util.GrammarInfo.GrammaticalTarget;
17 import org.unicode.cldr.util.ICUServiceBuilder;
18 import org.unicode.cldr.util.Pair;
19 import org.unicode.cldr.util.SupplementalDataInfo;
20 import org.unicode.cldr.util.SupplementalDataInfo.PluralInfo.Count;
21 import org.unicode.cldr.util.SupplementalDataInfo.PluralType;
22 import org.unicode.cldr.util.UnitConverter.UnitSystem;
23 import org.unicode.cldr.util.UnitPathType;
24 
25 import com.google.common.base.Joiner;
26 import com.google.common.collect.ImmutableMultimap;
27 import com.google.common.collect.ImmutableSet;
28 import com.google.common.collect.Maps;
29 import com.google.common.collect.Multimap;
30 import com.google.common.collect.Multimaps;
31 import com.google.common.collect.TreeMultimap;
32 import com.ibm.icu.impl.locale.XCldrStub.ImmutableMap;
33 import com.ibm.icu.text.DecimalFormat;
34 import com.ibm.icu.text.PluralRules;
35 import com.ibm.icu.text.PluralRules.FixedDecimal;
36 import com.ibm.icu.text.PluralRules.FixedDecimalRange;
37 import com.ibm.icu.text.PluralRules.FixedDecimalSamples;
38 import com.ibm.icu.text.PluralRules.SampleType;
39 import com.ibm.icu.util.Output;
40 
41 /**
42  * Return the best samples for illustrating minimal pairs
43  * @author markdavis
44  *
45  */
46 public class BestMinimalPairSamples {
47     public static final String EQUALS_NOMINATIVE = "=nominative";
48     private static final Joiner PLUS_JOINER = Joiner.on("+");
49     private static final CLDRConfig CONFIG = CLDRConfig.getInstance();
50     private static final SupplementalDataInfo supplementalDataInfo = SupplementalDataInfo.getInstance();
51 
52     final private CLDRFile cldrFile;
53     final private GrammarInfo grammarInfo;
54     final private PluralRules pluralInfo;
55     final private PluralRules ordinalInfo;
56     final private ICUServiceBuilder icuServiceBuilder;
57     private CaseAndGenderSamples caseAndGenderSamples = null; // lazy evaluated
58     private Multimap<String, String> genderToUnits;
59     private Multimap<Integer, String> uniqueCaseAndCountToUnits;
60     private Multimap<String, String> distinctNominativeCaseToUnit;
61     private final boolean gatherStats;
62 
BestMinimalPairSamples(CLDRFile cldrFile, ICUServiceBuilder icuServiceBuilder, boolean gatherStats)63     public BestMinimalPairSamples(CLDRFile cldrFile, ICUServiceBuilder icuServiceBuilder, boolean gatherStats) {
64         this.cldrFile = cldrFile;
65         grammarInfo = supplementalDataInfo.getGrammarInfo(cldrFile.getLocaleID());
66         pluralInfo = supplementalDataInfo.getPlurals(PluralType.cardinal, cldrFile.getLocaleID()).getPluralRules();
67         ordinalInfo = supplementalDataInfo.getPlurals(PluralType.ordinal, cldrFile.getLocaleID()).getPluralRules();
68         this.icuServiceBuilder = icuServiceBuilder;
69         genderToUnits = TreeMultimap.create();
70         uniqueCaseAndCountToUnits = TreeMultimap.create();
71         this.gatherStats = gatherStats;
72     }
73 
74 
75     static final class CaseAndGenderSamples {
76         private final Map<String, Pair<String, String>> genderCache;
77         private final Map<String, String> caseCache;
78         private final String caseUnitId;
79 
CaseAndGenderSamples(Map<String, String> caseCache2, String bestCaseUnitId, Map<String, Pair<String, String>> genderCache2)80         public CaseAndGenderSamples(Map<String, String> caseCache2, String bestCaseUnitId,  Map<String, Pair<String, String>> genderCache2) {
81             genderCache  = genderCache2;
82             caseCache = caseCache2;
83             caseUnitId = bestCaseUnitId;
84         }
85 
getGender(String gender, Output<String> shortUnitId)86         public String getGender(String gender, Output<String> shortUnitId) {
87             Pair<String, String> result = genderCache.get(gender);
88             if (result == null) {
89                 return null;
90             }
91             shortUnitId.value = result.getFirst();
92             return result.getSecond();
93         }
94 
getCase(String unitCase, Output<String> shortUnitId)95         public String getCase(String unitCase, Output<String> shortUnitId) {
96             shortUnitId.value = caseUnitId;
97             return caseCache.get(unitCase);
98         }
99     }
100 
101     /**
102      * Returns a "good" value for a unit. Favors metric units, and simple units
103      * @param shortUnitId
104      */
getBestUnitWithGender(String gender, Output<String> shortUnitId)105     public synchronized String getBestUnitWithGender(String gender, Output<String> shortUnitId) {
106         if (grammarInfo == null) {
107             return null;
108         }
109         if (caseAndGenderSamples == null) {
110             caseAndGenderSamples = loadCaches();
111         }
112         return caseAndGenderSamples.getGender(gender, shortUnitId);
113     }
114 
115     /**
116      * Returns a "good" value for a unit. Favors metric units, and simple units
117      * @param shortUnitId
118      */
getBestUnitWithCase(String unitCase, Output<String> shortUnitId)119     public synchronized String getBestUnitWithCase(String unitCase, Output<String> shortUnitId) {
120         if (grammarInfo == null) {
121             return null;
122         }
123         if (caseAndGenderSamples == null) {
124             caseAndGenderSamples = loadCaches();
125         }
126         return caseAndGenderSamples.getCase(unitCase, shortUnitId);
127     }
128 
129     static final Set<String> SKIP_CASE = ImmutableSet.of(
130         "concentr-ofglucose",
131         "concentr-portion",
132         "length-100-kilometer",
133         "pressure-ofhg");
134 
loadCaches()135     public CaseAndGenderSamples loadCaches() {
136         Collection<String> unitCases = grammarInfo.get(GrammaticalTarget.nominal, GrammaticalFeature.grammaticalCase, GrammaticalScope.units);
137         Map<String,String> genderResults = Maps.newHashMap();
138         Multimap<String, Pair<String,String>> unitPatternToCaseAndCounts = TreeMultimap.create();
139         distinctNominativeCaseToUnit = TreeMultimap.create();
140 
141         int bestCaseFormCount = 0;
142         String bestCaseUnitId = null;
143         Multimap<String, Pair<String,String>> bestUnitPatternToCases = null;
144         Multimap<String, String> unitToDistinctNominativeCase = TreeMultimap.create();
145 
146         for (String longUnitId : GrammarInfo.getUnitsToAddGrammar()) {
147             String possibleGender = cldrFile.getStringValue("//ldml/units/unitLength[@type=\"long\"]/unit[@type=\"" + longUnitId + "\"]/gender");
148             String shortUnitId = ExampleGenerator.UNIT_CONVERTER.getShortId(longUnitId);
149             if (shortUnitId.equals("hour") && cldrFile.getLocaleID().equals("ta")) {
150                 int debug = 0;
151             }
152             if (possibleGender != null) {
153                 if (gatherStats) {
154                     genderToUnits.put(possibleGender, shortUnitId);
155                 }
156                 String formerLongUnitId = genderResults.get(possibleGender);
157                 if (formerLongUnitId == null || isBetterUnit(longUnitId, formerLongUnitId)) {
158                     genderResults.put(possibleGender, longUnitId);
159                 }
160             }
161             if (!unitCases.isEmpty()) {
162                 unitPatternToCaseAndCounts.clear();
163                 for (String count : pluralInfo.getKeywords()) {
164                     for (String unitCase : unitCases) {
165                         String grammarAttributes = GrammarInfo.getGrammaticalInfoAttributes(grammarInfo, UnitPathType.unit, count, null, unitCase);
166                         String unitPattern = cldrFile.getStringValue("//ldml/units/unitLength[@type=\"long\"]/unit[@type=\"" + longUnitId + "\"]/unitPattern" + grammarAttributes);
167                         if (unitPattern == null) {
168                             continue;
169                         }
170                         unitPattern = unitPattern.replace("\u00A0", "").trim();
171                         final Pair<String, String> caseAndCount = Pair.of(unitCase, count);
172                         unitPatternToCaseAndCounts.put(unitPattern, caseAndCount);
173                     }
174                 }
175                 int caseFormCount = unitPatternToCaseAndCounts.keySet().size();
176 
177                 boolean alwaysSameAsNominative = true;
178                 TreeMultimap<Pair<String, String>, String> caseAndCountToPattern = Multimaps.invertFrom(unitPatternToCaseAndCounts, TreeMultimap.create());
179                 for (Entry<Pair<String, String>, String> entry : caseAndCountToPattern.entries()) {
180                     Pair<String, String> caseAndCount = entry.getKey();
181                     String pattern = entry.getValue();
182                     String gCase = caseAndCount.getFirst();
183                     if (!gCase.equals("nominative")) {
184                         Pair<String, String> nomPair = Pair.of("nominative", caseAndCount.getSecond());
185                         NavigableSet<String> nomPatterns = caseAndCountToPattern.get(nomPair);
186                         if (!nomPatterns.contains(pattern)) {
187                             unitToDistinctNominativeCase.put(shortUnitId, gCase);
188                             alwaysSameAsNominative = false;
189                         }
190                     }
191                 }
192                 for (Entry<String, Collection<String>> entry : unitToDistinctNominativeCase.asMap().entrySet()) {
193                     distinctNominativeCaseToUnit.put(PLUS_JOINER.join(entry.getValue()), entry.getKey());
194                 }
195                 if (alwaysSameAsNominative) {
196                     distinctNominativeCaseToUnit.put(EQUALS_NOMINATIVE, shortUnitId);
197                 }
198 
199                 if (gatherStats
200                     && !SKIP_CASE.contains(longUnitId)) {
201                     uniqueCaseAndCountToUnits.put(caseFormCount, shortUnitId);
202                 }
203 
204                 // For case, we should do something fancier, but for now we pick the units with the largest number of distinct forms.
205                 int diff = caseFormCount - bestCaseFormCount;
206                 if (diff > 0
207                     || diff == 0
208                     && isBetterUnit(longUnitId, bestCaseUnitId)) {
209 //                    System.out.println(cldrFile.getLocaleID() + "\t" + longUnitId + " better than " + bestCaseUnitId);
210 //                 if (WORSE.contains(longUnitId)) {
211 //                        isBetterUnit(longUnitId, bestCaseUnitId);
212 //                    }
213                     bestCaseFormCount = caseFormCount;
214                     bestCaseUnitId = longUnitId;
215                     bestUnitPatternToCases = TreeMultimap.create(unitPatternToCaseAndCounts);
216                 }
217             }
218         }
219         // Fill the case cache with the most distinctive forms.
220         Map<String, String> caseCache = getBestCasePatterns(bestUnitPatternToCases);
221 
222         // Make the gender cache be translated units as well as unit IDs
223         Count count = pluralInfo.getKeywords().contains("one") ? Count.one : Count.other;
224         Map<String,Pair<String,String>> result2 = Maps.newHashMap();
225 
226         for (Entry<String, String> entry : genderResults.entrySet()) {
227             String longUnitId = entry.getValue();
228             String unitPattern = cldrFile.getStringValue("//ldml/units/unitLength[@type=\"long\"]/unit[@type=\"" + longUnitId + "\"]/unitPattern[@count=\"" + count + "\"]");
229             unitPattern = unitPattern.replace("{0}", "").replace("\u00A0", "").trim();
230             result2.put(entry.getKey(), Pair.of(ExampleGenerator.UNIT_CONVERTER.getShortId(longUnitId), unitPattern));
231         }
232         // it doesn't matter if we reset this due to multiple threads
233         Map<String, Pair<String, String>> genderCache = ImmutableMap.copyOf(result2);
234         CaseAndGenderSamples result = new CaseAndGenderSamples(caseCache, ExampleGenerator.UNIT_CONVERTER.getShortId(bestCaseUnitId), genderCache);
235 
236         genderToUnits = ImmutableMultimap.copyOf(genderToUnits);
237         uniqueCaseAndCountToUnits = ImmutableMultimap.copyOf(uniqueCaseAndCountToUnits);
238         distinctNominativeCaseToUnit = ImmutableMultimap.copyOf(distinctNominativeCaseToUnit);
239         return result;
240     }
241 
242     /**
243      * Get the a pattern that is most unique for each case.
244      * @param bestUnitPatternToCases
245      * @return
246      */
getBestCasePatterns(Multimap<String, Pair<String, String>> bestUnitPatternToCases)247     private Map<String, String> getBestCasePatterns(Multimap<String, Pair<String, String>> bestUnitPatternToCases) {
248         if (bestUnitPatternToCases == null || bestUnitPatternToCases.isEmpty()) {
249             return Collections.emptyMap();
250         }
251         Map<String,String> result = new TreeMap<>();
252         while (true) {
253             String bestPattern = getBestPattern(bestUnitPatternToCases);
254             Pair<String, String> bestCaseCount = bestUnitPatternToCases.get(bestPattern).iterator().next();
255             String bestCase = bestCaseCount.getFirst();
256             String bestCount = bestCaseCount.getSecond();
257             String sample = getPluralOrOrdinalSample(PluralType.cardinal, bestCount);
258             if (sample == null) { // debugging
259                 getPluralOrOrdinalSample(PluralType.cardinal, bestCount);
260             }
261             result.put(bestCaseCount.getFirst(), bestPattern.replace("{0}", sample));
262             TreeMultimap<Pair<String, String>, String> caseToPatterns = Multimaps.invertFrom(bestUnitPatternToCases, TreeMultimap.create());
263             for (String count : pluralInfo.getKeywords()) {
264                 caseToPatterns.removeAll(Pair.of(bestCase, count));
265             }
266             if (caseToPatterns.keySet().isEmpty()) {
267                 return result;
268             }
269             bestUnitPatternToCases = Multimaps.invertFrom(caseToPatterns, TreeMultimap.create());
270         }
271     }
272 
getBestPattern(Multimap<String, Pair<String, String>> bestUnitPatternToCases)273     private String getBestPattern(Multimap<String, Pair<String, String>> bestUnitPatternToCases) {
274         int bestCaseSize = 1000;
275         String bestPattern = null;
276         Collection<Pair<String, String>> bestCase = null;
277         for (Entry<String, Collection<Pair<String, String>>> entry : bestUnitPatternToCases.asMap().entrySet()) {
278             final Collection<Pair<String, String>> setOfCases = entry.getValue();
279             if (setOfCases.size() < bestCaseSize) {
280                 bestCaseSize = setOfCases.size();
281                 bestPattern = entry.getKey();
282                 bestCase = setOfCases;
283             }
284         }
285         return bestPattern;
286     }
287 
isBetterUnit(String longUnitId, String formerLongUnitId)288     public boolean isBetterUnit(String longUnitId, String formerLongUnitId) {
289         // replace if as good or better (where better is smaller). Metric is better. If both metric, choose alphabetical
290         boolean isBetter = false;
291         int diff = systemWeight(longUnitId) - systemWeight(formerLongUnitId);
292         if (diff < 0) {
293             isBetter = true;
294         } else if (diff == 0) {
295             diff = categoryWeight(longUnitId) - categoryWeight(formerLongUnitId);
296             if (diff < 0) {
297                 isBetter = true;
298             } else if (diff == 0 && longUnitId.compareTo(formerLongUnitId) < 0) {
299                 isBetter = true;
300             }
301         }
302         return isBetter;
303     }
304 
305     static final Set<String> WORSE = ImmutableSet.of("length-100-kilometer", "length-mile-scandinavian");
306     /**
307      * better result is smaller
308      * @param longUnitId
309      * @return
310      */
systemWeight(String longUnitId)311     public int systemWeight(String longUnitId) {
312         if (WORSE.contains(longUnitId)) {
313             return 1;
314         }
315         Set<UnitSystem> systems = ExampleGenerator.UNIT_CONVERTER.getSystemsEnum(ExampleGenerator.UNIT_CONVERTER.getShortId(longUnitId));
316         if (systems.contains(UnitSystem.metric)) {
317             return 0;
318         }
319         return 1;
320     }
321 
categoryWeight(String longUnitId)322     private int categoryWeight(String longUnitId) {
323         if (longUnitId.startsWith("length")) {
324             return 0;
325         } else if (longUnitId.startsWith("weight")) {
326             return 1;
327         } else if (longUnitId.startsWith("duration")) {
328             return 2;
329         }
330         return 999;
331     }
332 
getPluralOrOrdinalSample(PluralType pluralType, String code)333     public String getPluralOrOrdinalSample(PluralType pluralType, String code) {
334         PluralRules rules = pluralType == PluralType.cardinal ? pluralInfo : ordinalInfo;
335         FixedDecimalSamples samples = rules.getDecimalSamples(code, SampleType.INTEGER);
336         if (samples == null) {
337             samples = rules.getDecimalSamples(code, SampleType.DECIMAL);
338         }
339         if (samples == null) {
340             return null;
341         }
342 
343         // get good sample. Avoid zero if possible
344         FixedDecimal sample = null;
345         for (FixedDecimalRange sampleRange : samples.getSamples()) {
346             sample = sampleRange.start;
347             if (sample.doubleValue() != 0d) {
348                 break;
349             }
350         }
351 
352         if (icuServiceBuilder != null) {
353             int visibleDigits = sample.getVisibleDecimalDigitCount();
354             DecimalFormat nf;
355             if (visibleDigits == 0) {
356                 nf = icuServiceBuilder.getNumberFormat(0); // 0 is integer, 1 is decimal
357             } else {
358                 nf = icuServiceBuilder.getNumberFormat(1); // 0 is integer, 1 is decimal
359                 int minFracDigits = nf.getMinimumFractionDigits();
360                 int maxFracDigits = nf.getMaximumFractionDigits();
361                 if (minFracDigits != visibleDigits || maxFracDigits != visibleDigits) {
362                     nf = (DecimalFormat) nf.clone();
363                     nf.setMaximumFractionDigits(visibleDigits);
364                     nf.setMinimumFractionDigits(visibleDigits);
365                 }
366             }
367             return nf.format(sample);
368         }
369         return sample.toString();
370     }
371 
372     /**
373      * Get the best value to show, plus the shortUnitId if relevant (case/gender)
374      */
getBestValue(String header, String code, Output<String> shortUnitId)375     public String getBestValue(String header, String code, Output<String> shortUnitId) {
376         String result = null;
377         switch(header) {
378         case "Case":
379             result = getBestUnitWithCase(code, shortUnitId);
380             break;
381         case "Gender":
382             result = getBestUnitWithGender(code, shortUnitId);
383             break;
384         case "Ordinal":
385             result = getPluralOrOrdinalSample(PluralType.ordinal, code);
386             shortUnitId.value = "n/a";
387             break;
388         case "Plural":
389             result = getPluralOrOrdinalSample(PluralType.cardinal, code);
390             shortUnitId.value = "n/a";
391             break;
392         }
393         return result == null ? "X" : result;
394     }
395 
getGenderToUnits()396     public Multimap<String, String> getGenderToUnits() {
397         return genderToUnits;
398     }
399 
getUniqueCaseAndCountToUnits()400     public Multimap<Integer, String> getUniqueCaseAndCountToUnits() {
401         return uniqueCaseAndCountToUnits;
402     }
getDistinctNominativeCaseToUnit()403     public Multimap<String, String> getDistinctNominativeCaseToUnit() {
404         return distinctNominativeCaseToUnit;
405     }
406 }