1 package org.unicode.cldr.test; 2 3 import java.util.Collection; 4 import java.util.Collections; 5 import java.util.Map; 6 import java.util.Map.Entry; 7 import java.util.NavigableSet; 8 import java.util.Set; 9 import java.util.TreeMap; 10 11 import org.unicode.cldr.util.CLDRConfig; 12 import org.unicode.cldr.util.CLDRFile; 13 import org.unicode.cldr.util.GrammarInfo; 14 import org.unicode.cldr.util.GrammarInfo.GrammaticalFeature; 15 import org.unicode.cldr.util.GrammarInfo.GrammaticalScope; 16 import org.unicode.cldr.util.GrammarInfo.GrammaticalTarget; 17 import org.unicode.cldr.util.ICUServiceBuilder; 18 import org.unicode.cldr.util.Pair; 19 import org.unicode.cldr.util.SupplementalDataInfo; 20 import org.unicode.cldr.util.SupplementalDataInfo.PluralInfo.Count; 21 import org.unicode.cldr.util.SupplementalDataInfo.PluralType; 22 import org.unicode.cldr.util.UnitConverter.UnitSystem; 23 import org.unicode.cldr.util.UnitPathType; 24 25 import com.google.common.base.Joiner; 26 import com.google.common.collect.ImmutableMultimap; 27 import com.google.common.collect.ImmutableSet; 28 import com.google.common.collect.Maps; 29 import com.google.common.collect.Multimap; 30 import com.google.common.collect.Multimaps; 31 import com.google.common.collect.TreeMultimap; 32 import com.ibm.icu.impl.locale.XCldrStub.ImmutableMap; 33 import com.ibm.icu.text.DecimalFormat; 34 import com.ibm.icu.text.PluralRules; 35 import com.ibm.icu.text.PluralRules.FixedDecimal; 36 import com.ibm.icu.text.PluralRules.FixedDecimalRange; 37 import com.ibm.icu.text.PluralRules.FixedDecimalSamples; 38 import com.ibm.icu.text.PluralRules.SampleType; 39 import com.ibm.icu.util.Output; 40 41 /** 42 * Return the best samples for illustrating minimal pairs 43 * @author markdavis 44 * 45 */ 46 public class BestMinimalPairSamples { 47 public static final String EQUALS_NOMINATIVE = "=nominative"; 48 private static final Joiner PLUS_JOINER = Joiner.on("+"); 49 private static final CLDRConfig CONFIG = CLDRConfig.getInstance(); 50 private static final SupplementalDataInfo supplementalDataInfo = SupplementalDataInfo.getInstance(); 51 52 final private CLDRFile cldrFile; 53 final private GrammarInfo grammarInfo; 54 final private PluralRules pluralInfo; 55 final private PluralRules ordinalInfo; 56 final private ICUServiceBuilder icuServiceBuilder; 57 private CaseAndGenderSamples caseAndGenderSamples = null; // lazy evaluated 58 private Multimap<String, String> genderToUnits; 59 private Multimap<Integer, String> uniqueCaseAndCountToUnits; 60 private Multimap<String, String> distinctNominativeCaseToUnit; 61 private final boolean gatherStats; 62 BestMinimalPairSamples(CLDRFile cldrFile, ICUServiceBuilder icuServiceBuilder, boolean gatherStats)63 public BestMinimalPairSamples(CLDRFile cldrFile, ICUServiceBuilder icuServiceBuilder, boolean gatherStats) { 64 this.cldrFile = cldrFile; 65 grammarInfo = supplementalDataInfo.getGrammarInfo(cldrFile.getLocaleID()); 66 pluralInfo = supplementalDataInfo.getPlurals(PluralType.cardinal, cldrFile.getLocaleID()).getPluralRules(); 67 ordinalInfo = supplementalDataInfo.getPlurals(PluralType.ordinal, cldrFile.getLocaleID()).getPluralRules(); 68 this.icuServiceBuilder = icuServiceBuilder; 69 genderToUnits = TreeMultimap.create(); 70 uniqueCaseAndCountToUnits = TreeMultimap.create(); 71 this.gatherStats = gatherStats; 72 } 73 74 75 static final class CaseAndGenderSamples { 76 private final Map<String, Pair<String, String>> genderCache; 77 private final Map<String, String> caseCache; 78 private final String caseUnitId; 79 CaseAndGenderSamples(Map<String, String> caseCache2, String bestCaseUnitId, Map<String, Pair<String, String>> genderCache2)80 public CaseAndGenderSamples(Map<String, String> caseCache2, String bestCaseUnitId, Map<String, Pair<String, String>> genderCache2) { 81 genderCache = genderCache2; 82 caseCache = caseCache2; 83 caseUnitId = bestCaseUnitId; 84 } 85 getGender(String gender, Output<String> shortUnitId)86 public String getGender(String gender, Output<String> shortUnitId) { 87 Pair<String, String> result = genderCache.get(gender); 88 if (result == null) { 89 return null; 90 } 91 shortUnitId.value = result.getFirst(); 92 return result.getSecond(); 93 } 94 getCase(String unitCase, Output<String> shortUnitId)95 public String getCase(String unitCase, Output<String> shortUnitId) { 96 shortUnitId.value = caseUnitId; 97 return caseCache.get(unitCase); 98 } 99 } 100 101 /** 102 * Returns a "good" value for a unit. Favors metric units, and simple units 103 * @param shortUnitId 104 */ getBestUnitWithGender(String gender, Output<String> shortUnitId)105 public synchronized String getBestUnitWithGender(String gender, Output<String> shortUnitId) { 106 if (grammarInfo == null) { 107 return null; 108 } 109 if (caseAndGenderSamples == null) { 110 caseAndGenderSamples = loadCaches(); 111 } 112 return caseAndGenderSamples.getGender(gender, shortUnitId); 113 } 114 115 /** 116 * Returns a "good" value for a unit. Favors metric units, and simple units 117 * @param shortUnitId 118 */ getBestUnitWithCase(String unitCase, Output<String> shortUnitId)119 public synchronized String getBestUnitWithCase(String unitCase, Output<String> shortUnitId) { 120 if (grammarInfo == null) { 121 return null; 122 } 123 if (caseAndGenderSamples == null) { 124 caseAndGenderSamples = loadCaches(); 125 } 126 return caseAndGenderSamples.getCase(unitCase, shortUnitId); 127 } 128 129 static final Set<String> SKIP_CASE = ImmutableSet.of( 130 "concentr-ofglucose", 131 "concentr-portion", 132 "length-100-kilometer", 133 "pressure-ofhg"); 134 loadCaches()135 public CaseAndGenderSamples loadCaches() { 136 Collection<String> unitCases = grammarInfo.get(GrammaticalTarget.nominal, GrammaticalFeature.grammaticalCase, GrammaticalScope.units); 137 Map<String,String> genderResults = Maps.newHashMap(); 138 Multimap<String, Pair<String,String>> unitPatternToCaseAndCounts = TreeMultimap.create(); 139 distinctNominativeCaseToUnit = TreeMultimap.create(); 140 141 int bestCaseFormCount = 0; 142 String bestCaseUnitId = null; 143 Multimap<String, Pair<String,String>> bestUnitPatternToCases = null; 144 Multimap<String, String> unitToDistinctNominativeCase = TreeMultimap.create(); 145 146 for (String longUnitId : GrammarInfo.getUnitsToAddGrammar()) { 147 String possibleGender = cldrFile.getStringValue("//ldml/units/unitLength[@type=\"long\"]/unit[@type=\"" + longUnitId + "\"]/gender"); 148 String shortUnitId = ExampleGenerator.UNIT_CONVERTER.getShortId(longUnitId); 149 if (shortUnitId.equals("hour") && cldrFile.getLocaleID().equals("ta")) { 150 int debug = 0; 151 } 152 if (possibleGender != null) { 153 if (gatherStats) { 154 genderToUnits.put(possibleGender, shortUnitId); 155 } 156 String formerLongUnitId = genderResults.get(possibleGender); 157 if (formerLongUnitId == null || isBetterUnit(longUnitId, formerLongUnitId)) { 158 genderResults.put(possibleGender, longUnitId); 159 } 160 } 161 if (!unitCases.isEmpty()) { 162 unitPatternToCaseAndCounts.clear(); 163 for (String count : pluralInfo.getKeywords()) { 164 for (String unitCase : unitCases) { 165 String grammarAttributes = GrammarInfo.getGrammaticalInfoAttributes(grammarInfo, UnitPathType.unit, count, null, unitCase); 166 String unitPattern = cldrFile.getStringValue("//ldml/units/unitLength[@type=\"long\"]/unit[@type=\"" + longUnitId + "\"]/unitPattern" + grammarAttributes); 167 if (unitPattern == null) { 168 continue; 169 } 170 unitPattern = unitPattern.replace("\u00A0", "").trim(); 171 final Pair<String, String> caseAndCount = Pair.of(unitCase, count); 172 unitPatternToCaseAndCounts.put(unitPattern, caseAndCount); 173 } 174 } 175 int caseFormCount = unitPatternToCaseAndCounts.keySet().size(); 176 177 boolean alwaysSameAsNominative = true; 178 TreeMultimap<Pair<String, String>, String> caseAndCountToPattern = Multimaps.invertFrom(unitPatternToCaseAndCounts, TreeMultimap.create()); 179 for (Entry<Pair<String, String>, String> entry : caseAndCountToPattern.entries()) { 180 Pair<String, String> caseAndCount = entry.getKey(); 181 String pattern = entry.getValue(); 182 String gCase = caseAndCount.getFirst(); 183 if (!gCase.equals("nominative")) { 184 Pair<String, String> nomPair = Pair.of("nominative", caseAndCount.getSecond()); 185 NavigableSet<String> nomPatterns = caseAndCountToPattern.get(nomPair); 186 if (!nomPatterns.contains(pattern)) { 187 unitToDistinctNominativeCase.put(shortUnitId, gCase); 188 alwaysSameAsNominative = false; 189 } 190 } 191 } 192 for (Entry<String, Collection<String>> entry : unitToDistinctNominativeCase.asMap().entrySet()) { 193 distinctNominativeCaseToUnit.put(PLUS_JOINER.join(entry.getValue()), entry.getKey()); 194 } 195 if (alwaysSameAsNominative) { 196 distinctNominativeCaseToUnit.put(EQUALS_NOMINATIVE, shortUnitId); 197 } 198 199 if (gatherStats 200 && !SKIP_CASE.contains(longUnitId)) { 201 uniqueCaseAndCountToUnits.put(caseFormCount, shortUnitId); 202 } 203 204 // For case, we should do something fancier, but for now we pick the units with the largest number of distinct forms. 205 int diff = caseFormCount - bestCaseFormCount; 206 if (diff > 0 207 || diff == 0 208 && isBetterUnit(longUnitId, bestCaseUnitId)) { 209 // System.out.println(cldrFile.getLocaleID() + "\t" + longUnitId + " better than " + bestCaseUnitId); 210 // if (WORSE.contains(longUnitId)) { 211 // isBetterUnit(longUnitId, bestCaseUnitId); 212 // } 213 bestCaseFormCount = caseFormCount; 214 bestCaseUnitId = longUnitId; 215 bestUnitPatternToCases = TreeMultimap.create(unitPatternToCaseAndCounts); 216 } 217 } 218 } 219 // Fill the case cache with the most distinctive forms. 220 Map<String, String> caseCache = getBestCasePatterns(bestUnitPatternToCases); 221 222 // Make the gender cache be translated units as well as unit IDs 223 Count count = pluralInfo.getKeywords().contains("one") ? Count.one : Count.other; 224 Map<String,Pair<String,String>> result2 = Maps.newHashMap(); 225 226 for (Entry<String, String> entry : genderResults.entrySet()) { 227 String longUnitId = entry.getValue(); 228 String unitPattern = cldrFile.getStringValue("//ldml/units/unitLength[@type=\"long\"]/unit[@type=\"" + longUnitId + "\"]/unitPattern[@count=\"" + count + "\"]"); 229 unitPattern = unitPattern.replace("{0}", "").replace("\u00A0", "").trim(); 230 result2.put(entry.getKey(), Pair.of(ExampleGenerator.UNIT_CONVERTER.getShortId(longUnitId), unitPattern)); 231 } 232 // it doesn't matter if we reset this due to multiple threads 233 Map<String, Pair<String, String>> genderCache = ImmutableMap.copyOf(result2); 234 CaseAndGenderSamples result = new CaseAndGenderSamples(caseCache, ExampleGenerator.UNIT_CONVERTER.getShortId(bestCaseUnitId), genderCache); 235 236 genderToUnits = ImmutableMultimap.copyOf(genderToUnits); 237 uniqueCaseAndCountToUnits = ImmutableMultimap.copyOf(uniqueCaseAndCountToUnits); 238 distinctNominativeCaseToUnit = ImmutableMultimap.copyOf(distinctNominativeCaseToUnit); 239 return result; 240 } 241 242 /** 243 * Get the a pattern that is most unique for each case. 244 * @param bestUnitPatternToCases 245 * @return 246 */ getBestCasePatterns(Multimap<String, Pair<String, String>> bestUnitPatternToCases)247 private Map<String, String> getBestCasePatterns(Multimap<String, Pair<String, String>> bestUnitPatternToCases) { 248 if (bestUnitPatternToCases == null || bestUnitPatternToCases.isEmpty()) { 249 return Collections.emptyMap(); 250 } 251 Map<String,String> result = new TreeMap<>(); 252 while (true) { 253 String bestPattern = getBestPattern(bestUnitPatternToCases); 254 Pair<String, String> bestCaseCount = bestUnitPatternToCases.get(bestPattern).iterator().next(); 255 String bestCase = bestCaseCount.getFirst(); 256 String bestCount = bestCaseCount.getSecond(); 257 String sample = getPluralOrOrdinalSample(PluralType.cardinal, bestCount); 258 if (sample == null) { // debugging 259 getPluralOrOrdinalSample(PluralType.cardinal, bestCount); 260 } 261 result.put(bestCaseCount.getFirst(), bestPattern.replace("{0}", sample)); 262 TreeMultimap<Pair<String, String>, String> caseToPatterns = Multimaps.invertFrom(bestUnitPatternToCases, TreeMultimap.create()); 263 for (String count : pluralInfo.getKeywords()) { 264 caseToPatterns.removeAll(Pair.of(bestCase, count)); 265 } 266 if (caseToPatterns.keySet().isEmpty()) { 267 return result; 268 } 269 bestUnitPatternToCases = Multimaps.invertFrom(caseToPatterns, TreeMultimap.create()); 270 } 271 } 272 getBestPattern(Multimap<String, Pair<String, String>> bestUnitPatternToCases)273 private String getBestPattern(Multimap<String, Pair<String, String>> bestUnitPatternToCases) { 274 int bestCaseSize = 1000; 275 String bestPattern = null; 276 Collection<Pair<String, String>> bestCase = null; 277 for (Entry<String, Collection<Pair<String, String>>> entry : bestUnitPatternToCases.asMap().entrySet()) { 278 final Collection<Pair<String, String>> setOfCases = entry.getValue(); 279 if (setOfCases.size() < bestCaseSize) { 280 bestCaseSize = setOfCases.size(); 281 bestPattern = entry.getKey(); 282 bestCase = setOfCases; 283 } 284 } 285 return bestPattern; 286 } 287 isBetterUnit(String longUnitId, String formerLongUnitId)288 public boolean isBetterUnit(String longUnitId, String formerLongUnitId) { 289 // replace if as good or better (where better is smaller). Metric is better. If both metric, choose alphabetical 290 boolean isBetter = false; 291 int diff = systemWeight(longUnitId) - systemWeight(formerLongUnitId); 292 if (diff < 0) { 293 isBetter = true; 294 } else if (diff == 0) { 295 diff = categoryWeight(longUnitId) - categoryWeight(formerLongUnitId); 296 if (diff < 0) { 297 isBetter = true; 298 } else if (diff == 0 && longUnitId.compareTo(formerLongUnitId) < 0) { 299 isBetter = true; 300 } 301 } 302 return isBetter; 303 } 304 305 static final Set<String> WORSE = ImmutableSet.of("length-100-kilometer", "length-mile-scandinavian"); 306 /** 307 * better result is smaller 308 * @param longUnitId 309 * @return 310 */ systemWeight(String longUnitId)311 public int systemWeight(String longUnitId) { 312 if (WORSE.contains(longUnitId)) { 313 return 1; 314 } 315 Set<UnitSystem> systems = ExampleGenerator.UNIT_CONVERTER.getSystemsEnum(ExampleGenerator.UNIT_CONVERTER.getShortId(longUnitId)); 316 if (systems.contains(UnitSystem.metric)) { 317 return 0; 318 } 319 return 1; 320 } 321 categoryWeight(String longUnitId)322 private int categoryWeight(String longUnitId) { 323 if (longUnitId.startsWith("length")) { 324 return 0; 325 } else if (longUnitId.startsWith("weight")) { 326 return 1; 327 } else if (longUnitId.startsWith("duration")) { 328 return 2; 329 } 330 return 999; 331 } 332 getPluralOrOrdinalSample(PluralType pluralType, String code)333 public String getPluralOrOrdinalSample(PluralType pluralType, String code) { 334 PluralRules rules = pluralType == PluralType.cardinal ? pluralInfo : ordinalInfo; 335 FixedDecimalSamples samples = rules.getDecimalSamples(code, SampleType.INTEGER); 336 if (samples == null) { 337 samples = rules.getDecimalSamples(code, SampleType.DECIMAL); 338 } 339 if (samples == null) { 340 return null; 341 } 342 343 // get good sample. Avoid zero if possible 344 FixedDecimal sample = null; 345 for (FixedDecimalRange sampleRange : samples.getSamples()) { 346 sample = sampleRange.start; 347 if (sample.doubleValue() != 0d) { 348 break; 349 } 350 } 351 352 if (icuServiceBuilder != null) { 353 int visibleDigits = sample.getVisibleDecimalDigitCount(); 354 DecimalFormat nf; 355 if (visibleDigits == 0) { 356 nf = icuServiceBuilder.getNumberFormat(0); // 0 is integer, 1 is decimal 357 } else { 358 nf = icuServiceBuilder.getNumberFormat(1); // 0 is integer, 1 is decimal 359 int minFracDigits = nf.getMinimumFractionDigits(); 360 int maxFracDigits = nf.getMaximumFractionDigits(); 361 if (minFracDigits != visibleDigits || maxFracDigits != visibleDigits) { 362 nf = (DecimalFormat) nf.clone(); 363 nf.setMaximumFractionDigits(visibleDigits); 364 nf.setMinimumFractionDigits(visibleDigits); 365 } 366 } 367 return nf.format(sample); 368 } 369 return sample.toString(); 370 } 371 372 /** 373 * Get the best value to show, plus the shortUnitId if relevant (case/gender) 374 */ getBestValue(String header, String code, Output<String> shortUnitId)375 public String getBestValue(String header, String code, Output<String> shortUnitId) { 376 String result = null; 377 switch(header) { 378 case "Case": 379 result = getBestUnitWithCase(code, shortUnitId); 380 break; 381 case "Gender": 382 result = getBestUnitWithGender(code, shortUnitId); 383 break; 384 case "Ordinal": 385 result = getPluralOrOrdinalSample(PluralType.ordinal, code); 386 shortUnitId.value = "n/a"; 387 break; 388 case "Plural": 389 result = getPluralOrOrdinalSample(PluralType.cardinal, code); 390 shortUnitId.value = "n/a"; 391 break; 392 } 393 return result == null ? "X" : result; 394 } 395 getGenderToUnits()396 public Multimap<String, String> getGenderToUnits() { 397 return genderToUnits; 398 } 399 getUniqueCaseAndCountToUnits()400 public Multimap<Integer, String> getUniqueCaseAndCountToUnits() { 401 return uniqueCaseAndCountToUnits; 402 } getDistinctNominativeCaseToUnit()403 public Multimap<String, String> getDistinctNominativeCaseToUnit() { 404 return distinctNominativeCaseToUnit; 405 } 406 }