package org.unicode.cldr.test;

import java.util.Collection;
import java.util.Collections;
import java.util.Map;
import java.util.Map.Entry;
import java.util.NavigableSet;
import java.util.Set;
import java.util.TreeMap;

import org.unicode.cldr.util.CLDRConfig;
import org.unicode.cldr.util.CLDRFile;
import org.unicode.cldr.util.GrammarInfo;
import org.unicode.cldr.util.GrammarInfo.GrammaticalFeature;
import org.unicode.cldr.util.GrammarInfo.GrammaticalScope;
import org.unicode.cldr.util.GrammarInfo.GrammaticalTarget;
import org.unicode.cldr.util.ICUServiceBuilder;
import org.unicode.cldr.util.Pair;
import org.unicode.cldr.util.SupplementalDataInfo;
import org.unicode.cldr.util.SupplementalDataInfo.PluralInfo.Count;
import org.unicode.cldr.util.SupplementalDataInfo.PluralType;
import org.unicode.cldr.util.UnitConverter.UnitSystem;
import org.unicode.cldr.util.UnitPathType;

import com.google.common.base.Joiner;
import com.google.common.collect.ImmutableMultimap;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Maps;
import com.google.common.collect.Multimap;
import com.google.common.collect.Multimaps;
import com.google.common.collect.TreeMultimap;
import com.ibm.icu.impl.locale.XCldrStub.ImmutableMap;
import com.ibm.icu.impl.number.DecimalQuantity;
import com.ibm.icu.text.DecimalFormat;
import com.ibm.icu.text.PluralRules;
import com.ibm.icu.text.PluralRules.DecimalQuantitySamples;
import com.ibm.icu.text.PluralRules.DecimalQuantitySamplesRange;
import com.ibm.icu.text.PluralRules.Operand;
import com.ibm.icu.text.PluralRules.SampleType;
import com.ibm.icu.util.Output;

/**
 * Returns the best samples for illustrating minimal pairs (case, gender, plural, ordinal)
 * for the locale of a given {@link CLDRFile}.
 *
 * <p>The case and gender samples are computed lazily, on the first call to
 * {@link #getBestUnitWithCase} or {@link #getBestUnitWithGender}; both of those methods are
 * synchronized. {@link #loadCaches()} itself is not synchronized.
 *
 * @author markdavis
 */
public class BestMinimalPairSamples {
    public static final String EQUALS_NOMINATIVE = "=nominative";
    private static final Joiner PLUS_JOINER = Joiner.on("+");
    private static final CLDRConfig CONFIG = CLDRConfig.getInstance();
    private static final SupplementalDataInfo supplementalDataInfo = SupplementalDataInfo.getInstance();

    private final CLDRFile cldrFile;
    private final GrammarInfo grammarInfo;
    private final PluralRules pluralInfo;
    private final PluralRules ordinalInfo;
    private final ICUServiceBuilder icuServiceBuilder;
    private CaseAndGenderSamples caseAndGenderSamples = null; // lazy evaluated
    private Multimap<String, String> genderToUnits;
    private Multimap<Integer, String> uniqueCaseAndCountToUnits;
    private Multimap<String, String> distinctNominativeCaseToUnit; // null until loadCaches() has run
    private final boolean gatherStats;

    /**
     * @param cldrFile source of the unit patterns and genders; its locale ID selects the grammar and plural data
     * @param icuServiceBuilder used to format numeric samples; if null, samples are rendered with {@code toString()}
     * @param gatherStats if true, {@link #loadCaches()} also fills the gender-to-units and
     *     case-form-count-to-units statistics
     */
    public BestMinimalPairSamples(CLDRFile cldrFile, ICUServiceBuilder icuServiceBuilder, boolean gatherStats) {
        this.cldrFile = cldrFile;
        grammarInfo = supplementalDataInfo.getGrammarInfo(cldrFile.getLocaleID());
        pluralInfo = supplementalDataInfo.getPlurals(PluralType.cardinal, cldrFile.getLocaleID()).getPluralRules();
        ordinalInfo = supplementalDataInfo.getPlurals(PluralType.ordinal, cldrFile.getLocaleID()).getPluralRules();
        this.icuServiceBuilder = icuServiceBuilder;
        genderToUnits = TreeMultimap.create();
        uniqueCaseAndCountToUnits = TreeMultimap.create();
        this.gatherStats = gatherStats;
    }

    /**
     * Holder for the computed samples: one (shortUnitId, pattern) pair per gender, and one
     * formatted pattern per case, all case patterns coming from a single unit.
     */
    static final class CaseAndGenderSamples {
        private final Map<String, Pair<String, String>> genderCache;
        private final Map<String, String> caseCache;
        private final String caseUnitId;

        public CaseAndGenderSamples(Map<String, String> caseCache2, String bestCaseUnitId, Map<String, Pair<String, String>> genderCache2) {
            genderCache = genderCache2;
            caseCache = caseCache2;
            caseUnitId = bestCaseUnitId;
        }

        /**
         * @param gender the grammatical gender to look up
         * @param shortUnitId set to the sample's short unit ID when a sample exists; untouched otherwise
         * @return the sample for the gender, or null if there is none
         */
        public String getGender(String gender, Output<String> shortUnitId) {
            Pair<String, String> result = genderCache.get(gender);
            if (result == null) {
                return null;
            }
            shortUnitId.value = result.getFirst();
            return result.getSecond();
        }

        /**
         * @param unitCase the grammatical case to look up
         * @param shortUnitId always set to the short ID of the unit used for the case samples
         * @return the sample for the case, or null if there is none
         */
        public String getCase(String unitCase, Output<String> shortUnitId) {
            shortUnitId.value = caseUnitId;
            return caseCache.get(unitCase);
        }
    }

    /**
     * Returns a "good" value for a unit with the given gender. Favors metric units, and simple units.
     *
     * @param gender the grammatical gender wanted
     * @param shortUnitId receives the short ID of the unit the sample comes from
     * @return the sample, or null if the gender is null, the locale has no grammar info, or no unit has that gender
     */
    public synchronized String getBestUnitWithGender(String gender, Output<String> shortUnitId) {
        if (gender == null || grammarInfo == null) {
            return null;
        }
        if (caseAndGenderSamples == null) {
            caseAndGenderSamples = loadCaches();
        }
        return caseAndGenderSamples.getGender(gender, shortUnitId);
    }

    /**
     * Returns a "good" value for a unit in the given case. Favors metric units, and simple units.
     *
     * @param unitCase the grammatical case wanted
     * @param shortUnitId receives the short ID of the unit the sample comes from
     * @return the sample, or null if the case is null, the locale has no grammar info, or there is no sample for the case
     */
    public synchronized String getBestUnitWithCase(String unitCase, Output<String> shortUnitId) {
        if (unitCase == null || grammarInfo == null) {
            return null;
        }
        if (caseAndGenderSamples == null) {
            caseAndGenderSamples = loadCaches();
        }
        return caseAndGenderSamples.getCase(unitCase, shortUnitId);
    }

    /** Units left out of the case-form-count statistics. */
    static final Set<String> SKIP_CASE = ImmutableSet.of(
        "concentr-ofglucose",
        "concentr-portion",
        "length-100-kilometer",
        "pressure-ofhg");

    /** Builds the XPath prefix of a long-form unit element. */
    private static String unitPath(String longUnitId) {
        return "//ldml/units/unitLength[@type=\"long\"]/unit[@type=\"" + longUnitId + "\"]";
    }

    /**
     * Scans all units that take grammar, picks the best sample unit for each gender and the unit
     * with the most distinct case/count patterns for the case samples, and (re)publishes the
     * statistics multimaps as immutable copies.
     *
     * <p>The statistics are accumulated in local collections and only assigned to the fields at
     * the end, so the method can safely be called more than once.
     *
     * @return the computed samples; empty samples if the locale has no grammar info
     */
    public CaseAndGenderSamples loadCaches() {
        if (grammarInfo == null) {
            // Nothing to compute for this locale; the getBest* entry points guard against this too.
            return new CaseAndGenderSamples(
                Collections.<String, String>emptyMap(),
                null,
                Collections.<String, Pair<String, String>>emptyMap());
        }
        Collection<String> unitCases = grammarInfo.get(GrammaticalTarget.nominal, GrammaticalFeature.grammaticalCase, GrammaticalScope.units);
        Map<String, String> genderResults = Maps.newHashMap();
        Multimap<String, Pair<String, String>> unitPatternToCaseAndCounts = TreeMultimap.create();

        // Local accumulators: the fields hold immutable copies after a previous load.
        Multimap<String, String> genderStats = TreeMultimap.create();
        Multimap<Integer, String> caseFormCountStats = TreeMultimap.create();
        Multimap<String, String> distinctCasesToUnit = TreeMultimap.create();

        int bestCaseFormCount = 0;
        String bestCaseUnitId = null;
        Multimap<String, Pair<String, String>> bestUnitPatternToCases = null;
        Multimap<String, String> unitToDistinctNominativeCase = TreeMultimap.create();

        for (String longUnitId : GrammarInfo.getUnitsToAddGrammar()) {
            String possibleGender = cldrFile.getStringValue(unitPath(longUnitId) + "/gender");
            String shortUnitId = ExampleGenerator.UNIT_CONVERTER.getShortId(longUnitId);
            if (possibleGender != null) {
                if (gatherStats) {
                    genderStats.put(possibleGender, shortUnitId);
                }
                String formerLongUnitId = genderResults.get(possibleGender);
                if (formerLongUnitId == null || isBetterUnit(longUnitId, formerLongUnitId)) {
                    genderResults.put(possibleGender, longUnitId);
                }
            }
            if (unitCases.isEmpty()) {
                continue;
            }

            // Collect every pattern of this unit, keyed by pattern, with the (case, count) pairs that use it.
            unitPatternToCaseAndCounts.clear();
            for (String count : pluralInfo.getKeywords()) {
                for (String unitCase : unitCases) {
                    String grammarAttributes = GrammarInfo.getGrammaticalInfoAttributes(grammarInfo, UnitPathType.unit, count, null, unitCase);
                    String unitPattern = cldrFile.getStringValue(unitPath(longUnitId) + "/unitPattern" + grammarAttributes);
                    if (unitPattern == null) {
                        continue;
                    }
                    unitPattern = unitPattern.replace("\u00A0", "").trim();
                    unitPatternToCaseAndCounts.put(unitPattern, Pair.of(unitCase, count));
                }
            }
            int caseFormCount = unitPatternToCaseAndCounts.keySet().size();

            // Record each non-nominative case whose pattern differs from the nominative one for the same count.
            boolean alwaysSameAsNominative = true;
            TreeMultimap<Pair<String, String>, String> caseAndCountToPattern = Multimaps.invertFrom(unitPatternToCaseAndCounts, TreeMultimap.create());
            for (Entry<Pair<String, String>, String> entry : caseAndCountToPattern.entries()) {
                Pair<String, String> caseAndCount = entry.getKey();
                String gCase = caseAndCount.getFirst();
                if (gCase.equals("nominative")) {
                    continue;
                }
                Pair<String, String> nomPair = Pair.of("nominative", caseAndCount.getSecond());
                NavigableSet<String> nomPatterns = caseAndCountToPattern.get(nomPair);
                if (!nomPatterns.contains(entry.getValue())) {
                    unitToDistinctNominativeCase.put(shortUnitId, gCase);
                    alwaysSameAsNominative = false;
                }
            }
            // Only this unit's entry can have changed in this iteration, so only it needs to be (re)recorded.
            Collection<String> distinctForUnit = unitToDistinctNominativeCase.get(shortUnitId);
            if (!distinctForUnit.isEmpty()) {
                distinctCasesToUnit.put(PLUS_JOINER.join(distinctForUnit), shortUnitId);
            }
            if (alwaysSameAsNominative) {
                distinctCasesToUnit.put(EQUALS_NOMINATIVE, shortUnitId);
            }

            if (gatherStats && !SKIP_CASE.contains(longUnitId)) {
                caseFormCountStats.put(caseFormCount, shortUnitId);
            }

            // For case, we should do something fancier, but for now we pick the units with the
            // largest number of distinct forms; ties are broken by isBetterUnit.
            int diff = caseFormCount - bestCaseFormCount;
            if (diff > 0 || (diff == 0 && isBetterUnit(longUnitId, bestCaseUnitId))) {
                bestCaseFormCount = caseFormCount;
                bestCaseUnitId = longUnitId;
                bestUnitPatternToCases = TreeMultimap.create(unitPatternToCaseAndCounts);
            }
        }
        // Fill the case cache with the most distinctive forms.
        Map<String, String> caseCache = getBestCasePatterns(bestUnitPatternToCases);

        // Make the gender cache be translated units as well as unit IDs
        Count count = pluralInfo.getKeywords().contains("one") ? Count.one : Count.other;
        Map<String, Pair<String, String>> genderSamples = Maps.newHashMap();

        for (Entry<String, String> entry : genderResults.entrySet()) {
            String longUnitId = entry.getValue();
            String unitPattern = cldrFile.getStringValue(unitPath(longUnitId) + "/unitPattern[@count=\"" + count + "\"]");
            if (unitPattern == null) {
                // No pattern for the chosen count: leave this gender without a sample rather than fail.
                continue;
            }
            unitPattern = unitPattern.replace("{0}", "").replace("\u00A0", "").trim();
            genderSamples.put(entry.getKey(), Pair.of(ExampleGenerator.UNIT_CONVERTER.getShortId(longUnitId), unitPattern));
        }
        Map<String, Pair<String, String>> genderCache = ImmutableMap.copyOf(genderSamples);
        CaseAndGenderSamples result = new CaseAndGenderSamples(caseCache, ExampleGenerator.UNIT_CONVERTER.getShortId(bestCaseUnitId), genderCache);

        // it doesn't matter if we reset these due to multiple threads
        genderToUnits = ImmutableMultimap.copyOf(genderStats);
        uniqueCaseAndCountToUnits = ImmutableMultimap.copyOf(caseFormCountStats);
        distinctNominativeCaseToUnit = ImmutableMultimap.copyOf(distinctCasesToUnit);
        return result;
    }

    /**
     * Gets, for each case, the pattern that is most unique to it, with the placeholder replaced
     * by a numeric sample for the pattern's plural category.
     *
     * <p>Repeatedly takes the pattern shared by the fewest (case, count) pairs, assigns it to the
     * case of its first pair, then drops every remaining entry for that case.
     *
     * @param bestUnitPatternToCases pattern to (case, count) pairs for the chosen unit; may be null
     * @return map from case to sample; empty if the input is null or empty
     */
    private Map<String, String> getBestCasePatterns(Multimap<String, Pair<String, String>> bestUnitPatternToCases) {
        if (bestUnitPatternToCases == null || bestUnitPatternToCases.isEmpty()) {
            return Collections.emptyMap();
        }
        Map<String, String> result = new TreeMap<>();
        while (true) {
            String bestPattern = getBestPattern(bestUnitPatternToCases);
            Pair<String, String> bestCaseCount = bestUnitPatternToCases.get(bestPattern).iterator().next();
            String bestCase = bestCaseCount.getFirst();
            String bestCount = bestCaseCount.getSecond();
            String sample = getPluralOrOrdinalSample(PluralType.cardinal, bestCount);
            // Without a numeric sample, keep the pattern with its placeholder instead of failing.
            result.put(bestCase, sample == null ? bestPattern : bestPattern.replace("{0}", sample));

            TreeMultimap<Pair<String, String>, String> caseToPatterns = Multimaps.invertFrom(bestUnitPatternToCases, TreeMultimap.create());
            for (String count : pluralInfo.getKeywords()) {
                caseToPatterns.removeAll(Pair.of(bestCase, count));
            }
            if (caseToPatterns.keySet().isEmpty()) {
                return result;
            }
            bestUnitPatternToCases = Multimaps.invertFrom(caseToPatterns, TreeMultimap.create());
        }
    }

    /**
     * Returns the pattern associated with the fewest (case, count) pairs; the first such pattern
     * in the multimap's iteration order wins ties.
     *
     * @return the pattern, or null if the multimap is empty
     */
    private String getBestPattern(Multimap<String, Pair<String, String>> bestUnitPatternToCases) {
        int smallestSize = Integer.MAX_VALUE;
        String bestPattern = null;
        for (Entry<String, Collection<Pair<String, String>>> entry : bestUnitPatternToCases.asMap().entrySet()) {
            int size = entry.getValue().size();
            if (size < smallestSize) {
                smallestSize = size;
                bestPattern = entry.getKey();
            }
        }
        return bestPattern;
    }

    /**
     * Tells whether a unit is strictly better than another as a sample. Lower system weight
     * (metric) wins, then lower category weight, then alphabetical order of the long ID.
     *
     * @param longUnitId the candidate unit
     * @param formerLongUnitId the unit currently chosen; null means there is none yet
     * @return true if the candidate should replace the former unit
     */
    public boolean isBetterUnit(String longUnitId, String formerLongUnitId) {
        if (formerLongUnitId == null) {
            return true; // anything beats "nothing chosen yet"
        }
        int diff = Integer.compare(systemWeight(longUnitId), systemWeight(formerLongUnitId));
        if (diff != 0) {
            return diff < 0;
        }
        diff = Integer.compare(categoryWeight(longUnitId), categoryWeight(formerLongUnitId));
        if (diff != 0) {
            return diff < 0;
        }
        return longUnitId.compareTo(formerLongUnitId) < 0;
    }

    /** Units that are given the worse system weight regardless of their unit systems. */
    static final Set<String> WORSE = ImmutableSet.of("length-100-kilometer", "length-mile-scandinavian");

    /**
     * Weight of the unit's measurement system; better result is smaller.
     *
     * @param longUnitId the long unit ID
     * @return 0 if the unit is metric and not listed in {@link #WORSE}, otherwise 1
     */
    public int systemWeight(String longUnitId) {
        if (WORSE.contains(longUnitId)) {
            return 1;
        }
        Set<UnitSystem> systems = ExampleGenerator.UNIT_CONVERTER.getSystemsEnum(ExampleGenerator.UNIT_CONVERTER.getShortId(longUnitId));
        return systems.contains(UnitSystem.metric) ? 0 : 1;
    }

    /**
     * Weight of the unit's category, taken from the prefix of the long unit ID; better result is smaller.
     */
    private int categoryWeight(String longUnitId) {
        if (longUnitId.startsWith("length")) {
            return 0;
        } else if (longUnitId.startsWith("weight")) {
            return 1;
        } else if (longUnitId.startsWith("duration")) {
            return 2;
        }
        return 999;
    }

    /**
     * Returns a formatted numeric sample for a plural or ordinal category.
     *
     * @param pluralType cardinal selects the plural rules; anything else selects the ordinal rules
     * @param code the plural category keyword
     * @return the sample, formatted with the ICUServiceBuilder if one was supplied, or null if
     *     the rules have no samples for the keyword
     */
    public String getPluralOrOrdinalSample(PluralType pluralType, String code) {
        PluralRules rules = pluralType == PluralType.cardinal ? pluralInfo : ordinalInfo;
        DecimalQuantitySamples samples = rules.getDecimalSamples(code, SampleType.INTEGER);
        if (samples == null) {
            samples = rules.getDecimalSamples(code, SampleType.DECIMAL);
        }
        if (samples == null) {
            return null;
        }

        // get good sample. Avoid zero if possible
        DecimalQuantity sample = null;
        for (DecimalQuantitySamplesRange sampleRange : samples.getSamples()) {
            sample = sampleRange.start;
            if (sample.toDouble() != 0d) {
                break;
            }
        }
        if (sample == null) {
            return null; // the sample set had no ranges
        }
        if (icuServiceBuilder == null) {
            return sample.toString();
        }

        int visibleDigits = (int) sample.getPluralOperand(Operand.v);
        DecimalFormat nf;
        if (visibleDigits == 0) {
            nf = icuServiceBuilder.getNumberFormat(0); // 0 is integer, 1 is decimal
        } else {
            nf = icuServiceBuilder.getNumberFormat(1); // 0 is integer, 1 is decimal
            int minFracDigits = nf.getMinimumFractionDigits();
            int maxFracDigits = nf.getMaximumFractionDigits();
            if (minFracDigits != visibleDigits || maxFracDigits != visibleDigits) {
                // Work on a clone so the format obtained from the builder is left unchanged.
                nf = (DecimalFormat) nf.clone();
                nf.setMaximumFractionDigits(visibleDigits);
                nf.setMinimumFractionDigits(visibleDigits);
            }
        }
        return nf.format(sample.toBigDecimal());
    }

    /**
     * Gets the best value to show, plus the shortUnitId if relevant (case/gender).
     *
     * @param header one of "Case", "Gender", "Ordinal", "Plural"; any other value yields "X"
     * @param code the case, gender, or plural/ordinal category
     * @param shortUnitId receives the unit ID for case/gender, or "n/a" for plural/ordinal
     * @return the sample, or "X" if there is none
     */
    public String getBestValue(String header, String code, Output<String> shortUnitId) {
        String result = null;
        switch (header) {
        case "Case":
            result = getBestUnitWithCase(code, shortUnitId);
            break;
        case "Gender":
            result = getBestUnitWithGender(code, shortUnitId);
            break;
        case "Ordinal":
            result = getPluralOrOrdinalSample(PluralType.ordinal, code);
            shortUnitId.value = "n/a";
            break;
        case "Plural":
            result = getPluralOrOrdinalSample(PluralType.cardinal, code);
            shortUnitId.value = "n/a";
            break;
        default:
            break;
        }
        return result == null ? "X" : result;
    }

    /** @return gender to short unit IDs; only filled by {@link #loadCaches()} when stats are gathered */
    public Multimap<String, String> getGenderToUnits() {
        return genderToUnits;
    }

    /** @return number of distinct case/count patterns to short unit IDs; only filled by {@link #loadCaches()} when stats are gathered */
    public Multimap<Integer, String> getUniqueCaseAndCountToUnits() {
        return uniqueCaseAndCountToUnits;
    }

    /**
     * @return "+"-joined cases that differ from nominative (or {@link #EQUALS_NOMINATIVE}) to
     *     short unit IDs; null until {@link #loadCaches()} has run
     */
    public Multimap<String, String> getDistinctNominativeCaseToUnit() {
        return distinctNominativeCaseToUnit;
    }
}