1 package org.unicode.cldr.test; 2 3 import java.util.LinkedHashSet; 4 import java.util.List; 5 import java.util.Map.Entry; 6 import java.util.Set; 7 import java.util.concurrent.ExecutionException; 8 import java.util.regex.Matcher; 9 import java.util.regex.Pattern; 10 11 import org.unicode.cldr.test.CheckCLDR.CheckStatus.Subtype; 12 import org.unicode.cldr.util.ApproximateWidth; 13 import org.unicode.cldr.util.CLDRFile; 14 import org.unicode.cldr.util.Level; 15 import org.unicode.cldr.util.PatternCache; 16 import org.unicode.cldr.util.Rational; 17 import org.unicode.cldr.util.RegexLookup; 18 import org.unicode.cldr.util.StandardCodes.LstrType; 19 import org.unicode.cldr.util.SupplementalDataInfo; 20 import org.unicode.cldr.util.UnitConverter; 21 import org.unicode.cldr.util.UnitConverter.UnitId; 22 import org.unicode.cldr.util.Validity; 23 24 import com.google.common.cache.CacheBuilder; 25 import com.google.common.cache.CacheLoader; 26 import com.google.common.cache.LoadingCache; 27 import com.ibm.icu.util.ICUException; 28 import com.ibm.icu.util.Output; 29 30 public class CheckWidths extends CheckCLDR { 31 // remember to add this class to the list in CheckCLDR.getCheckAll 32 // to run just this test, on just locales starting with 'nl', use CheckCLDR with -fnl.* -t.*CheckWidths.* 33 private static CoverageLevel2 coverageLevel; 34 private Level requiredLevel; 35 36 private static UnitWidthUtil UNIT_WIDTHS_UTIL = UnitWidthUtil.getInstance(); 37 38 /** 39 * Controls for the warning about too many components, and for when to cause error. 40 */ 41 public static final int WARN_COMPONENTS_PER_ANNOTATION = 7; 42 public static final int MAX_COMPONENTS_PER_ANNOTATION = 16; 43 44 SupplementalDataInfo supplementalData; 45 46 private static final double EM = ApproximateWidth.getWidth("月"); 47 48 private static final boolean DEBUG = true; 49 50 private enum Measure { 51 CODE_POINTS, DISPLAY_WIDTH, SET_ELEMENTS 52 } 53 54 private enum LimitType { 55 MINIMUM, MAXIMUM 56 } 57 58 private enum Special { 59 NONE, QUOTES, PLACEHOLDERS, NUMBERSYMBOLS, NUMBERFORMAT, BARS, PLACEHOLDER_UNITS 60 } 61 62 private static final Pattern PLACEHOLDER_PATTERN = PatternCache.get("\\{\\d\\}"); 63 64 private static class Limit { 65 final double warningReference; 66 final double errorReference; 67 final LimitType limit; 68 final Measure measure; 69 final Special special; 70 final String message; 71 final Subtype subtype; 72 final boolean debug; 73 Limit(double warningReference, double errorReference, Measure measure, LimitType limit, Special special, boolean debug)74 public Limit(double warningReference, double errorReference, Measure measure, LimitType limit, Special special, boolean debug) { 75 this.debug = debug; 76 this.warningReference = warningReference; 77 this.errorReference = errorReference; 78 this.limit = limit; 79 this.measure = measure; 80 this.special = special; 81 switch (limit) { 82 case MINIMUM: 83 this.subtype = Subtype.valueTooNarrow; 84 switch (measure) { 85 case CODE_POINTS: 86 this.message = "Expected no fewer than {0} character(s), but was {1}."; 87 break; 88 case DISPLAY_WIDTH: 89 this.message = "Too narrow by about {2}% (with common fonts)."; 90 break; 91 default: 92 throw new IllegalArgumentException(); 93 } 94 break; 95 case MAXIMUM: 96 switch (measure) { 97 case CODE_POINTS: 98 this.message = "Expected no more than {0} character(s), but was {1}."; 99 this.subtype = Subtype.valueTooWide; 100 break; 101 case DISPLAY_WIDTH: 102 this.message = "Too wide by about {2}% (with common fonts)."; 103 this.subtype = Subtype.valueTooWide; 104 break; 105 case SET_ELEMENTS: 106 this.message = "Expected no more than {0} items(s), but was {1}."; 107 this.subtype = Subtype.tooManyValues; 108 break; 109 default: 110 throw new IllegalArgumentException(); 111 } 112 break; 113 default: 114 throw new IllegalArgumentException(); 115 } 116 } 117 Limit(double d, double e, Measure displayWidth, LimitType maximum, Special placeholders)118 public Limit(double d, double e, Measure displayWidth, LimitType maximum, Special placeholders) { 119 this(d, e, displayWidth, maximum, placeholders, false); 120 } 121 hasProblem(String path, String value, List<CheckStatus> result, CheckCLDR cause, Boolean aliasedAndComprehensive)122 boolean hasProblem(String path, String value, List<CheckStatus> result, CheckCLDR cause, Boolean aliasedAndComprehensive) { 123 double factor = 1d; 124 switch (special) { 125 case NUMBERFORMAT: 126 String[] values = value.split(";", 2); 127 // If it's a number format with positive and negative subpatterns, just check the longer one. 128 value = (values.length == 2 && values[1].length() > values[0].length()) ? values[1] : values[0]; 129 value = value.replace("'", ""); 130 break; 131 case QUOTES: 132 value = value.replace("'", ""); 133 break; 134 case PLACEHOLDER_UNITS: 135 factor = UNIT_WIDTHS_UTIL.getRoughComponentMax(path); 136 // fall through ok 137 case PLACEHOLDERS: 138 value = PLACEHOLDER_PATTERN.matcher(value).replaceAll(""); 139 break; 140 case NUMBERSYMBOLS: 141 value = value.replaceAll("[\u200E\u200F\u061C]", ""); // don't include LRM/RLM/ALM when checking length of number symbols 142 break; 143 case BARS: 144 value = value.replaceAll("[^|]", "")+"|"; // Check the number of items by counting separators. Bit of a hack... 145 break; 146 default: 147 } 148 double valueMeasure = measure == Measure.DISPLAY_WIDTH ? ApproximateWidth.getWidth(value) 149 : value.codePointCount(0, value.length()) ; 150 CheckStatus.Type errorType = CheckStatus.warningType; 151 switch (limit) { 152 case MINIMUM: 153 if (valueMeasure >= warningReference) { 154 return false; 155 } 156 if (valueMeasure < errorReference 157 && cause.getPhase() != Phase.BUILD 158 && !aliasedAndComprehensive) { 159 errorType = CheckStatus.errorType; 160 } 161 break; 162 case MAXIMUM: 163 if (valueMeasure <= warningReference * factor) { 164 return false; 165 } 166 if (valueMeasure > errorReference * factor 167 && cause.getPhase() != Phase.BUILD 168 && !aliasedAndComprehensive) { 169 // Workaround for ST submission phase only per TC discussion 2018-05-30 170 // Make too many keywords be only a warning until we decide policy (JCE) 171 if (cause.getPhase() == Phase.SUBMISSION && measure.equals(Measure.SET_ELEMENTS)) { 172 errorType = CheckStatus.warningType; 173 } else { 174 errorType = CheckStatus.errorType; 175 } 176 } 177 break; 178 } 179 // the 115 is so that we don't show small percentages 180 // the /10 ...*10 is to round to multiples of 10% percent 181 double percent = (int) (Math.abs(115 * valueMeasure / warningReference - 100.0d) / 10 + 0.49999d) * 10; 182 result.add(new CheckStatus().setCause(cause) 183 .setMainType(errorType) 184 .setSubtype(subtype) 185 .setMessage(message, warningReference, valueMeasure, percent)); 186 return true; 187 } 188 } 189 190 static RegexLookup<Limit[]> lookup = new RegexLookup<Limit[]>() 191 .setPatternTransform(RegexLookup.RegexFinderTransformPath) 192 .addVariable("%A", "\"[^\"]+\"") 193 .addVariable("%P", "\"[ap]m\"") 194 .addVariable("%Q", "[^ap].*|[ap][^m].*") // Anything but am or pm 195 .add("//ldml/delimiters/(quotation|alternateQuotation)", new Limit[] { 196 new Limit(1, 1, Measure.CODE_POINTS, LimitType.MAXIMUM, Special.NONE) 197 }) 198 199 // Numeric items should be no more than a single character 200 201 .add("//ldml/numbers/symbols[@numberSystem=%A]/(decimal|group|minus|percent|perMille|plus)", new Limit[] { 202 new Limit(1, 1, Measure.CODE_POINTS, LimitType.MAXIMUM, Special.NUMBERSYMBOLS) 203 }) 204 205 // Now widths 206 // The following are rough measures, just to check strange cases 207 208 .add("//ldml/characters/ellipsis[@type=\"(final|initial|medial)\"]", new Limit[] { 209 new Limit(2 * EM, 5 * EM, Measure.DISPLAY_WIDTH, LimitType.MAXIMUM, Special.PLACEHOLDERS) 210 }) 211 212 .add("//ldml/localeDisplayNames/localeDisplayPattern/", new Limit[] { // {0}: {1}, {0} ({1}), , 213 new Limit(2 * EM, 3 * EM, Measure.DISPLAY_WIDTH, LimitType.MAXIMUM, Special.PLACEHOLDERS) 214 }) 215 216 .add("//ldml/listPatterns/listPattern/listPatternPart[@type=%A]", new Limit[] { // {0} and {1} 217 new Limit(5 * EM, 10 * EM, Measure.DISPLAY_WIDTH, LimitType.MAXIMUM, Special.PLACEHOLDERS) 218 }) 219 220 .add("//ldml/dates/timeZoneNames/fallbackFormat", new Limit[] { // {1} ({0}) 221 new Limit(2 * EM, 3 * EM, Measure.DISPLAY_WIDTH, LimitType.MAXIMUM, Special.PLACEHOLDERS) 222 }) 223 224 .add("//ldml/dates/timeZoneNames/(regionFormat|hourFormat)", new Limit[] { // {0} Time, 225 // +HH:mm;-HH:mm 226 new Limit(10 * EM, 20 * EM, Measure.DISPLAY_WIDTH, LimitType.MAXIMUM, Special.PLACEHOLDERS) 227 }) 228 229 .add("//ldml/dates/timeZoneNames/(gmtFormat|gmtZeroFormat)", new Limit[] { // GMT{0}, GMT 230 new Limit(5 * EM, 10 * EM, Measure.DISPLAY_WIDTH, LimitType.MAXIMUM, Special.PLACEHOLDERS) 231 }) 232 233 // Era Abbreviations 234 235 // Allow longer for Japanese calendar eras 236 .add("//ldml/dates/calendars/calendar[@type=\"japanese\"]/.*/eraAbbr/era[@type=%A]", new Limit[] { 237 new Limit(12 * EM, 16 * EM, Measure.DISPLAY_WIDTH, LimitType.MAXIMUM, Special.NONE) 238 }) 239 // Allow longer for ROC calendar eras 240 .add("//ldml/dates/calendars/calendar[@type=\"roc\"]/.*/eraAbbr/era[@type=%A]", new Limit[] { 241 new Limit(4 * EM, 8 * EM, Measure.DISPLAY_WIDTH, LimitType.MAXIMUM, Special.NONE) 242 }) 243 .add("//ldml/dates/calendars/calendar.*/eraAbbr/era[@type=%A]", new Limit[] { 244 new Limit(3 * EM, 6 * EM, Measure.DISPLAY_WIDTH, LimitType.MAXIMUM, Special.NONE) 245 }) 246 247 // am/pm abbreviated 248 .add("//ldml/dates/calendars/calendar[@type=\"gregorian\"]/dayPeriods/.*/dayPeriodWidth[@type=\"abbreviated\"]/dayPeriod[@type=%P]", new Limit[] { 249 new Limit(4 * EM, 6 * EM, Measure.DISPLAY_WIDTH, LimitType.MAXIMUM, Special.NONE) 250 }) 251 // other day periods abbreviated 252 .add("//ldml/dates/calendars/calendar[@type=\"gregorian\"]/dayPeriods/.*/dayPeriodWidth[@type=\"abbreviated\"]/dayPeriod[@type=%Q]", new Limit[] { 253 new Limit(8 * EM, 12 * EM, Measure.DISPLAY_WIDTH, LimitType.MAXIMUM, Special.NONE) 254 }) 255 // am/pm wide 256 .add("//ldml/dates/calendars/calendar[@type=\"gregorian\"]/dayPeriods/.*/dayPeriodWidth[@type=\"wide\"]/dayPeriod[@type=%P]", new Limit[] { 257 new Limit(5 * EM, 10 * EM, Measure.DISPLAY_WIDTH, LimitType.MAXIMUM, Special.NONE) 258 }) 259 // other day periods wide 260 .add("//ldml/dates/calendars/calendar[@type=\"gregorian\"]/dayPeriods/.*/dayPeriodWidth[@type=\"wide\"]/dayPeriod[@type=%Q]", new Limit[] { 261 new Limit(10 * EM, 20 * EM, Measure.DISPLAY_WIDTH, LimitType.MAXIMUM, Special.NONE) 262 }) 263 264 // Narrow items 265 266 .add("//ldml/dates/calendars/calendar.*[@type=\"narrow\"](?!/cyclic|/dayPeriod|/monthPattern)", new Limit[] { 267 new Limit(1.5 * EM, 2.25 * EM, Measure.DISPLAY_WIDTH, LimitType.MAXIMUM, Special.NONE) 268 }) 269 // \"(?!am|pm)[^\"]+\"\\ 270 271 // Compact number formats 272 // pattern[@type="100000000000000"] 273 .add("//ldml/numbers/decimalFormats[@numberSystem=%A]/decimalFormatLength[@type=\"short\"]/decimalFormat[@type=%A]/pattern[@type=\"100000000000000", 274 new Limit[] { 275 new Limit(4 * EM, 6 * EM, Measure.DISPLAY_WIDTH, LimitType.MAXIMUM, Special.NUMBERFORMAT) 276 }) 277 .add("//ldml/numbers/decimalFormats[@numberSystem=%A]/decimalFormatLength[@type=\"short\"]/decimalFormat[@type=%A]/pattern[@type=\"1", 278 new Limit[] { 279 new Limit(4 * EM, 5 * EM, Measure.DISPLAY_WIDTH, LimitType.MAXIMUM, Special.NUMBERFORMAT) 280 }) 281 282 // Short/Narrow units 283 // Note that the EM values are adjusted for units according to the number of components in the units 284 // See UnitWidthUtil for more information 285 .add("//ldml/units/unitLength[@type=\"(short|narrow)\"]/unit[@type=%A]/unitPattern", new Limit[] { 286 new Limit(3 * EM, 5 * EM, Measure.DISPLAY_WIDTH, LimitType.MAXIMUM, Special.PLACEHOLDER_UNITS) 287 }) 288 289 // Currency Symbols 290 .add("//ldml/numbers/currencies/currency[@type=%A]/symbol", new Limit[] { 291 new Limit(3 * EM, 5 * EM, Measure.DISPLAY_WIDTH, LimitType.MAXIMUM, Special.PLACEHOLDERS) 292 }) 293 294 // "grinning cat face with smiling eyes" should be normal max ~= 160 em 295 // emoji names (not keywords) 296 .add("//ldml/annotations/annotation[@cp=%A][@type=%A]", new Limit[] { 297 new Limit(20 * EM, 100 * EM, Measure.DISPLAY_WIDTH, LimitType.MAXIMUM, Special.NONE), 298 }) 299 .add("//ldml/annotations/annotation[@cp=%A]", new Limit[] { 300 new Limit(WARN_COMPONENTS_PER_ANNOTATION, MAX_COMPONENTS_PER_ANNOTATION, Measure.SET_ELEMENTS, LimitType.MAXIMUM, Special.BARS) // Allow up to 5 with no warning, up to 7 with no error. 301 }) 302 ; 303 304 // Quell noisy printout 305 // static { 306 // System.out.println("EMs: " + ApproximateWidth.getWidth("grinning cat face with smiling eyes")); 307 // } 308 309 Set<Limit> found = new LinkedHashSet<>(); 310 311 @Override handleCheck(String path, String fullPath, String value, Options options, List<CheckStatus> result)312 public CheckCLDR handleCheck(String path, String fullPath, String value, Options options, List<CheckStatus> result) { 313 if (value == null) { 314 return this; // skip 315 } 316 // String testPrefix = "//ldml/units/unitLength[@type=\"narrow\"]"; 317 // if (path.startsWith(testPrefix)) { 318 // int i = 0; 319 // } 320 // Limits item0 = 321 // lookup.get("//ldml/numbers/decimalFormats[@numberSystem=\"latn\"]/decimalFormatLength[@type=\"short\"]/decimalFormat[@type=\"standard\"]/pattern[@type=\"1000000000\"][@count=\"other\"]"); 322 // item0.check("123456789", result, this); 323 324 Limit[] items = lookup.get(path); 325 if (items != null) { 326 CLDRFile.Status status = new CLDRFile.Status(); 327 this.getCldrFileToCheck().getSourceLocaleID(path, status); 328 // This was put in specifically to deal with the fact that we added a bunch of new units in CLDR 26 329 // and didn't put the narrow forms of them into modern coverage. If/when the narrow forms of all units 330 // are modern coverage, then we can safely remove the aliasedAndComprehensive check. Right now if an 331 // item is aliased and coverage is comprehensive, then it can't generate anything worse than a warning. 332 Boolean aliasedAndComprehensive = (coverageLevel.getLevel(path).compareTo(Level.COMPREHENSIVE) == 0) 333 && (status.pathWhereFound.compareTo(path) != 0); 334 for (Limit item : items) { 335 if (item.hasProblem(path, value, result, this, aliasedAndComprehensive)) { 336 if (DEBUG && !found.contains(item)) { 337 found.add(item); 338 } 339 break; // only one error per item 340 } 341 } 342 } 343 return this; 344 } 345 346 @Override setCldrFileToCheck(CLDRFile cldrFileToCheck, Options options, List<CheckStatus> possibleErrors)347 public CheckCLDR setCldrFileToCheck(CLDRFile cldrFileToCheck, Options options, 348 List<CheckStatus> possibleErrors) { 349 final String localeID = cldrFileToCheck.getLocaleID(); 350 supplementalData = SupplementalDataInfo.getInstance(cldrFileToCheck.getSupplementalDirectory()); 351 coverageLevel = CoverageLevel2.getInstance(supplementalData, localeID); 352 353 super.setCldrFileToCheck(cldrFileToCheck, options, possibleErrors); 354 return this; 355 } 356 357 /** 358 * Provide a rough measure of how many unit components there are for the purpose of establishing a maximum width, with an special factor for non-metric. 359 */ 360 public static class UnitWidthUtil { 361 static final Pattern UNIT_PREFIX = Pattern.compile("//ldml/units/unitLength\\[@type=\"([^\"]*)\"]/unit\\[@type=\"([^\\\"]*)\"]"); 362 final UnitConverter CONVERTER = SupplementalDataInfo.getInstance().getUnitConverter(); 363 final Set<String> validLongUnitIDs = Validity.getInstance().getCodeToStatus(LstrType.unit).keySet(); 364 365 LoadingCache<String, Double> pathToUnitComponents = CacheBuilder.newBuilder().build( 366 new CacheLoader<String, Double>() { 367 @Override 368 public Double load(String path) throws ExecutionException { 369 final Matcher matcher = UNIT_PREFIX.matcher(path); 370 if (matcher.lookingAt()) { 371 //String length = matcher.group(1); 372 String longUnitId = matcher.group(2); 373 return unitToComponents.get(longUnitId); 374 } else { 375 throw new ICUException("Internal error"); 376 } 377 } 378 }); 379 380 LoadingCache<String, Double> unitToComponents = CacheBuilder.newBuilder().build(new CacheLoader<String, Double>() { 381 @Override 382 public Double load(String longUnitId) { 383 double components = 0; 384 String shortId = CONVERTER.getShortId(longUnitId); 385 386 Set<String> systems = CONVERTER.getSystems(shortId); 387 int widthFactor = systems.contains("metric") && !shortId.endsWith("-metric") ? 1 : 3; 388 // NOTE: allow cup-metric and pint-metric to be longer, since they aren't standard metric 389 390 // walk thorough the numerator and denominator to get the values 391 UnitId unitId = CONVERTER.createUnitId(shortId); 392 for (Entry<String, Integer> entry : unitId.numUnitsToPowers.entrySet()) { 393 components += getComponentCount(entry.getKey(), entry.getValue()); 394 } 395 for (Entry<String, Integer> entry : unitId.denUnitsToPowers.entrySet()) { 396 components += getComponentCount(entry.getKey(), entry.getValue()); 397 } 398 return widthFactor * components; 399 } 400 401 public double getComponentCount(String unit, Integer power) { 402 int result = 1; 403 if (power > 1) { 404 ++result; // add one component for a power 405 } 406 // hack for number 407 if (unit.startsWith("100-")) { 408 ++result; 409 unit = unit.substring(4); 410 } 411 Output<Rational> deprefix = new Output<>(); 412 unit = UnitConverter.stripPrefix(unit, deprefix); 413 if (!deprefix.value.equals(Rational.ONE)) { 414 ++result; // add 1 component for kilo, mega, etc. 415 } 416 for (int i = 0; i < unit.length(); ++i) { 417 if (unit.charAt(i) == '-') { 418 ++result; // add one component for -imperial, etc. 419 } 420 } 421 return result; 422 } 423 }); 424 UnitWidthUtil()425 private UnitWidthUtil() { } 426 getInstance()427 public static UnitWidthUtil getInstance() { 428 return new UnitWidthUtil(); 429 } 430 getRoughComponentMax(String path)431 public double getRoughComponentMax(String path) { 432 try { 433 return pathToUnitComponents.get(path); 434 } catch (ExecutionException e) { 435 throw new ICUException(e); 436 } 437 } 438 } 439 } 440