• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 package org.unicode.cldr.test;
2 
3 import java.util.LinkedHashSet;
4 import java.util.List;
5 import java.util.Map.Entry;
6 import java.util.Set;
7 import java.util.concurrent.ExecutionException;
8 import java.util.regex.Matcher;
9 import java.util.regex.Pattern;
10 
11 import org.unicode.cldr.test.CheckCLDR.CheckStatus.Subtype;
12 import org.unicode.cldr.util.ApproximateWidth;
13 import org.unicode.cldr.util.CLDRFile;
14 import org.unicode.cldr.util.Level;
15 import org.unicode.cldr.util.PatternCache;
16 import org.unicode.cldr.util.Rational;
17 import org.unicode.cldr.util.RegexLookup;
18 import org.unicode.cldr.util.StandardCodes.LstrType;
19 import org.unicode.cldr.util.SupplementalDataInfo;
20 import org.unicode.cldr.util.UnitConverter;
21 import org.unicode.cldr.util.UnitConverter.UnitId;
22 import org.unicode.cldr.util.Validity;
23 
24 import com.google.common.cache.CacheBuilder;
25 import com.google.common.cache.CacheLoader;
26 import com.google.common.cache.LoadingCache;
27 import com.ibm.icu.util.ICUException;
28 import com.ibm.icu.util.Output;
29 
30 public class CheckWidths extends CheckCLDR {
31     // remember to add this class to the list in CheckCLDR.getCheckAll
32     // to run just this test, on just locales starting with 'nl', use CheckCLDR with -fnl.* -t.*CheckWidths.*
33     private static CoverageLevel2 coverageLevel;
34     private Level requiredLevel;
35 
36     private static UnitWidthUtil UNIT_WIDTHS_UTIL = UnitWidthUtil.getInstance();
37 
38     /**
39      * Controls for the warning about too many components, and for when to cause error.
40      */
41     public static final int WARN_COMPONENTS_PER_ANNOTATION = 7;
42     public static final int MAX_COMPONENTS_PER_ANNOTATION = 16;
43 
44     SupplementalDataInfo supplementalData;
45 
46     private static final double EM = ApproximateWidth.getWidth("月");
47 
48     private static final boolean DEBUG = true;
49 
50     private enum Measure {
51         CODE_POINTS, DISPLAY_WIDTH, SET_ELEMENTS
52     }
53 
54     private enum LimitType {
55         MINIMUM, MAXIMUM
56     }
57 
58     private enum Special {
59         NONE, QUOTES, PLACEHOLDERS, NUMBERSYMBOLS, NUMBERFORMAT, BARS, PLACEHOLDER_UNITS
60     }
61 
62     private static final Pattern PLACEHOLDER_PATTERN = PatternCache.get("\\{\\d\\}");
63 
64     private static class Limit {
65         final double warningReference;
66         final double errorReference;
67         final LimitType limit;
68         final Measure measure;
69         final Special special;
70         final String message;
71         final Subtype subtype;
72         final boolean debug;
73 
Limit(double warningReference, double errorReference, Measure measure, LimitType limit, Special special, boolean debug)74         public Limit(double warningReference, double errorReference, Measure measure, LimitType limit, Special special, boolean debug) {
75             this.debug = debug;
76             this.warningReference = warningReference;
77             this.errorReference = errorReference;
78             this.limit = limit;
79             this.measure = measure;
80             this.special = special;
81             switch (limit) {
82             case MINIMUM:
83                 this.subtype = Subtype.valueTooNarrow;
84                 switch (measure) {
85                 case CODE_POINTS:
86                     this.message = "Expected no fewer than {0} character(s), but was {1}.";
87                     break;
88                 case DISPLAY_WIDTH:
89                     this.message = "Too narrow by about {2}% (with common fonts).";
90                     break;
91                 default:
92                     throw new IllegalArgumentException();
93                 }
94                 break;
95             case MAXIMUM:
96                 switch (measure) {
97                 case CODE_POINTS:
98                     this.message = "Expected no more than {0} character(s), but was {1}.";
99                     this.subtype = Subtype.valueTooWide;
100                     break;
101                 case DISPLAY_WIDTH:
102                     this.message = "Too wide by about {2}% (with common fonts).";
103                     this.subtype = Subtype.valueTooWide;
104                     break;
105                 case SET_ELEMENTS:
106                     this.message = "Expected no more than {0} items(s), but was {1}.";
107                     this.subtype = Subtype.tooManyValues;
108                     break;
109                 default:
110                     throw new IllegalArgumentException();
111                 }
112                 break;
113             default:
114                 throw new IllegalArgumentException();
115             }
116         }
117 
Limit(double d, double e, Measure displayWidth, LimitType maximum, Special placeholders)118         public Limit(double d, double e, Measure displayWidth, LimitType maximum, Special placeholders) {
119             this(d, e, displayWidth, maximum, placeholders, false);
120         }
121 
hasProblem(String path, String value, List<CheckStatus> result, CheckCLDR cause, Boolean aliasedAndComprehensive)122         boolean hasProblem(String path, String value, List<CheckStatus> result, CheckCLDR cause, Boolean aliasedAndComprehensive) {
123             double factor = 1d;
124             switch (special) {
125             case NUMBERFORMAT:
126                 String[] values = value.split(";", 2);
127                 // If it's a number format with positive and negative subpatterns, just check the longer one.
128                 value = (values.length == 2 && values[1].length() > values[0].length()) ? values[1] : values[0];
129                 value = value.replace("'", "");
130                 break;
131             case QUOTES:
132                 value = value.replace("'", "");
133                 break;
134             case PLACEHOLDER_UNITS:
135                 factor = UNIT_WIDTHS_UTIL.getRoughComponentMax(path);
136                 // fall through ok
137             case PLACEHOLDERS:
138                 value = PLACEHOLDER_PATTERN.matcher(value).replaceAll("");
139                 break;
140             case NUMBERSYMBOLS:
141                 value = value.replaceAll("[\u200E\u200F\u061C]", ""); // don't include LRM/RLM/ALM when checking length of number symbols
142                 break;
143             case BARS:
144                 value = value.replaceAll("[^|]", "")+"|"; // Check the number of items by counting separators. Bit of a hack...
145                 break;
146             default:
147             }
148             double valueMeasure = measure == Measure.DISPLAY_WIDTH ? ApproximateWidth.getWidth(value)
149                 : value.codePointCount(0, value.length()) ;
150             CheckStatus.Type errorType = CheckStatus.warningType;
151             switch (limit) {
152             case MINIMUM:
153                 if (valueMeasure >= warningReference) {
154                     return false;
155                 }
156                 if (valueMeasure < errorReference
157                 	&& cause.getPhase() != Phase.BUILD
158                 	&& !aliasedAndComprehensive) {
159                     errorType = CheckStatus.errorType;
160                 }
161                 break;
162             case MAXIMUM:
163                 if (valueMeasure <= warningReference * factor) {
164                     return false;
165                 }
166                 if (valueMeasure > errorReference * factor
167                 	&& cause.getPhase() != Phase.BUILD
168                 	&& !aliasedAndComprehensive) {
169                     // Workaround for ST submission phase only per TC discussion 2018-05-30
170                     // Make too many keywords be only a warning until we decide policy (JCE)
171                     if (cause.getPhase() == Phase.SUBMISSION && measure.equals(Measure.SET_ELEMENTS)) {
172                         errorType = CheckStatus.warningType;
173                     } else {
174                         errorType = CheckStatus.errorType;
175                     }
176                 }
177                 break;
178             }
179             // the 115 is so that we don't show small percentages
180             // the /10 ...*10 is to round to multiples of 10% percent
181             double percent = (int) (Math.abs(115 * valueMeasure / warningReference - 100.0d) / 10 + 0.49999d) * 10;
182             result.add(new CheckStatus().setCause(cause)
183                 .setMainType(errorType)
184                 .setSubtype(subtype)
185                 .setMessage(message, warningReference, valueMeasure, percent));
186             return true;
187         }
188     }
189 
190     static RegexLookup<Limit[]> lookup = new RegexLookup<Limit[]>()
191         .setPatternTransform(RegexLookup.RegexFinderTransformPath)
192         .addVariable("%A", "\"[^\"]+\"")
193         .addVariable("%P", "\"[ap]m\"")
194         .addVariable("%Q", "[^ap].*|[ap][^m].*") // Anything but am or pm
195         .add("//ldml/delimiters/(quotation|alternateQuotation)", new Limit[] {
196             new Limit(1, 1, Measure.CODE_POINTS, LimitType.MAXIMUM, Special.NONE)
197         })
198 
199         // Numeric items should be no more than a single character
200 
201         .add("//ldml/numbers/symbols[@numberSystem=%A]/(decimal|group|minus|percent|perMille|plus)", new Limit[] {
202             new Limit(1, 1, Measure.CODE_POINTS, LimitType.MAXIMUM, Special.NUMBERSYMBOLS)
203         })
204 
205         // Now widths
206         // The following are rough measures, just to check strange cases
207 
208         .add("//ldml/characters/ellipsis[@type=\"(final|initial|medial)\"]", new Limit[] {
209             new Limit(2 * EM, 5 * EM, Measure.DISPLAY_WIDTH, LimitType.MAXIMUM, Special.PLACEHOLDERS)
210         })
211 
212         .add("//ldml/localeDisplayNames/localeDisplayPattern/", new Limit[] { // {0}: {1}, {0} ({1}), ,
213             new Limit(2 * EM, 3 * EM, Measure.DISPLAY_WIDTH, LimitType.MAXIMUM, Special.PLACEHOLDERS)
214         })
215 
216         .add("//ldml/listPatterns/listPattern/listPatternPart[@type=%A]", new Limit[] { // {0} and {1}
217             new Limit(5 * EM, 10 * EM, Measure.DISPLAY_WIDTH, LimitType.MAXIMUM, Special.PLACEHOLDERS)
218         })
219 
220         .add("//ldml/dates/timeZoneNames/fallbackFormat", new Limit[] { // {1} ({0})
221             new Limit(2 * EM, 3 * EM, Measure.DISPLAY_WIDTH, LimitType.MAXIMUM, Special.PLACEHOLDERS)
222         })
223 
224         .add("//ldml/dates/timeZoneNames/(regionFormat|hourFormat)", new Limit[] { // {0} Time,
225             // +HH:mm;-HH:mm
226             new Limit(10 * EM, 20 * EM, Measure.DISPLAY_WIDTH, LimitType.MAXIMUM, Special.PLACEHOLDERS)
227         })
228 
229         .add("//ldml/dates/timeZoneNames/(gmtFormat|gmtZeroFormat)", new Limit[] { // GMT{0}, GMT
230             new Limit(5 * EM, 10 * EM, Measure.DISPLAY_WIDTH, LimitType.MAXIMUM, Special.PLACEHOLDERS)
231         })
232 
233         // Era Abbreviations
234 
235         // Allow longer for Japanese calendar eras
236         .add("//ldml/dates/calendars/calendar[@type=\"japanese\"]/.*/eraAbbr/era[@type=%A]", new Limit[] {
237             new Limit(12 * EM, 16 * EM, Measure.DISPLAY_WIDTH, LimitType.MAXIMUM, Special.NONE)
238         })
239         // Allow longer for ROC calendar eras
240         .add("//ldml/dates/calendars/calendar[@type=\"roc\"]/.*/eraAbbr/era[@type=%A]", new Limit[] {
241             new Limit(4 * EM, 8 * EM, Measure.DISPLAY_WIDTH, LimitType.MAXIMUM, Special.NONE)
242         })
243         .add("//ldml/dates/calendars/calendar.*/eraAbbr/era[@type=%A]", new Limit[] {
244             new Limit(3 * EM, 6 * EM, Measure.DISPLAY_WIDTH, LimitType.MAXIMUM, Special.NONE)
245         })
246 
247         // am/pm abbreviated
248         .add("//ldml/dates/calendars/calendar[@type=\"gregorian\"]/dayPeriods/.*/dayPeriodWidth[@type=\"abbreviated\"]/dayPeriod[@type=%P]", new Limit[] {
249             new Limit(4 * EM, 6 * EM, Measure.DISPLAY_WIDTH, LimitType.MAXIMUM, Special.NONE)
250         })
251         // other day periods abbreviated
252         .add("//ldml/dates/calendars/calendar[@type=\"gregorian\"]/dayPeriods/.*/dayPeriodWidth[@type=\"abbreviated\"]/dayPeriod[@type=%Q]", new Limit[] {
253             new Limit(8 * EM, 12 * EM, Measure.DISPLAY_WIDTH, LimitType.MAXIMUM, Special.NONE)
254         })
255         // am/pm wide
256         .add("//ldml/dates/calendars/calendar[@type=\"gregorian\"]/dayPeriods/.*/dayPeriodWidth[@type=\"wide\"]/dayPeriod[@type=%P]", new Limit[] {
257             new Limit(5 * EM, 10 * EM, Measure.DISPLAY_WIDTH, LimitType.MAXIMUM, Special.NONE)
258         })
259         // other day periods wide
260         .add("//ldml/dates/calendars/calendar[@type=\"gregorian\"]/dayPeriods/.*/dayPeriodWidth[@type=\"wide\"]/dayPeriod[@type=%Q]", new Limit[] {
261             new Limit(10 * EM, 20 * EM, Measure.DISPLAY_WIDTH, LimitType.MAXIMUM, Special.NONE)
262         })
263 
264         // Narrow items
265 
266         .add("//ldml/dates/calendars/calendar.*[@type=\"narrow\"](?!/cyclic|/dayPeriod|/monthPattern)", new Limit[] {
267             new Limit(1.5 * EM, 2.25 * EM, Measure.DISPLAY_WIDTH, LimitType.MAXIMUM, Special.NONE)
268         })
269         // \"(?!am|pm)[^\"]+\"\\
270 
271         // Compact number formats
272 // pattern[@type="100000000000000"]
273         .add("//ldml/numbers/decimalFormats[@numberSystem=%A]/decimalFormatLength[@type=\"short\"]/decimalFormat[@type=%A]/pattern[@type=\"100000000000000",
274             new Limit[] {
275                 new Limit(4 * EM, 6 * EM, Measure.DISPLAY_WIDTH, LimitType.MAXIMUM, Special.NUMBERFORMAT)
276         })
277         .add("//ldml/numbers/decimalFormats[@numberSystem=%A]/decimalFormatLength[@type=\"short\"]/decimalFormat[@type=%A]/pattern[@type=\"1",
278             new Limit[] {
279                 new Limit(4 * EM, 5 * EM, Measure.DISPLAY_WIDTH, LimitType.MAXIMUM, Special.NUMBERFORMAT)
280         })
281 
282         // Short/Narrow units
283         // Note that the EM values are adjusted for units according to the number of components in the units
284         // See UnitWidthUtil for more information
285         .add("//ldml/units/unitLength[@type=\"(short|narrow)\"]/unit[@type=%A]/unitPattern", new Limit[] {
286             new Limit(3 * EM, 5 * EM, Measure.DISPLAY_WIDTH, LimitType.MAXIMUM, Special.PLACEHOLDER_UNITS)
287         })
288 
289         // Currency Symbols
290         .add("//ldml/numbers/currencies/currency[@type=%A]/symbol", new Limit[] {
291             new Limit(3 * EM, 5 * EM, Measure.DISPLAY_WIDTH, LimitType.MAXIMUM, Special.PLACEHOLDERS)
292         })
293 
294         // "grinning cat face with smiling eyes" should be normal max ~= 160 em
295         // emoji names (not keywords)
296         .add("//ldml/annotations/annotation[@cp=%A][@type=%A]", new Limit[] {
297             new Limit(20 * EM, 100 * EM, Measure.DISPLAY_WIDTH, LimitType.MAXIMUM, Special.NONE),
298         })
299         .add("//ldml/annotations/annotation[@cp=%A]", new Limit[] {
300             new Limit(WARN_COMPONENTS_PER_ANNOTATION, MAX_COMPONENTS_PER_ANNOTATION, Measure.SET_ELEMENTS, LimitType.MAXIMUM, Special.BARS) // Allow up to 5 with no warning, up to 7 with no error.
301         })
302         ;
303 
304 // Quell noisy printout
305 //    static {
306 //        System.out.println("EMs: " + ApproximateWidth.getWidth("grinning cat face with smiling eyes"));
307 //    }
308 
309     Set<Limit> found = new LinkedHashSet<>();
310 
311     @Override
handleCheck(String path, String fullPath, String value, Options options, List<CheckStatus> result)312     public CheckCLDR handleCheck(String path, String fullPath, String value, Options options, List<CheckStatus> result) {
313         if (value == null) {
314             return this; // skip
315         }
316         //        String testPrefix = "//ldml/units/unitLength[@type=\"narrow\"]";
317         //        if (path.startsWith(testPrefix)) {
318         //            int i = 0;
319         //        }
320         // Limits item0 =
321         // lookup.get("//ldml/numbers/decimalFormats[@numberSystem=\"latn\"]/decimalFormatLength[@type=\"short\"]/decimalFormat[@type=\"standard\"]/pattern[@type=\"1000000000\"][@count=\"other\"]");
322         // item0.check("123456789", result, this);
323 
324         Limit[] items = lookup.get(path);
325         if (items != null) {
326             CLDRFile.Status status = new CLDRFile.Status();
327             this.getCldrFileToCheck().getSourceLocaleID(path, status);
328             // This was put in specifically to deal with the fact that we added a bunch of new units in CLDR 26
329             // and didn't put the narrow forms of them into modern coverage.  If/when the narrow forms of all units
330             // are modern coverage, then we can safely remove the aliasedAndComprehensive check.  Right now if an
331             // item is aliased and coverage is comprehensive, then it can't generate anything worse than a warning.
332             Boolean aliasedAndComprehensive = (coverageLevel.getLevel(path).compareTo(Level.COMPREHENSIVE) == 0)
333             && (status.pathWhereFound.compareTo(path) != 0);
334             for (Limit item : items) {
335                 if (item.hasProblem(path, value, result, this, aliasedAndComprehensive)) {
336                     if (DEBUG && !found.contains(item)) {
337                         found.add(item);
338                     }
339                     break; // only one error per item
340                 }
341             }
342         }
343         return this;
344     }
345 
346     @Override
setCldrFileToCheck(CLDRFile cldrFileToCheck, Options options, List<CheckStatus> possibleErrors)347     public CheckCLDR setCldrFileToCheck(CLDRFile cldrFileToCheck, Options options,
348         List<CheckStatus> possibleErrors) {
349         final String localeID = cldrFileToCheck.getLocaleID();
350         supplementalData = SupplementalDataInfo.getInstance(cldrFileToCheck.getSupplementalDirectory());
351         coverageLevel = CoverageLevel2.getInstance(supplementalData, localeID);
352 
353         super.setCldrFileToCheck(cldrFileToCheck, options, possibleErrors);
354         return this;
355     }
356 
357     /**
358      * Provide a rough measure of how many unit components there are for the purpose of establishing a maximum width, with an special factor for non-metric.
359      */
360     public static class UnitWidthUtil {
361         static final Pattern UNIT_PREFIX = Pattern.compile("//ldml/units/unitLength\\[@type=\"([^\"]*)\"]/unit\\[@type=\"([^\\\"]*)\"]");
362         final UnitConverter CONVERTER = SupplementalDataInfo.getInstance().getUnitConverter();
363         final Set<String> validLongUnitIDs = Validity.getInstance().getCodeToStatus(LstrType.unit).keySet();
364 
365         LoadingCache<String, Double> pathToUnitComponents = CacheBuilder.newBuilder().build(
366             new CacheLoader<String, Double>() {
367             @Override
368             public Double load(String path) throws ExecutionException {
369                 final Matcher matcher = UNIT_PREFIX.matcher(path);
370                 if (matcher.lookingAt()) {
371                     //String length = matcher.group(1);
372                     String longUnitId = matcher.group(2);
373                     return unitToComponents.get(longUnitId);
374                 } else {
375                     throw new ICUException("Internal error");
376                 }
377             }
378         });
379 
380         LoadingCache<String, Double> unitToComponents = CacheBuilder.newBuilder().build(new CacheLoader<String, Double>() {
381             @Override
382             public Double load(String longUnitId) {
383                 double components = 0;
384                 String shortId = CONVERTER.getShortId(longUnitId);
385 
386                 Set<String> systems = CONVERTER.getSystems(shortId);
387                 int widthFactor = systems.contains("metric") &&  !shortId.endsWith("-metric") ? 1 : 3;
388                 // NOTE: allow cup-metric and pint-metric to be longer, since they aren't standard metric
389 
390                 // walk thorough the numerator and denominator to get the values
391                 UnitId unitId = CONVERTER.createUnitId(shortId);
392                 for (Entry<String, Integer> entry : unitId.numUnitsToPowers.entrySet()) {
393                     components += getComponentCount(entry.getKey(), entry.getValue());
394                 }
395                 for (Entry<String, Integer> entry : unitId.denUnitsToPowers.entrySet()) {
396                     components += getComponentCount(entry.getKey(), entry.getValue());
397                 }
398                 return widthFactor * components;
399             }
400 
401             public double getComponentCount(String unit, Integer power) {
402                 int result = 1;
403                 if (power > 1) {
404                     ++result; // add one component for a power
405                 }
406                 // hack for number
407                 if (unit.startsWith("100-")) {
408                     ++result;
409                     unit = unit.substring(4);
410                 }
411                 Output<Rational> deprefix = new Output<>();
412                 unit = UnitConverter.stripPrefix(unit, deprefix);
413                 if (!deprefix.value.equals(Rational.ONE)) {
414                     ++result; // add 1 component for kilo, mega, etc.
415                 }
416                 for (int i = 0; i < unit.length(); ++i) {
417                     if (unit.charAt(i) == '-') {
418                         ++result; // add one component for -imperial, etc.
419                     }
420                 }
421                 return result;
422             }
423         });
424 
UnitWidthUtil()425         private UnitWidthUtil() { }
426 
getInstance()427         public static UnitWidthUtil getInstance() {
428             return new UnitWidthUtil();
429         }
430 
getRoughComponentMax(String path)431         public double getRoughComponentMax(String path) {
432             try {
433                 return pathToUnitComponents.get(path);
434             } catch (ExecutionException e) {
435                 throw new ICUException(e);
436             }
437         }
438     }
439 }
440