• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 package org.unicode.cldr.test;
2 
3 import com.google.common.base.Joiner;
4 import com.ibm.icu.lang.UCharacter;
5 import com.ibm.icu.text.BreakIterator;
6 import com.ibm.icu.util.ULocale;
7 import java.util.Collections;
8 import java.util.EnumMap;
9 import java.util.List;
10 import java.util.Map;
11 import java.util.Set;
12 import java.util.TreeSet;
13 import java.util.regex.Matcher;
14 import org.unicode.cldr.draft.ScriptMetadata;
15 import org.unicode.cldr.draft.ScriptMetadata.Info;
16 import org.unicode.cldr.draft.ScriptMetadata.Trinary;
17 import org.unicode.cldr.test.CheckCLDR.CheckStatus.Subtype;
18 import org.unicode.cldr.tool.LikelySubtags;
19 import org.unicode.cldr.util.CLDRFile;
20 import org.unicode.cldr.util.CldrUtility;
21 import org.unicode.cldr.util.Counter;
22 import org.unicode.cldr.util.Factory;
23 import org.unicode.cldr.util.PathStarrer;
24 import org.unicode.cldr.util.PatternCache;
25 import org.unicode.cldr.util.RegexLookup;
26 import org.unicode.cldr.util.SpecialLocales;
27 
28 public class CheckConsistentCasing extends FactoryCheckCLDR {
29 
30     private static final boolean DEBUG = CldrUtility.getProperty("DEBUG", false);
31 
32     private static final double MIN_FACTOR = 2.5;
33     // remember to add this class to the list in CheckCLDR.getCheckAll
34     // to run just this test, on just locales starting with 'nl', use CheckCLDR with -fnl.*
35     // -t.*Currencies.*
36 
37     ULocale uLocale = null;
38     BreakIterator breaker = null;
39     private String locale;
40     CasingInfo casingInfo;
41     private boolean hasCasingInfo;
42 
CheckConsistentCasing(Factory factory)43     public CheckConsistentCasing(Factory factory) {
44         super(factory);
45         casingInfo = new CasingInfo(factory);
46     }
47 
48     @Override
handleSetCldrFileToCheck( CLDRFile cldrFileToCheck, Options options, List<CheckStatus> possibleErrors)49     public CheckCLDR handleSetCldrFileToCheck(
50             CLDRFile cldrFileToCheck, Options options, List<CheckStatus> possibleErrors) {
51         if (cldrFileToCheck == null) return this;
52         super.handleSetCldrFileToCheck(cldrFileToCheck, options, possibleErrors);
53         locale = cldrFileToCheck.getLocaleID();
54         // get info about casing; note that this is done in two steps since
55         // ScriptMetadata.getInfo() returns null, in some instances.
56         // OLD: Info localeInfo = ScriptMetadata.getInfo(locale);
57         String script = new LikelySubtags().getLikelyScript(locale);
58         Info localeInfo = ScriptMetadata.getInfo(script);
59 
60         if (localeInfo != null && localeInfo.hasCase == Trinary.YES) {
61             // this script has casing info, so we can request it here
62             try {
63                 types = casingInfo.getLocaleCasing(locale);
64             } catch (Exception e) {
65                 types = Collections.emptyMap();
66             }
67             if ((types == null || types.isEmpty()) && !SpecialLocales.isScratchLocale(locale)) {
68                 possibleErrors.add(
69                         new CheckStatus()
70                                 .setCause(this)
71                                 .setMainType(CheckStatus.warningType)
72                                 .setSubtype(Subtype.incorrectCasing)
73                                 .setMessage("Could not load casing info for {0}", locale));
74             }
75         } else {
76             // no casing info - since the types Map is global, and null checks aren't done,
77             // we are better off  with an empty map here
78             types = Collections.emptyMap();
79         }
80         // types may be null, avoid NPE
81         hasCasingInfo = (types == null) ? false : types.size() > 0;
82         return this;
83     }
84 
85     // If you don't need any file initialization or postprocessing, you only need this one routine
86     @Override
handleCheck( String path, String fullPath, String value, Options options, List<CheckStatus> result)87     public CheckCLDR handleCheck(
88             String path, String fullPath, String value, Options options, List<CheckStatus> result) {
89         // it helps performance to have a quick reject of most paths
90         if (fullPath == null) return this; // skip paths that we don't have
91         if (!accept(result)) return this; // causes hasCasingInfo to be calculated
92         if (!hasCasingInfo) return this;
93 
94         String locale2 = getCldrFileToCheck().getSourceLocaleID(path, null);
95         if (locale2.equals(locale) && value != null && value.length() > 0) {
96             Category category = getCategory(path);
97             if (category != null) {
98                 checkConsistentCasing(category, path, fullPath, value, options, result);
99             }
100         }
101         return this;
102     }
103 
104     static final Matcher placeholder = PatternCache.get("\\{\\d+\\}").matcher("");
105 
106     /** The casing type of a given string. */
107     public enum CasingType {
108         titlecase,
109         lowercase,
110         other;
111 
from(String s)112         public static CasingType from(String s) {
113             if (s == null || s.length() == 0) {
114                 return other;
115             }
116             int cp;
117             // Look for the first meaningful character in the string to determine case.
118             for (int i = 0; i < s.length(); i += Character.charCount(cp)) {
119                 cp = s.codePointAt(i);
120                 // used to skip the placeholders, but works better to have them be 'other'
121                 // if (cp == '{') {
122                 // if (placeholder.reset(s).region(i,s.length()).lookingAt()) {
123                 // i = placeholder.end() - 1; // skip
124                 // continue;
125                 // }
126                 // }
127                 int type = UCharacter.getType(cp);
128                 switch (type) {
129                     case UCharacter.LOWERCASE_LETTER:
130                         return lowercase;
131 
132                     case UCharacter.UPPERCASE_LETTER:
133                     case UCharacter.TITLECASE_LETTER:
134                         return titlecase;
135 
136                         // for other letters / numbers / symbols, return other
137                     case UCharacter.OTHER_LETTER:
138                     case UCharacter.DECIMAL_DIGIT_NUMBER:
139                     case UCharacter.LETTER_NUMBER:
140                     case UCharacter.OTHER_NUMBER:
141                     case UCharacter.MATH_SYMBOL:
142                     case UCharacter.CURRENCY_SYMBOL:
143                     case UCharacter.MODIFIER_SYMBOL:
144                     case UCharacter.OTHER_SYMBOL:
145                         return other;
146                         // ignore everything else (whitespace, punctuation, etc) and keep going
147                 }
148             }
149             return other;
150         }
151 
152         /** Return true if either is other, or they are identical. */
worksWith(CasingType otherType)153         public boolean worksWith(CasingType otherType) {
154             return otherType == null
155                     || this == otherType
156                     || this == CasingType.other
157                     || otherType == CasingType.other;
158         }
159     }
160 
161     public enum CasingTypeAndErrFlag {
162         titlecase_mismatchWarn(CasingType.titlecase, false),
163         titlecase_mismatchErr(CasingType.titlecase, true),
164         lowercase_mismatchWarn(CasingType.lowercase, false),
165         lowercase_mismatchErr(CasingType.lowercase, true),
166         other_mismatchWarn(CasingType.other, false),
167         other_mismatchErr(CasingType.other, true);
168 
169         private final CasingType type;
170         private final boolean flag; // force error instead of warning for mismatch
171 
CasingTypeAndErrFlag(CasingType type, boolean flag)172         private CasingTypeAndErrFlag(CasingType type, boolean flag) {
173             this.type = type;
174             this.flag = flag;
175         }
176 
type()177         public CasingType type() {
178             return type;
179         }
180 
flag()181         public boolean flag() {
182             return flag;
183         }
184     }
185 
186     static final RegexLookup<Category> pathToBucket =
187             new RegexLookup<Category>()
188                     .add("//ldml/localeDisplayNames/languages/language", Category.language)
189                     .add("//ldml/localeDisplayNames/scripts/script", Category.script)
190                     .add("//ldml/localeDisplayNames/territories/territory", Category.territory)
191                     .add("//ldml/localeDisplayNames/variants/variant", Category.variant)
192                     .add("//ldml/localeDisplayNames/keys/key", Category.key)
193                     .add("//ldml/localeDisplayNames/types/type", Category.keyValue)
194                     .add("//ldml/dates/calendars/calendar.*/months.*narrow", Category.month_narrow)
195                     .add(
196                             "//ldml/dates/calendars/calendar.*/months.*format",
197                             Category.month_format_except_narrow)
198                     .add(
199                             "//ldml/dates/calendars/calendar.*/months",
200                             Category.month_standalone_except_narrow)
201                     .add("//ldml/dates/calendars/calendar.*/days.*narrow", Category.day_narrow)
202                     .add(
203                             "//ldml/dates/calendars/calendar.*/days.*format",
204                             Category.day_format_except_narrow)
205                     .add(
206                             "//ldml/dates/calendars/calendar.*/days",
207                             Category.day_standalone_except_narrow)
208                     .add("//ldml/dates/calendars/calendar.*/eras/eraNarrow", Category.era_narrow)
209                     .add("//ldml/dates/calendars/calendar.*/eras/eraAbbr", Category.era_abbr)
210                     .add("//ldml/dates/calendars/calendar.*/eras/", Category.era_name)
211                     .add(
212                             "//ldml/dates/calendars/calendar.*/quarters.*narrow",
213                             Category.quarter_narrow)
214                     .add(
215                             "//ldml/dates/calendars/calendar.*/quarters.*abbreviated",
216                             Category.quarter_abbreviated)
217                     .add(
218                             "//ldml/dates/calendars/calendar.*/quarters.*format",
219                             Category.quarter_format_wide)
220                     .add(
221                             "//ldml/dates/calendars/calendar.*/quarters",
222                             Category.quarter_standalone_wide)
223                     .add("//ldml/.*/relative", Category.relative)
224                     .add("//ldml/dates/fields", Category.calendar_field)
225                     .add(
226                             "//ldml/dates/timeZoneNames/zone.*/exemplarCity",
227                             Category.zone_exemplarCity)
228                     .add("//ldml/dates/timeZoneNames/zone.*/short", Category.zone_short)
229                     .add("//ldml/dates/timeZoneNames/zone", Category.zone_long)
230                     .add(
231                             "//ldml/dates/timeZoneNames/metazone.*/commonlyUsed",
232                             Category.NOT_USED) // just to remove them from the other cases
233                     .add("//ldml/dates/timeZoneNames/metazone.*/short", Category.metazone_long)
234                     .add("//ldml/dates/timeZoneNames/metazone", Category.metazone_long)
235                     .add("//ldml/numbers/currencies/currency.*/symbol", Category.symbol)
236                     .add(
237                             "//ldml/numbers/currencies/currency.*/displayName.*@count",
238                             Category.currencyName_count)
239                     .add("//ldml/numbers/currencies/currency.*/displayName", Category.currencyName)
240                     .add("//ldml/units/unit.*/unitPattern.*(past|future)", Category.relative)
241                     .add("//ldml/units/unit.*/unitPattern", Category.unit_pattern)
242             // ldml/localeDisplayNames/keys/key[@type=".*"]
243             // ldml/localeDisplayNames/measurementSystemNames/measurementSystemName[@type=".*"]
244             // ldml/localeDisplayNames/transformNames/transformName[@type=".*"]
245             ;
246 
247     Map<Category, CasingTypeAndErrFlag> types = new EnumMap<>(Category.class);
248 
249     public enum Category {
250         language,
251         script,
252         territory,
253         variant,
254         keyValue,
255         month_narrow,
256         month_format_except_narrow,
257         month_standalone_except_narrow,
258         day_narrow,
259         day_format_except_narrow,
260         day_standalone_except_narrow,
261         era_narrow,
262         era_abbr,
263         era_name,
264         quarter_narrow,
265         quarter_abbreviated,
266         quarter_format_wide,
267         quarter_standalone_wide,
268         calendar_field,
269         zone_exemplarCity,
270         zone_short,
271         zone_long,
272         NOT_USED,
273         metazone_short,
274         metazone_long,
275         symbol,
276         currencyName_count,
277         currencyName,
278         relative,
279         unit_pattern,
280         key;
281     }
282 
283     // //ldml/numbers/currencies/currency[@type="ADP"]/displayName
284     // //ldml/numbers/currencies/currency[@type="RON"]/displayName[@count="other"]
285     // //ldml/numbers/currencies/currency[@type="BYB"]/symbol
286 
getCategory(String path)287     static Category getCategory(String path) {
288         return pathToBucket.get(path);
289     }
290 
291     /**
292      * Calculates casing information using data from the specified CLDRFile.
293      *
294      * @param resolved the resolved CLDRFile to calculate casing information from
295      * @return
296      */
getSamples(CLDRFile resolved)297     public static Map<Category, CasingType> getSamples(CLDRFile resolved) {
298         // Use EnumMap instead of an array for type safety.
299         Map<Category, Counter<CasingType>> counters = new EnumMap<>(Category.class);
300 
301         for (Category category : Category.values()) {
302             counters.put(category, new Counter<CasingType>());
303         }
304         PathStarrer starrer = new PathStarrer();
305         boolean isRoot = "root".equals(resolved.getLocaleID());
306         Set<String> missing = !DEBUG ? null : new TreeSet<>();
307 
308         for (String path : resolved) {
309             if (!isRoot) {
310                 String locale2 = resolved.getSourceLocaleID(path, null);
311                 if (locale2.equals("root") || locale2.equals("code-fallback")) {
312                     continue;
313                 }
314             }
315             String winningPath = resolved.getWinningPath(path);
316             if (!winningPath.equals(path)) {
317                 continue;
318             }
319             Category category = getCategory(path);
320             if (category != null) {
321                 String value = resolved.getStringValue(path);
322                 if (value == null || value.length() == 0) continue;
323                 CasingType ft = CasingType.from(value);
324                 counters.get(category).add(ft, 1);
325             } else if (DEBUG) {
326                 String starred = starrer.set(path);
327                 missing.add(starred);
328             }
329         }
330 
331         Map<Category, CasingType> info = new EnumMap<>(Category.class);
332         for (Category category : Category.values()) {
333             if (category == Category.NOT_USED) continue;
334             Counter<CasingType> counter = counters.get(category);
335             long countLower = counter.getCount(CasingType.lowercase);
336             long countUpper = counter.getCount(CasingType.titlecase);
337             long countOther = counter.getCount(CasingType.other);
338             CasingType type;
339             if (countLower + countUpper == 0) {
340                 type = CasingType.other;
341             } else if (countLower >= countUpper * MIN_FACTOR && countLower >= countOther) {
342                 type = CasingType.lowercase;
343             } else if (countUpper >= countLower * MIN_FACTOR && countUpper >= countOther) {
344                 type = CasingType.titlecase;
345             } else {
346                 type = CasingType.other;
347             }
348             info.put(category, type);
349         }
350         if (DEBUG && missing.size() != 0) {
351             System.out.println("Paths skipped:\n" + Joiner.on("\n").join(missing));
352         }
353         return info;
354     }
355 
356     private static final String CASE_WARNING =
357             "The first letter of 〈{0}〉 is {1}, which differs from what is expected "
358                     + "for the {2} category: that almost all values be {3}.\n\n";
359 
checkConsistentCasing( Category category, String path, String fullPath, String value, Options options, List<CheckStatus> result)360     private void checkConsistentCasing(
361             Category category,
362             String path,
363             String fullPath,
364             String value,
365             Options options,
366             List<CheckStatus> result) {
367         // Avoid NPE
368         if (types != null) {
369             CasingType ft = CasingType.from(value);
370             CasingTypeAndErrFlag typeAndFlagFromCat = types.get(category);
371             if (typeAndFlagFromCat == null) {
372                 typeAndFlagFromCat = CasingTypeAndErrFlag.other_mismatchWarn;
373             }
374             if (!ft.worksWith(typeAndFlagFromCat.type())) {
375                 result.add(
376                         new CheckStatus()
377                                 .setCause(this)
378                                 .setMainType(
379                                         typeAndFlagFromCat.flag()
380                                                 ? CheckStatus.errorType
381                                                 : CheckStatus.warningType)
382                                 .setSubtype(Subtype.incorrectCasing) // typically warningType or
383                                 // errorType
384                                 .setMessage(
385                                         CASE_WARNING,
386                                         value,
387                                         ft,
388                                         category,
389                                         typeAndFlagFromCat
390                                                 .type())); // the message; can be MessageFormat with
391                 // arguments
392             }
393         }
394     }
395 }
396