• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 package org.unicode.cldr.test;
2 
3 import java.util.Collections;
4 import java.util.EnumMap;
5 import java.util.List;
6 import java.util.Map;
7 import java.util.Set;
8 import java.util.TreeSet;
9 import java.util.regex.Matcher;
10 
11 import org.unicode.cldr.draft.ScriptMetadata;
12 import org.unicode.cldr.draft.ScriptMetadata.Info;
13 import org.unicode.cldr.draft.ScriptMetadata.Trinary;
14 import org.unicode.cldr.test.CheckCLDR.CheckStatus.Subtype;
15 import org.unicode.cldr.tool.LikelySubtags;
16 import org.unicode.cldr.util.CLDRFile;
17 import org.unicode.cldr.util.CLDRURLS;
18 import org.unicode.cldr.util.CldrUtility;
19 import org.unicode.cldr.util.Counter;
20 import org.unicode.cldr.util.Factory;
21 import org.unicode.cldr.util.PathStarrer;
22 import org.unicode.cldr.util.PatternCache;
23 import org.unicode.cldr.util.RegexLookup;
24 import org.unicode.cldr.util.SpecialLocales;
25 
26 import com.google.common.base.Joiner;
27 import com.ibm.icu.lang.UCharacter;
28 import com.ibm.icu.text.BreakIterator;
29 import com.ibm.icu.util.ULocale;
30 
31 public class CheckConsistentCasing extends FactoryCheckCLDR {
32 
33     private static final boolean DEBUG = CldrUtility.getProperty("DEBUG", false);
34 
35     private static final double MIN_FACTOR = 2.5;
36     // remember to add this class to the list in CheckCLDR.getCheckAll
37     // to run just this test, on just locales starting with 'nl', use CheckCLDR with -fnl.* -t.*Currencies.*
38 
39     ULocale uLocale = null;
40     BreakIterator breaker = null;
41     private String locale;
42     CasingInfo casingInfo;
43     private boolean hasCasingInfo;
44 
/**
 * Creates the check together with the {@code CasingInfo} it later asks for locale casing data.
 *
 * @param factory
 *            handed to the superclass constructor and to the new {@code CasingInfo}
 */
public CheckConsistentCasing(Factory factory) {
    super(factory);
    this.casingInfo = new CasingInfo(factory);
}
49 
50     @Override
setCldrFileToCheck(CLDRFile cldrFileToCheck, Options options, List<CheckStatus> possibleErrors)51     public CheckCLDR setCldrFileToCheck(CLDRFile cldrFileToCheck, Options options,
52         List<CheckStatus> possibleErrors) {
53         if (cldrFileToCheck == null) return this;
54         super.setCldrFileToCheck(cldrFileToCheck, options, possibleErrors);
55         locale = cldrFileToCheck.getLocaleID();
56         // get info about casing; note that this is done in two steps since
57         // ScriptMetadata.getInfo() returns null, in some instances.
58         // OLD: Info localeInfo = ScriptMetadata.getInfo(locale);
59         String script = new LikelySubtags().getLikelyScript(locale);
60         Info localeInfo = ScriptMetadata.getInfo(script);
61 
62         if (localeInfo != null && localeInfo.hasCase == Trinary.YES) {
63             // this script has casing info, so we can request it here
64             try {
65                 types = casingInfo.getLocaleCasing(locale);
66             } catch (Exception e) {
67                 types = Collections.emptyMap();
68             }
69         } else {
70             // no casing info - since the types Map is global, and null checks aren't done,
71             // we are better off  with an empty map here
72             types = Collections.emptyMap();
73         }
74         if ((types == null || types.isEmpty()) && !SpecialLocales.isScratchLocale(locale)) {
75             possibleErrors.add(new CheckStatus().setCause(this)
76                 .setMainType(CheckStatus.warningType)
77                 .setSubtype(Subtype.incorrectCasing)
78                 .setMessage("Could not load casing info for {0}", locale));
79         }
80         // types may be null, avoid NPE
81         hasCasingInfo = (types == null) ? false : types.size() > 0;
82         return this;
83     }
84 
85     // If you don't need any file initialization or postprocessing, you only need this one routine
86     @Override
handleCheck(String path, String fullPath, String value, Options options, List<CheckStatus> result)87     public CheckCLDR handleCheck(String path, String fullPath, String value, Options options,
88         List<CheckStatus> result) {
89         // it helps performance to have a quick reject of most paths
90         if (fullPath == null) return this; // skip paths that we don't have
91         if (!hasCasingInfo) return this;
92 
93         String locale2 = getCldrFileToCheck().getSourceLocaleID(path, null);
94         if (locale2.equals(locale) && value != null && value.length() > 0) {
95             Category category = getCategory(path);
96             if (category != null) {
97                 checkConsistentCasing(category, path, fullPath, value, options, result);
98             }
99         }
100         return this;
101     }
102 
/**
 * Matcher for a numbered message placeholder: an opening brace, one or more digits, a closing brace.
 * It starts out bound to the empty string and is not used by any live code in this class.
 * NOTE(review): a Matcher carries mutable state, so sharing this static instance across
 * threads would need external synchronization - confirm how other callers use it.
 */
static final Matcher placeholder = PatternCache.get("\\{\\d+\\}").matcher("");
104 
105     /**
106      * The casing type of a given string.
107      */
108     public enum CasingType {
109         titlecase, lowercase, other;
from(String s)110         public static CasingType from(String s) {
111             if (s == null || s.length() == 0) {
112                 return other;
113             }
114             int cp;
115             // Look for the first meaningful character in the string to determine case.
116             for (int i = 0; i < s.length(); i += Character.charCount(cp)) {
117                 cp = s.codePointAt(i);
118                 // used to skip the placeholders, but works better to have them be 'other'
119                 // if (cp == '{') {
120                 // if (placeholder.reset(s).region(i,s.length()).lookingAt()) {
121                 // i = placeholder.end() - 1; // skip
122                 // continue;
123                 // }
124                 // }
125                 int type = UCharacter.getType(cp);
126                 switch (type) {
127 
128                 case UCharacter.LOWERCASE_LETTER:
129                     return lowercase;
130 
131                 case UCharacter.UPPERCASE_LETTER:
132                 case UCharacter.TITLECASE_LETTER:
133                     return titlecase;
134 
135                 // for other letters / numbers / symbols, return other
136                 case UCharacter.OTHER_LETTER:
137                 case UCharacter.DECIMAL_DIGIT_NUMBER:
138                 case UCharacter.LETTER_NUMBER:
139                 case UCharacter.OTHER_NUMBER:
140                 case UCharacter.MATH_SYMBOL:
141                 case UCharacter.CURRENCY_SYMBOL:
142                 case UCharacter.MODIFIER_SYMBOL:
143                 case UCharacter.OTHER_SYMBOL:
144                     return other;
145                 // ignore everything else (whitespace, punctuation, etc) and keep going
146                 }
147             }
148             return other;
149         }
150 
151         /**
152          * Return true if either is other, or they are identical.
153          */
worksWith(CasingType otherType)154         public boolean worksWith(CasingType otherType) {
155             return otherType == null || this == otherType || this == CasingType.other || otherType == CasingType.other;
156         }
157     }
158 
159     public enum CasingTypeAndErrFlag {
160         titlecase_mismatchWarn(CasingType.titlecase, false), titlecase_mismatchErr(CasingType.titlecase, true), lowercase_mismatchWarn(CasingType.lowercase,
161             false), lowercase_mismatchErr(CasingType.lowercase, true), other_mismatchWarn(CasingType.other, false), other_mismatchErr(CasingType.other, true);
162 
163         private final CasingType type;
164         private final boolean flag; // force error instead of warning for mismatch
165 
CasingTypeAndErrFlag(CasingType type, boolean flag)166         private CasingTypeAndErrFlag(CasingType type, boolean flag) {
167             this.type = type;
168             this.flag = flag;
169         }
170 
type()171         public CasingType type() {
172             return type;
173         }
174 
flag()175         public boolean flag() {
176             return flag;
177         }
178     }
179 
180     static final RegexLookup<Category> pathToBucket = new RegexLookup<Category>()
181         .add("//ldml/localeDisplayNames/languages/language", Category.language)
182         .add("//ldml/localeDisplayNames/scripts/script", Category.script)
183         .add("//ldml/localeDisplayNames/territories/territory", Category.territory)
184         .add("//ldml/localeDisplayNames/variants/variant", Category.variant)
185         .add("//ldml/localeDisplayNames/keys/key", Category.key)
186         .add("//ldml/localeDisplayNames/types/type", Category.keyValue)
187         .add("//ldml/dates/calendars/calendar.*/months.*narrow", Category.month_narrow)
188         .add("//ldml/dates/calendars/calendar.*/months.*format", Category.month_format_except_narrow)
189         .add("//ldml/dates/calendars/calendar.*/months", Category.month_standalone_except_narrow)
190         .add("//ldml/dates/calendars/calendar.*/days.*narrow", Category.day_narrow)
191         .add("//ldml/dates/calendars/calendar.*/days.*format", Category.day_format_except_narrow)
192         .add("//ldml/dates/calendars/calendar.*/days", Category.day_standalone_except_narrow)
193         .add("//ldml/dates/calendars/calendar.*/eras/eraNarrow", Category.era_narrow)
194         .add("//ldml/dates/calendars/calendar.*/eras/eraAbbr", Category.era_abbr)
195         .add("//ldml/dates/calendars/calendar.*/eras/", Category.era_name)
196         .add("//ldml/dates/calendars/calendar.*/quarters.*narrow", Category.quarter_narrow)
197         .add("//ldml/dates/calendars/calendar.*/quarters.*abbreviated", Category.quarter_abbreviated)
198         .add("//ldml/dates/calendars/calendar.*/quarters.*format", Category.quarter_format_wide)
199         .add("//ldml/dates/calendars/calendar.*/quarters", Category.quarter_standalone_wide)
200         .add("//ldml/.*/relative", Category.relative)
201         .add("//ldml/dates/fields", Category.calendar_field)
202         .add("//ldml/dates/timeZoneNames/zone.*/exemplarCity", Category.zone_exemplarCity)
203         .add("//ldml/dates/timeZoneNames/zone.*/short", Category.zone_short)
204         .add("//ldml/dates/timeZoneNames/zone", Category.zone_long)
205         .add("//ldml/dates/timeZoneNames/metazone.*/commonlyUsed", Category.NOT_USED) // just to remove them from the other cases
206         .add("//ldml/dates/timeZoneNames/metazone.*/short", Category.metazone_long)
207         .add("//ldml/dates/timeZoneNames/metazone", Category.metazone_long)
208         .add("//ldml/numbers/currencies/currency.*/symbol", Category.symbol)
209         .add("//ldml/numbers/currencies/currency.*/displayName.*@count", Category.currencyName_count)
210         .add("//ldml/numbers/currencies/currency.*/displayName", Category.currencyName)
211         .add("//ldml/units/unit.*/unitPattern.*(past|future)", Category.relative)
212         .add("//ldml/units/unit.*/unitPattern", Category.unit_pattern)
213     // ldml/localeDisplayNames/keys/key[@type=".*"]
214     // ldml/localeDisplayNames/measurementSystemNames/measurementSystemName[@type=".*"]
215     // ldml/localeDisplayNames/transformNames/transformName[@type=".*"]
216     ;
217 
/**
 * Expected casing and mismatch severity per category for the locale being checked.
 * Starts out as an empty EnumMap and is replaced on every setCldrFileToCheck call with a
 * non-null file. Readers in this class guard against it being null.
 */
Map<Category, CasingTypeAndErrFlag> types = new EnumMap<>(Category.class);
219 
/**
 * Buckets of values whose casing is compared with each other. The declaration order is
 * kept exactly as it was, since ordinals and values() ordering depend on it.
 */
public enum Category {
    // locale display names
    language,
    script,
    territory,
    variant,
    keyValue,
    // months and days
    month_narrow,
    month_format_except_narrow,
    month_standalone_except_narrow,
    day_narrow,
    day_format_except_narrow,
    day_standalone_except_narrow,
    // eras and quarters
    era_narrow,
    era_abbr,
    era_name,
    quarter_narrow,
    quarter_abbreviated,
    quarter_format_wide,
    quarter_standalone_wide,
    calendar_field,
    // time zones
    zone_exemplarCity,
    zone_short,
    zone_long,
    NOT_USED,
    metazone_short,
    metazone_long,
    // currencies
    symbol,
    currencyName_count,
    currencyName,
    // relative dates, units, keys
    relative,
    unit_pattern,
    key;
}
223 
224     // //ldml/numbers/currencies/currency[@type="ADP"]/displayName
225     // //ldml/numbers/currencies/currency[@type="RON"]/displayName[@count="other"]
226     // //ldml/numbers/currencies/currency[@type="BYB"]/symbol
227 
getCategory(String path)228     static Category getCategory(String path) {
229         return pathToBucket.get(path);
230     }
231 
232     /**
233      * Calculates casing information using data from the specified CLDRFile.
234      *
235      * @param resolved
236      *            the resolved CLDRFile to calculate casing information from
237      * @return
238      */
getSamples(CLDRFile resolved)239     public static Map<Category, CasingType> getSamples(CLDRFile resolved) {
240         // Use EnumMap instead of an array for type safety.
241         Map<Category, Counter<CasingType>> counters = new EnumMap<>(Category.class);
242 
243         for (Category category : Category.values()) {
244             counters.put(category, new Counter<CasingType>());
245         }
246         PathStarrer starrer = new PathStarrer();
247         boolean isRoot = "root".equals(resolved.getLocaleID());
248         Set<String> missing = !DEBUG ? null : new TreeSet<>();
249 
250         for (String path : resolved) {
251             if (!isRoot) {
252                 String locale2 = resolved.getSourceLocaleID(path, null);
253                 if (locale2.equals("root") || locale2.equals("code-fallback")) {
254                     continue;
255                 }
256             }
257             String winningPath = resolved.getWinningPath(path);
258             if (!winningPath.equals(path)) {
259                 continue;
260             }
261             Category category = getCategory(path);
262             if (category != null) {
263                 String value = resolved.getStringValue(path);
264                 if (value == null || value.length() == 0) continue;
265                 CasingType ft = CasingType.from(value);
266                 counters.get(category).add(ft, 1);
267             } else if (DEBUG) {
268                 String starred = starrer.set(path);
269                 missing.add(starred);
270             }
271         }
272 
273         Map<Category, CasingType> info = new EnumMap<>(Category.class);
274         for (Category category : Category.values()) {
275             if (category == Category.NOT_USED) continue;
276             Counter<CasingType> counter = counters.get(category);
277             long countLower = counter.getCount(CasingType.lowercase);
278             long countUpper = counter.getCount(CasingType.titlecase);
279             long countOther = counter.getCount(CasingType.other);
280             CasingType type;
281             if (countLower + countUpper == 0) {
282                 type = CasingType.other;
283             } else if (countLower >= countUpper * MIN_FACTOR && countLower >= countOther) {
284                 type = CasingType.lowercase;
285             } else if (countUpper >= countLower * MIN_FACTOR && countUpper >= countOther) {
286                 type = CasingType.titlecase;
287             } else {
288                 type = CasingType.other;
289             }
290             info.put(category, type);
291         }
292         if (DEBUG && missing.size() != 0) {
293             System.out.println("Paths skipped:\n" + Joiner.on("\n").join(missing));
294         }
295         return info;
296     }
297 
/**
 * Message pattern for a casing mismatch. As used by checkConsistentCasing, the arguments are:
 * {0} the value, {1} the casing detected for the value, {2} the category, {3} the casing
 * expected for that category.
 */
private static final String CASE_WARNING = "The first letter of 〈{0}〉 is {1}, which differs from what is expected " +
    "for the {2} category: that almost all values be {3}.\n\n" +
    "For guidance, see " + CLDRURLS.CAPITALIZATION_URL + ". " +
    "If this warning is wrong, please file a ticket at " + CLDRURLS.CLDR_NEWTICKET_URL + ".";
302 
checkConsistentCasing(Category category, String path, String fullPath, String value, Options options, List<CheckStatus> result)303     private void checkConsistentCasing(Category category, String path, String fullPath, String value,
304         Options options, List<CheckStatus> result) {
305         // Avoid NPE
306         if (types != null) {
307             CasingType ft = CasingType.from(value);
308             CasingTypeAndErrFlag typeAndFlagFromCat = types.get(category);
309             if (typeAndFlagFromCat == null) {
310                 typeAndFlagFromCat = CasingTypeAndErrFlag.other_mismatchWarn;
311             }
312             if (!ft.worksWith(typeAndFlagFromCat.type())) {
313                 result.add(new CheckStatus().setCause(this)
314                     .setMainType(typeAndFlagFromCat.flag() ? CheckStatus.errorType : CheckStatus.warningType)
315                     .setSubtype(Subtype.incorrectCasing) // typically warningType or errorType
316                     .setMessage(CASE_WARNING, value, ft, category, typeAndFlagFromCat.type())); // the message; can be MessageFormat with arguments
317             }
318         }
319     }
320 }