package org.unicode.cldr.test;

import com.google.common.base.Joiner;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.BreakIterator;
import com.ibm.icu.util.ULocale;
import java.util.Collections;
import java.util.EnumMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import java.util.regex.Matcher;
import org.unicode.cldr.draft.ScriptMetadata;
import org.unicode.cldr.draft.ScriptMetadata.Info;
import org.unicode.cldr.draft.ScriptMetadata.Trinary;
import org.unicode.cldr.test.CheckCLDR.CheckStatus.Subtype;
import org.unicode.cldr.tool.LikelySubtags;
import org.unicode.cldr.util.CLDRFile;
import org.unicode.cldr.util.CldrUtility;
import org.unicode.cldr.util.Counter;
import org.unicode.cldr.util.Factory;
import org.unicode.cldr.util.PathStarrer;
import org.unicode.cldr.util.PatternCache;
import org.unicode.cldr.util.RegexLookup;
import org.unicode.cldr.util.SpecialLocales;

/**
 * Checks that the casing of the first meaningful character of a value agrees with the casing
 * recorded for the value's {@link Category} in the locale's casing info.
 *
 * <p>Paths are bucketed into categories by {@link #pathToBucket}; the expected casing per category
 * is loaded from {@link CasingInfo} in {@link #handleSetCldrFileToCheck}. A mismatch is reported as
 * a warning or an error depending on the flag carried by the category's {@link
 * CasingTypeAndErrFlag}.
 */
public class CheckConsistentCasing extends FactoryCheckCLDR {

    private static final boolean DEBUG = CldrUtility.getProperty("DEBUG", false);

    /** How many times more frequent one casing must be than the other to be "the" casing. */
    private static final double MIN_FACTOR = 2.5;
    // remember to add this class to the list in CheckCLDR.getCheckAll
    // to run just this test, on just locales starting with 'nl', use CheckCLDR with -fnl.*
    // -t.*Currencies.*

    ULocale uLocale = null;
    BreakIterator breaker = null;
    private String locale;
    CasingInfo casingInfo;
    private boolean hasCasingInfo;

    public CheckConsistentCasing(Factory factory) {
        super(factory);
        casingInfo = new CasingInfo(factory);
    }

    /**
     * Loads the per-category casing expectations for the locale of {@code cldrFileToCheck}.
     *
     * <p>Casing info is only requested when the locale's likely script is known to have case. If
     * it cannot be loaded (and the locale is not a scratch locale), a warning is appended to
     * {@code possibleErrors}. In every other situation {@link #types} is set to an empty map so
     * that later lookups do not need null checks.
     *
     * @param cldrFileToCheck the file about to be checked; if null, nothing is done
     * @param options the check options, passed through to the superclass
     * @param possibleErrors receives a warning if casing info could not be loaded
     * @return this
     */
    @Override
    public CheckCLDR handleSetCldrFileToCheck(
            CLDRFile cldrFileToCheck, Options options, List<CheckStatus> possibleErrors) {
        if (cldrFileToCheck == null) {
            return this;
        }
        super.handleSetCldrFileToCheck(cldrFileToCheck, options, possibleErrors);
        locale = cldrFileToCheck.getLocaleID();
        // get info about casing; note that this is done in two steps since
        // ScriptMetadata.getInfo() returns null, in some instances.
        // OLD: Info localeInfo = ScriptMetadata.getInfo(locale);
        String script = new LikelySubtags().getLikelyScript(locale);
        Info localeInfo = ScriptMetadata.getInfo(script);

        if (localeInfo != null && localeInfo.hasCase == Trinary.YES) {
            // this script has casing info, so we can request it here
            try {
                types = casingInfo.getLocaleCasing(locale);
            } catch (Exception e) {
                // Deliberately best-effort: a failure to load is reported via the warning below
                // rather than aborting the whole check.
                types = Collections.emptyMap();
            }
            if ((types == null || types.isEmpty()) && !SpecialLocales.isScratchLocale(locale)) {
                possibleErrors.add(
                        new CheckStatus()
                                .setCause(this)
                                .setMainType(CheckStatus.warningType)
                                .setSubtype(Subtype.incorrectCasing)
                                .setMessage("Could not load casing info for {0}", locale));
            }
        } else {
            // no casing info - since the types Map is global, and null checks aren't done,
            // we are better off with an empty map here
            types = Collections.emptyMap();
        }
        // types may be null (getLocaleCasing can hand back null), avoid NPE
        hasCasingInfo = types != null && !types.isEmpty();
        return this;
    }

    /**
     * Checks a single path/value pair for consistent casing.
     *
     * <p>Only values that are actually present in the locale being checked (i.e. not inherited),
     * are non-empty, and whose path maps to a {@link Category} are examined.
     *
     * @return this
     */
    // If you don't need any file initialization or postprocessing, you only need this one routine
    @Override
    public CheckCLDR handleCheck(
            String path, String fullPath, String value, Options options, List<CheckStatus> result) {
        // it helps performance to have a quick reject of most paths
        if (fullPath == null) {
            return this; // skip paths that we don't have
        }
        if (!accept(result)) {
            return this; // causes hasCasingInfo to be calculated
        }
        if (!hasCasingInfo) {
            return this;
        }
        if (value == null || value.isEmpty()) {
            return this;
        }

        String locale2 = getCldrFileToCheck().getSourceLocaleID(path, null);
        // Guard against a null source locale instead of dereferencing it blindly; the null
        // contract of getSourceLocaleID is not visible from this file.
        if (locale2 == null || !locale2.equals(locale)) {
            return this;
        }
        Category category = getCategory(path);
        if (category != null) {
            checkConsistentCasing(category, path, fullPath, value, options, result);
        }
        return this;
    }

    /**
     * Matches message placeholders such as <code>{0}</code>. Not referenced by the code in this
     * class at present; retained because it is package-visible. Note that a {@link Matcher} is
     * stateful and not safe for concurrent use.
     */
    static final Matcher placeholder = PatternCache.get("\\{\\d+\\}").matcher("");

    /** The casing type of a given string. */
    public enum CasingType {
        titlecase,
        lowercase,
        other;

        /**
         * Determines the casing type from the first meaningful character of {@code s}.
         *
         * @param s the string to classify; may be null or empty
         * @return {@link #lowercase} or {@link #titlecase} if the first meaningful character is a
         *     lowercase, or an uppercase/titlecase, letter respectively; {@link #other} if it is
         *     any other letter, number or symbol, or if there is no meaningful character at all
         */
        public static CasingType from(String s) {
            if (s == null || s.length() == 0) {
                return other;
            }
            int cp;
            // Look for the first meaningful character in the string to determine case.
            // Placeholders such as {0} used to be skipped here, but it works better to have
            // them be 'other', so no special handling is done for them.
            for (int i = 0; i < s.length(); i += Character.charCount(cp)) {
                cp = s.codePointAt(i);
                int type = UCharacter.getType(cp);
                switch (type) {
                    case UCharacter.LOWERCASE_LETTER:
                        return lowercase;

                    case UCharacter.UPPERCASE_LETTER:
                    case UCharacter.TITLECASE_LETTER:
                        return titlecase;

                        // for other letters / numbers / symbols, return other
                    case UCharacter.OTHER_LETTER:
                    case UCharacter.DECIMAL_DIGIT_NUMBER:
                    case UCharacter.LETTER_NUMBER:
                    case UCharacter.OTHER_NUMBER:
                    case UCharacter.MATH_SYMBOL:
                    case UCharacter.CURRENCY_SYMBOL:
                    case UCharacter.MODIFIER_SYMBOL:
                    case UCharacter.OTHER_SYMBOL:
                        return other;

                    default:
                        // ignore everything else (whitespace, punctuation, etc) and keep going
                        break;
                }
            }
            return other;
        }

        /** Return true if either is other (or otherType is null), or they are identical. */
        public boolean worksWith(CasingType otherType) {
            return otherType == null
                    || this == otherType
                    || this == CasingType.other
                    || otherType == CasingType.other;
        }
    }

    /**
     * A {@link CasingType} paired with a flag saying whether a mismatch should be reported as an
     * error rather than as a warning.
     */
    public enum CasingTypeAndErrFlag {
        titlecase_mismatchWarn(CasingType.titlecase, false),
        titlecase_mismatchErr(CasingType.titlecase, true),
        lowercase_mismatchWarn(CasingType.lowercase, false),
        lowercase_mismatchErr(CasingType.lowercase, true),
        other_mismatchWarn(CasingType.other, false),
        other_mismatchErr(CasingType.other, true);

        private final CasingType type;
        private final boolean flag; // force error instead of warning for mismatch

        private CasingTypeAndErrFlag(CasingType type, boolean flag) {
            this.type = type;
            this.flag = flag;
        }

        /** Returns the expected casing type. */
        public CasingType type() {
            return type;
        }

        /** Returns true if a mismatch must be reported as an error instead of a warning. */
        public boolean flag() {
            return flag;
        }
    }

    /**
     * Maps path patterns to categories. More specific patterns are registered ahead of the more
     * general ones that would also match the same paths.
     */
    static final RegexLookup<Category> pathToBucket =
            new RegexLookup<Category>()
                    .add("//ldml/localeDisplayNames/languages/language", Category.language)
                    .add("//ldml/localeDisplayNames/scripts/script", Category.script)
                    .add("//ldml/localeDisplayNames/territories/territory", Category.territory)
                    .add("//ldml/localeDisplayNames/variants/variant", Category.variant)
                    .add("//ldml/localeDisplayNames/keys/key", Category.key)
                    .add("//ldml/localeDisplayNames/types/type", Category.keyValue)
                    .add("//ldml/dates/calendars/calendar.*/months.*narrow", Category.month_narrow)
                    .add(
                            "//ldml/dates/calendars/calendar.*/months.*format",
                            Category.month_format_except_narrow)
                    .add(
                            "//ldml/dates/calendars/calendar.*/months",
                            Category.month_standalone_except_narrow)
                    .add("//ldml/dates/calendars/calendar.*/days.*narrow", Category.day_narrow)
                    .add(
                            "//ldml/dates/calendars/calendar.*/days.*format",
                            Category.day_format_except_narrow)
                    .add(
                            "//ldml/dates/calendars/calendar.*/days",
                            Category.day_standalone_except_narrow)
                    .add("//ldml/dates/calendars/calendar.*/eras/eraNarrow", Category.era_narrow)
                    .add("//ldml/dates/calendars/calendar.*/eras/eraAbbr", Category.era_abbr)
                    .add("//ldml/dates/calendars/calendar.*/eras/", Category.era_name)
                    .add(
                            "//ldml/dates/calendars/calendar.*/quarters.*narrow",
                            Category.quarter_narrow)
                    .add(
                            "//ldml/dates/calendars/calendar.*/quarters.*abbreviated",
                            Category.quarter_abbreviated)
                    .add(
                            "//ldml/dates/calendars/calendar.*/quarters.*format",
                            Category.quarter_format_wide)
                    .add(
                            "//ldml/dates/calendars/calendar.*/quarters",
                            Category.quarter_standalone_wide)
                    .add("//ldml/.*/relative", Category.relative)
                    .add("//ldml/dates/fields", Category.calendar_field)
                    .add(
                            "//ldml/dates/timeZoneNames/zone.*/exemplarCity",
                            Category.zone_exemplarCity)
                    .add("//ldml/dates/timeZoneNames/zone.*/short", Category.zone_short)
                    .add("//ldml/dates/timeZoneNames/zone", Category.zone_long)
                    .add(
                            "//ldml/dates/timeZoneNames/metazone.*/commonlyUsed",
                            Category.NOT_USED) // just to remove them from the other cases
                    // Short metazone names get their own category, mirroring zone_short above;
                    // previously they were bucketed as metazone_long, leaving metazone_short
                    // unreachable.
                    .add("//ldml/dates/timeZoneNames/metazone.*/short", Category.metazone_short)
                    .add("//ldml/dates/timeZoneNames/metazone", Category.metazone_long)
                    .add("//ldml/numbers/currencies/currency.*/symbol", Category.symbol)
                    .add(
                            "//ldml/numbers/currencies/currency.*/displayName.*@count",
                            Category.currencyName_count)
                    .add("//ldml/numbers/currencies/currency.*/displayName", Category.currencyName)
                    .add("//ldml/units/unit.*/unitPattern.*(past|future)", Category.relative)
                    .add("//ldml/units/unit.*/unitPattern", Category.unit_pattern)
            // ldml/localeDisplayNames/keys/key[@type=".*"]
            // ldml/localeDisplayNames/measurementSystemNames/measurementSystemName[@type=".*"]
            // ldml/localeDisplayNames/transformNames/transformName[@type=".*"]
            ;

    /** Expected casing per category for the locale currently being checked. */
    Map<Category, CasingTypeAndErrFlag> types = new EnumMap<>(Category.class);

    /** The buckets into which paths are grouped for casing purposes. */
    public enum Category {
        language,
        script,
        territory,
        variant,
        keyValue,
        month_narrow,
        month_format_except_narrow,
        month_standalone_except_narrow,
        day_narrow,
        day_format_except_narrow,
        day_standalone_except_narrow,
        era_narrow,
        era_abbr,
        era_name,
        quarter_narrow,
        quarter_abbreviated,
        quarter_format_wide,
        quarter_standalone_wide,
        calendar_field,
        zone_exemplarCity,
        zone_short,
        zone_long,
        NOT_USED,
        metazone_short,
        metazone_long,
        symbol,
        currencyName_count,
        currencyName,
        relative,
        unit_pattern,
        key;
    }

    // //ldml/numbers/currencies/currency[@type="ADP"]/displayName
    // //ldml/numbers/currencies/currency[@type="RON"]/displayName[@count="other"]
    // //ldml/numbers/currencies/currency[@type="BYB"]/symbol

    /**
     * Returns the category for {@code path}, as looked up in {@link #pathToBucket}. Callers in
     * this class treat a null result as "path is not subject to casing checks".
     */
    static Category getCategory(String path) {
        return pathToBucket.get(path);
    }

    /**
     * Calculates casing information using data from the specified CLDRFile.
     *
     * <p>For every category the values are tallied by {@link CasingType}. A category is declared
     * lowercase (or titlecase) only when that casing is at least {@code MIN_FACTOR} times as
     * frequent as the opposite casing and at least as frequent as "other"; otherwise it is {@link
     * CasingType#other}. For non-root locales, values inherited from root or code-fallback are
     * not counted.
     *
     * @param resolved the resolved CLDRFile to calculate casing information from
     * @return a map with one entry for every category except {@link Category#NOT_USED}
     */
    public static Map<Category, CasingType> getSamples(CLDRFile resolved) {
        // Use EnumMap instead of an array for type safety.
        Map<Category, Counter<CasingType>> counters = new EnumMap<>(Category.class);

        for (Category category : Category.values()) {
            counters.put(category, new Counter<CasingType>());
        }
        PathStarrer starrer = new PathStarrer();
        boolean isRoot = "root".equals(resolved.getLocaleID());
        // Only collected (and non-null) when debugging.
        Set<String> missing = DEBUG ? new TreeSet<String>() : null;

        for (String path : resolved) {
            if (!isRoot) {
                String locale2 = resolved.getSourceLocaleID(path, null);
                // constant-first comparison so that a null source locale cannot throw
                if ("root".equals(locale2) || "code-fallback".equals(locale2)) {
                    continue;
                }
            }
            // path is never null here, so this form also tolerates a null winning path
            if (!path.equals(resolved.getWinningPath(path))) {
                continue;
            }
            Category category = getCategory(path);
            if (category != null) {
                String value = resolved.getStringValue(path);
                if (value == null || value.isEmpty()) {
                    continue;
                }
                counters.get(category).add(CasingType.from(value), 1);
            } else if (missing != null) {
                missing.add(starrer.set(path));
            }
        }

        Map<Category, CasingType> info = new EnumMap<>(Category.class);
        for (Category category : Category.values()) {
            if (category == Category.NOT_USED) {
                continue;
            }
            Counter<CasingType> counter = counters.get(category);
            long countLower = counter.getCount(CasingType.lowercase);
            long countUpper = counter.getCount(CasingType.titlecase);
            long countOther = counter.getCount(CasingType.other);
            CasingType type;
            if (countLower + countUpper == 0) {
                type = CasingType.other;
            } else if (countLower >= countUpper * MIN_FACTOR && countLower >= countOther) {
                type = CasingType.lowercase;
            } else if (countUpper >= countLower * MIN_FACTOR && countUpper >= countOther) {
                type = CasingType.titlecase;
            } else {
                type = CasingType.other;
            }
            info.put(category, type);
        }
        if (missing != null && !missing.isEmpty()) {
            System.out.println("Paths skipped:\n" + Joiner.on("\n").join(missing));
        }
        return info;
    }

    private static final String CASE_WARNING =
            "The first letter of 〈{0}〉 is {1}, which differs from what is expected "
                    + "for the {2} category: that almost all values be {3}.\n\n";

    /**
     * Adds a warning or error to {@code result} if the casing of {@code value} is incompatible
     * with the casing expected for {@code category}.
     *
     * <p>A category with no entry in {@link #types} is treated as {@link
     * CasingTypeAndErrFlag#other_mismatchWarn}, which is compatible with every casing.
     */
    private void checkConsistentCasing(
            Category category,
            String path,
            String fullPath,
            String value,
            Options options,
            List<CheckStatus> result) {
        // Avoid NPE
        if (types == null) {
            return;
        }
        CasingType ft = CasingType.from(value);
        CasingTypeAndErrFlag typeAndFlagFromCat = types.get(category);
        if (typeAndFlagFromCat == null) {
            typeAndFlagFromCat = CasingTypeAndErrFlag.other_mismatchWarn;
        }
        if (ft.worksWith(typeAndFlagFromCat.type())) {
            return;
        }
        // The category decides whether a mismatch is an error or just a warning.
        String mainType =
                typeAndFlagFromCat.flag() ? CheckStatus.errorType : CheckStatus.warningType;
        result.add(
                new CheckStatus()
                        .setCause(this)
                        .setMainType(mainType)
                        .setSubtype(Subtype.incorrectCasing)
                        // the message; a MessageFormat pattern with arguments
                        .setMessage(CASE_WARNING, value, ft, category, typeAndFlagFromCat.type()));
    }
}