package org.unicode.cldr.test;

import java.util.Collections;
import java.util.EnumMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import java.util.regex.Matcher;

import org.unicode.cldr.draft.ScriptMetadata;
import org.unicode.cldr.draft.ScriptMetadata.Info;
import org.unicode.cldr.draft.ScriptMetadata.Trinary;
import org.unicode.cldr.test.CheckCLDR.CheckStatus.Subtype;
import org.unicode.cldr.tool.LikelySubtags;
import org.unicode.cldr.util.CLDRFile;
import org.unicode.cldr.util.CLDRURLS;
import org.unicode.cldr.util.CldrUtility;
import org.unicode.cldr.util.Counter;
import org.unicode.cldr.util.Factory;
import org.unicode.cldr.util.PathStarrer;
import org.unicode.cldr.util.PatternCache;
import org.unicode.cldr.util.RegexLookup;
import org.unicode.cldr.util.SpecialLocales;

import com.google.common.base.Joiner;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.BreakIterator;
import com.ibm.icu.util.ULocale;

/**
 * Checks that the casing of the first meaningful character of a value agrees with the casing
 * expected for the {@link Category} its path belongs to.
 *
 * <p>The expected casing per category is loaded through {@link CasingInfo} in
 * {@link #setCldrFileToCheck}; mismatches are reported as warnings or errors depending on the
 * flag carried by {@link CasingTypeAndErrFlag}.
 */
public class CheckConsistentCasing extends FactoryCheckCLDR {

    private static final boolean DEBUG = CldrUtility.getProperty("DEBUG", false);

    /** How many times more frequent one casing must be than the other to be considered dominant. */
    private static final double MIN_FACTOR = 2.5;
    // remember to add this class to the list in CheckCLDR.getCheckAll
    // to run just this test, on just locales starting with 'nl', use CheckCLDR with -fnl.* -t.*Currencies.*

    ULocale uLocale = null;
    BreakIterator breaker = null;
    private String locale;
    CasingInfo casingInfo;
    private boolean hasCasingInfo;

    /**
     * Creates the check and the {@link CasingInfo} helper used to load per-locale casing data.
     *
     * @param factory the factory passed on to the superclass and to {@link CasingInfo}
     */
    public CheckConsistentCasing(Factory factory) {
        super(factory);
        casingInfo = new CasingInfo(factory);
    }

    /**
     * Prepares the check for a file: records its locale ID and loads the expected casing per
     * category into {@link #types}.
     *
     * <p>Casing data is only requested when the locale's likely script is known to have case;
     * otherwise (or when loading throws) {@code types} becomes an empty map. When no casing data
     * is available and the locale is not a scratch locale, a warning is added to
     * {@code possibleErrors}.
     *
     * @param cldrFileToCheck the file to check; when {@code null} nothing is done
     * @param options         passed through to the superclass
     * @param possibleErrors  receives the "could not load casing info" warning, if any
     * @return this check
     */
    @Override
    public CheckCLDR setCldrFileToCheck(CLDRFile cldrFileToCheck, Options options,
        List<CheckStatus> possibleErrors) {
        if (cldrFileToCheck == null) {
            return this;
        }
        super.setCldrFileToCheck(cldrFileToCheck, options, possibleErrors);
        locale = cldrFileToCheck.getLocaleID();

        // Get info about casing; note that this is done in two steps since
        // ScriptMetadata.getInfo() returns null in some instances when given a locale directly.
        String script = new LikelySubtags().getLikelyScript(locale);
        Info localeInfo = ScriptMetadata.getInfo(script);

        if (localeInfo != null && localeInfo.hasCase == Trinary.YES) {
            // this script has casing info, so we can request it here
            try {
                types = casingInfo.getLocaleCasing(locale);
            } catch (Exception ignored) {
                // Deliberately best-effort: the failure is surfaced by the warning added below.
                types = Collections.emptyMap();
            }
        } else {
            // No casing info. Since the types map is shared by the whole instance, an empty map
            // is safer than null here.
            types = Collections.emptyMap();
        }

        boolean noCasingData = types == null || types.isEmpty();
        if (noCasingData && !SpecialLocales.isScratchLocale(locale)) {
            possibleErrors.add(new CheckStatus().setCause(this)
                .setMainType(CheckStatus.warningType)
                .setSubtype(Subtype.incorrectCasing)
                .setMessage("Could not load casing info for {0}", locale));
        }
        // types may be null (getLocaleCasing result), so noCasingData covers that case too
        hasCasingInfo = !noCasingData;
        return this;
    }

    /**
     * Checks one path. Only values that actually come from the locale being checked (not
     * inherited ones), are non-empty, and whose path maps to a {@link Category} are examined.
     *
     * @return this check
     */
    @Override
    public CheckCLDR handleCheck(String path, String fullPath, String value, Options options,
        List<CheckStatus> result) {
        // it helps performance to have a quick reject of most paths
        if (fullPath == null) {
            return this; // skip paths that we don't have
        }
        if (!hasCasingInfo) {
            return this;
        }

        String locale2 = getCldrFileToCheck().getSourceLocaleID(path, null);
        // Guard against a null source locale instead of dereferencing it blindly.
        boolean fromThisLocale = locale2 != null && locale2.equals(locale);
        if (fromThisLocale && value != null && value.length() > 0) {
            Category category = getCategory(path);
            if (category != null) {
                checkConsistentCasing(category, path, fullPath, value, options, result);
            }
        }
        return this;
    }

    // Matches message-format placeholders such as {0}. Not referenced by the code in this class
    // any more (placeholders used to be skipped in CasingType.from).
    static final Matcher placeholder = PatternCache.get("\\{\\d+\\}").matcher("");

    /**
     * The casing type of a given string.
     */
    public enum CasingType {
        titlecase, lowercase, other;

        /**
         * Determines the casing type from the first meaningful character of {@code s}.
         *
         * <p>Whitespace, punctuation and any other character type not listed in the switch are
         * skipped. A lowercase letter yields {@link #lowercase}; an uppercase or titlecase letter
         * yields {@link #titlecase}; other letters, numbers and symbols yield {@link #other}.
         *
         * @param s the string to classify; {@code null} or empty yields {@link #other}
         * @return the casing type, never {@code null}
         */
        public static CasingType from(String s) {
            if (s == null || s.length() == 0) {
                return other;
            }
            int cp;
            // Look for the first meaningful character in the string to determine case.
            // Placeholders ({0} etc.) used to be skipped here, but it works better to have them
            // be 'other'.
            for (int i = 0; i < s.length(); i += Character.charCount(cp)) {
                cp = s.codePointAt(i);
                int type = UCharacter.getType(cp);
                switch (type) {

                case UCharacter.LOWERCASE_LETTER:
                    return lowercase;

                case UCharacter.UPPERCASE_LETTER:
                case UCharacter.TITLECASE_LETTER:
                    return titlecase;

                // for other letters / numbers / symbols, return other
                case UCharacter.OTHER_LETTER:
                case UCharacter.DECIMAL_DIGIT_NUMBER:
                case UCharacter.LETTER_NUMBER:
                case UCharacter.OTHER_NUMBER:
                case UCharacter.MATH_SYMBOL:
                case UCharacter.CURRENCY_SYMBOL:
                case UCharacter.MODIFIER_SYMBOL:
                case UCharacter.OTHER_SYMBOL:
                    return other;

                default:
                    // ignore everything else (whitespace, punctuation, etc) and keep going
                    break;
                }
            }
            return other;
        }

        /**
         * Returns true if {@code otherType} is null, if either type is {@link #other}, or if the
         * two types are identical.
         */
        public boolean worksWith(CasingType otherType) {
            return otherType == null
                || this == otherType
                || this == CasingType.other
                || otherType == CasingType.other;
        }
    }

    /**
     * A {@link CasingType} paired with a flag saying whether a mismatch should be reported as an
     * error ({@code true}) rather than a warning ({@code false}).
     */
    public enum CasingTypeAndErrFlag {
        titlecase_mismatchWarn(CasingType.titlecase, false),
        titlecase_mismatchErr(CasingType.titlecase, true),
        lowercase_mismatchWarn(CasingType.lowercase, false),
        lowercase_mismatchErr(CasingType.lowercase, true),
        other_mismatchWarn(CasingType.other, false),
        other_mismatchErr(CasingType.other, true);

        private final CasingType type;
        private final boolean flag; // force error instead of warning for mismatch

        private CasingTypeAndErrFlag(CasingType type, boolean flag) {
            this.type = type;
            this.flag = flag;
        }

        /** @return the expected casing type */
        public CasingType type() {
            return type;
        }

        /** @return true if a mismatch is to be reported as an error instead of a warning */
        public boolean flag() {
            return flag;
        }
    }

    // Maps path patterns to categories. Entries are listed from more specific to more general
    // within each group (e.g. "...months.*narrow" before "...months").
    static final RegexLookup<Category> pathToBucket = new RegexLookup<Category>()
        .add("//ldml/localeDisplayNames/languages/language", Category.language)
        .add("//ldml/localeDisplayNames/scripts/script", Category.script)
        .add("//ldml/localeDisplayNames/territories/territory", Category.territory)
        .add("//ldml/localeDisplayNames/variants/variant", Category.variant)
        .add("//ldml/localeDisplayNames/keys/key", Category.key)
        .add("//ldml/localeDisplayNames/types/type", Category.keyValue)
        .add("//ldml/dates/calendars/calendar.*/months.*narrow", Category.month_narrow)
        .add("//ldml/dates/calendars/calendar.*/months.*format", Category.month_format_except_narrow)
        .add("//ldml/dates/calendars/calendar.*/months", Category.month_standalone_except_narrow)
        .add("//ldml/dates/calendars/calendar.*/days.*narrow", Category.day_narrow)
        .add("//ldml/dates/calendars/calendar.*/days.*format", Category.day_format_except_narrow)
        .add("//ldml/dates/calendars/calendar.*/days", Category.day_standalone_except_narrow)
        .add("//ldml/dates/calendars/calendar.*/eras/eraNarrow", Category.era_narrow)
        .add("//ldml/dates/calendars/calendar.*/eras/eraAbbr", Category.era_abbr)
        .add("//ldml/dates/calendars/calendar.*/eras/", Category.era_name)
        .add("//ldml/dates/calendars/calendar.*/quarters.*narrow", Category.quarter_narrow)
        .add("//ldml/dates/calendars/calendar.*/quarters.*abbreviated", Category.quarter_abbreviated)
        .add("//ldml/dates/calendars/calendar.*/quarters.*format", Category.quarter_format_wide)
        .add("//ldml/dates/calendars/calendar.*/quarters", Category.quarter_standalone_wide)
        .add("//ldml/.*/relative", Category.relative)
        .add("//ldml/dates/fields", Category.calendar_field)
        .add("//ldml/dates/timeZoneNames/zone.*/exemplarCity", Category.zone_exemplarCity)
        .add("//ldml/dates/timeZoneNames/zone.*/short", Category.zone_short)
        .add("//ldml/dates/timeZoneNames/zone", Category.zone_long)
        .add("//ldml/dates/timeZoneNames/metazone.*/commonlyUsed", Category.NOT_USED) // just to remove them from the other cases
        // NOTE(review): short metazone names are bucketed as metazone_long, and
        // Category.metazone_short is never produced by this table. This looks like a copy/paste
        // slip, but it is left unchanged because stored casing data may rely on the current
        // bucketing -- confirm before switching to Category.metazone_short.
        .add("//ldml/dates/timeZoneNames/metazone.*/short", Category.metazone_long)
        .add("//ldml/dates/timeZoneNames/metazone", Category.metazone_long)
        .add("//ldml/numbers/currencies/currency.*/symbol", Category.symbol)
        .add("//ldml/numbers/currencies/currency.*/displayName.*@count", Category.currencyName_count)
        .add("//ldml/numbers/currencies/currency.*/displayName", Category.currencyName)
        .add("//ldml/units/unit.*/unitPattern.*(past|future)", Category.relative)
        .add("//ldml/units/unit.*/unitPattern", Category.unit_pattern)
    // Not (yet) categorized:
    // ldml/localeDisplayNames/measurementSystemNames/measurementSystemName[@type=".*"]
    // ldml/localeDisplayNames/transformNames/transformName[@type=".*"]
    ;

    // Expected casing per category for the current locale; replaced in setCldrFileToCheck.
    Map<Category, CasingTypeAndErrFlag> types = new EnumMap<>(Category.class);

    /** The buckets that paths are sorted into for the purpose of casing consistency. */
    public enum Category {
        language, script, territory, variant, keyValue,
        month_narrow, month_format_except_narrow, month_standalone_except_narrow,
        day_narrow, day_format_except_narrow, day_standalone_except_narrow,
        era_narrow, era_abbr, era_name,
        quarter_narrow, quarter_abbreviated, quarter_format_wide, quarter_standalone_wide,
        calendar_field,
        zone_exemplarCity, zone_short, zone_long,
        NOT_USED,
        metazone_short, metazone_long,
        symbol, currencyName_count, currencyName,
        relative, unit_pattern, key;
    }

    // Example paths:
    // //ldml/numbers/currencies/currency[@type="ADP"]/displayName
    // //ldml/numbers/currencies/currency[@type="RON"]/displayName[@count="other"]
    // //ldml/numbers/currencies/currency[@type="BYB"]/symbol

    /**
     * Looks up the category for a path.
     *
     * @param path the path to categorize
     * @return the result of the {@link #pathToBucket} lookup; callers in this class treat
     *         {@code null} as "no category"
     */
    static Category getCategory(String path) {
        return pathToBucket.get(path);
    }

    /**
     * Calculates casing information using data from the specified CLDRFile.
     *
     * <p>For every category, the values of all winning paths are classified with
     * {@link CasingType#from}. A category is reported as lowercase (or titlecase) only if that
     * casing is at least {@link #MIN_FACTOR} times as frequent as the opposite casing and at
     * least as frequent as {@code other}; otherwise it is reported as {@code other}. For
     * non-root locales, values inherited from "root" or "code-fallback" are not counted.
     *
     * @param resolved
     *            the resolved CLDRFile to calculate casing information from
     * @return a map with an entry for every category except {@link Category#NOT_USED}
     */
    public static Map<Category, CasingType> getSamples(CLDRFile resolved) {
        // Use EnumMap instead of an array for type safety.
        Map<Category, Counter<CasingType>> counters = new EnumMap<>(Category.class);
        for (Category category : Category.values()) {
            counters.put(category, new Counter<CasingType>());
        }

        PathStarrer starrer = new PathStarrer();
        boolean isRoot = "root".equals(resolved.getLocaleID());
        // Only collected (and printed) in DEBUG mode.
        Set<String> missing = !DEBUG ? null : new TreeSet<>();

        for (String path : resolved) {
            if (!isRoot) {
                String locale2 = resolved.getSourceLocaleID(path, null);
                // Constant-first comparison so a null source locale cannot cause an NPE.
                if ("root".equals(locale2) || "code-fallback".equals(locale2)) {
                    continue;
                }
            }
            String winningPath = resolved.getWinningPath(path);
            if (!winningPath.equals(path)) {
                continue;
            }
            Category category = getCategory(path);
            if (category != null) {
                String value = resolved.getStringValue(path);
                if (value == null || value.length() == 0) {
                    continue;
                }
                CasingType ft = CasingType.from(value);
                counters.get(category).add(ft, 1);
            } else if (DEBUG) {
                String starred = starrer.set(path);
                missing.add(starred);
            }
        }

        Map<Category, CasingType> info = new EnumMap<>(Category.class);
        for (Category category : Category.values()) {
            if (category == Category.NOT_USED) {
                continue;
            }
            Counter<CasingType> counter = counters.get(category);
            long countLower = counter.getCount(CasingType.lowercase);
            long countUpper = counter.getCount(CasingType.titlecase);
            long countOther = counter.getCount(CasingType.other);
            CasingType type;
            if (countLower + countUpper == 0) {
                type = CasingType.other;
            } else if (countLower >= countUpper * MIN_FACTOR && countLower >= countOther) {
                type = CasingType.lowercase;
            } else if (countUpper >= countLower * MIN_FACTOR && countUpper >= countOther) {
                type = CasingType.titlecase;
            } else {
                type = CasingType.other;
            }
            info.put(category, type);
        }
        if (DEBUG && missing.size() != 0) {
            System.out.println("Paths skipped:\n" + Joiner.on("\n").join(missing));
        }
        return info;
    }

    private static final String CASE_WARNING = "The first letter of 〈{0}〉 is {1}, which differs from what is expected " +
        "for the {2} category: that almost all values be {3}.\n\n" +
        "For guidance, see " + CLDRURLS.CAPITALIZATION_URL + ". " +
        "If this warning is wrong, please file a ticket at " + CLDRURLS.CLDR_NEWTICKET_URL + ".";

    /**
     * Compares the casing of {@code value} with the casing expected for {@code category} and adds
     * a status to {@code result} on mismatch. Categories without an entry in {@link #types} are
     * treated as {@link CasingTypeAndErrFlag#other_mismatchWarn}, which never mismatches.
     */
    private void checkConsistentCasing(Category category, String path, String fullPath, String value,
        Options options, List<CheckStatus> result) {
        // Avoid NPE
        if (types == null) {
            return;
        }
        CasingType ft = CasingType.from(value);
        CasingTypeAndErrFlag typeAndFlagFromCat = types.get(category);
        if (typeAndFlagFromCat == null) {
            typeAndFlagFromCat = CasingTypeAndErrFlag.other_mismatchWarn;
        }
        if (!ft.worksWith(typeAndFlagFromCat.type())) {
            result.add(new CheckStatus().setCause(this)
                // error or warning, depending on the flag stored with the expected casing
                .setMainType(typeAndFlagFromCat.flag() ? CheckStatus.errorType : CheckStatus.warningType)
                .setSubtype(Subtype.incorrectCasing)
                // the message is a MessageFormat pattern with arguments
                .setMessage(CASE_WARNING, value, ft, category, typeAndFlagFromCat.type()));
        }
    }
}