package org.unicode.cldr.test;

import java.util.Collections;
import java.util.EnumMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import java.util.regex.Matcher;

import org.unicode.cldr.draft.ScriptMetadata;
import org.unicode.cldr.draft.ScriptMetadata.Info;
import org.unicode.cldr.draft.ScriptMetadata.Trinary;
import org.unicode.cldr.test.CheckCLDR.CheckStatus.Subtype;
import org.unicode.cldr.tool.LikelySubtags;
import org.unicode.cldr.util.CLDRFile;
import org.unicode.cldr.util.CldrUtility;
import org.unicode.cldr.util.Counter;
import org.unicode.cldr.util.Factory;
import org.unicode.cldr.util.PathStarrer;
import org.unicode.cldr.util.PatternCache;
import org.unicode.cldr.util.RegexLookup;
import org.unicode.cldr.util.XPathParts;

import com.ibm.icu.dev.util.CollectionUtilities;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.BreakIterator;
import com.ibm.icu.util.ULocale;

/**
 * Checks that the first cased letter of a value matches the casing recorded for the
 * value's {@link Category} in the locale being checked, and reports a warning or an
 * error (depending on the recorded {@link CasingTypeAndErrFlag}) when it does not.
 */
public class CheckConsistentCasing extends FactoryCheckCLDR {

    private static final boolean DEBUG = CldrUtility.getProperty("DEBUG", false);

    /**
     * Used by {@link #getSamples(CLDRFile)}: one case must outnumber the opposite case
     * by at least this factor to be taken as the category's casing.
     */
    private static final double MIN_FACTOR = 2.5;
    // remember to add this class to the list in CheckCLDR.getCheckAll
    // to run just this test, on just locales starting with 'nl', use CheckCLDR with -fnl.* -t.*Currencies.*

    XPathParts parts = new XPathParts(); // used to parse out a path
    ULocale uLocale = null;
    BreakIterator breaker = null;
    /** ID of the locale currently being checked; set by setCldrFileToCheck. */
    private String locale;
    CasingInfo casingInfo;
    /** True when {@link #types} is non-null and non-empty, i.e. there is something to check against. */
    private boolean hasCasingInfo;

    public CheckConsistentCasing(Factory factory) {
        super(factory);
        casingInfo = new CasingInfo(factory);
    }

    /**
     * Prepares the check for a new file: records its locale ID and loads the per-category
     * casing expectations into {@link #types}.
     *
     * <p>Casing data is only requested when the locale's likely script is known to have case;
     * otherwise (or when loading throws) {@link #types} becomes an empty map. Whenever no
     * casing data ends up available, a warning is appended to {@code possibleErrors} and
     * {@link #handleCheck} becomes a no-op for this file.
     *
     * @param cldrFileToCheck the file to check; when null nothing is changed
     * @param options check options, passed through to the superclass
     * @param possibleErrors receives a warning when no casing info is available
     * @return this
     */
    @Override
    public CheckCLDR setCldrFileToCheck(CLDRFile cldrFileToCheck, Options options,
        List<CheckStatus> possibleErrors) {
        if (cldrFileToCheck == null) {
            return this;
        }
        super.setCldrFileToCheck(cldrFileToCheck, options, possibleErrors);
        locale = cldrFileToCheck.getLocaleID();
        // get info about casing; note that this is done in two steps since
        // ScriptMetadata.getInfo() returns null, in some instances.
        // OLD: Info localeInfo = ScriptMetadata.getInfo(locale);
        String script = new LikelySubtags().getLikelyScript(locale);
        Info localeInfo = ScriptMetadata.getInfo(script);

        if (localeInfo != null && localeInfo.hasCase == Trinary.YES) {
            // this script has casing info, so we can request it here
            try {
                types = casingInfo.getLocaleCasing(locale);
            } catch (Exception e) {
                // Best effort: a failure to load is reported through the warning added below.
                types = Collections.emptyMap();
            }
        } else {
            // no casing info - since the types Map is global, and null checks aren't done,
            // we are better off with an empty map here
            types = Collections.emptyMap();
        }
        if (types == null || types.isEmpty()) {
            possibleErrors.add(new CheckStatus().setCause(this)
                .setMainType(CheckStatus.warningType)
                .setSubtype(Subtype.incorrectCasing)
                .setMessage("Could not load casing info for {0}", locale));
        }
        // types may be null, avoid NPE
        hasCasingInfo = types != null && !types.isEmpty();
        return this;
    }

    /**
     * Checks the casing of a single value. The value is only examined when casing info is
     * available, the value is non-empty, the path belongs to a known {@link Category}, and
     * the value's source locale is the locale being checked (so inherited values are skipped).
     *
     * @param path the distinguishing path of the value
     * @param fullPath the full path; a null full path means the item is skipped
     * @param value the value to check
     * @param options check options
     * @param result receives any casing warning or error
     * @return this
     */
    // If you don't need any file initialization or postprocessing, you only need this one routine
    @Override
    public CheckCLDR handleCheck(String path, String fullPath, String value, Options options,
        List<CheckStatus> result) {
        // it helps performance to have a quick reject of most paths
        if (fullPath == null) {
            return this; // skip paths that we don't have
        }
        if (!hasCasingInfo) {
            return this;
        }

        String locale2 = getCldrFileToCheck().getSourceLocaleID(path, null);
        // Null-safe comparison: a path without a source locale is skipped instead of throwing.
        if (locale2 != null && locale2.equals(locale) && value != null && value.length() > 0) {
            Category category = getCategory(path);
            if (category != null) {
                checkConsistentCasing(category, path, fullPath, value, options, result);
            }
        }
        return this;
    }

    /**
     * Matches a message placeholder such as {@code {0}}. Only referenced by the disabled
     * placeholder-skipping code in {@link CasingType#from(String)}.
     */
    static final Matcher placeholder = PatternCache.get("\\{\\d+\\}").matcher("");

    /**
     * The casing type of a given string.
     */
    public enum CasingType {
        titlecase, lowercase, other;

        /**
         * Classifies a string by its first meaningful character: a lowercase letter gives
         * {@link #lowercase}; an uppercase or titlecase letter gives {@link #titlecase};
         * any other letter, number or symbol gives {@link #other}. Characters of all other
         * types (whitespace, punctuation, etc.) are skipped.
         *
         * @param s the string to classify; may be null
         * @return the casing type; {@link #other} for null or empty input, or when no
         *         meaningful character is found
         */
        public static CasingType from(String s) {
            if (s == null || s.length() == 0) {
                return other;
            }
            // Look for the first meaningful character in the string to determine case.
            for (int i = 0; i < s.length();) {
                int cp = s.codePointAt(i);
                i += Character.charCount(cp);
                // used to skip the placeholders, but works better to have them be 'other'
                // if (cp == '{') {
                //     if (placeholder.reset(s).region(i,s.length()).lookingAt()) {
                //         i = placeholder.end() - 1; // skip
                //         continue;
                //     }
                // }
                int type = UCharacter.getType(cp);
                switch (type) {

                case UCharacter.LOWERCASE_LETTER:
                    return lowercase;

                case UCharacter.UPPERCASE_LETTER:
                case UCharacter.TITLECASE_LETTER:
                    return titlecase;

                // for other letters / numbers / symbols, return other
                case UCharacter.OTHER_LETTER:
                case UCharacter.DECIMAL_DIGIT_NUMBER:
                case UCharacter.LETTER_NUMBER:
                case UCharacter.OTHER_NUMBER:
                case UCharacter.MATH_SYMBOL:
                case UCharacter.CURRENCY_SYMBOL:
                case UCharacter.MODIFIER_SYMBOL:
                case UCharacter.OTHER_SYMBOL:
                    return other;

                default:
                    // ignore everything else (whitespace, punctuation, etc) and keep going
                    break;
                }
            }
            return other;
        }

        /**
         * Return true if {@code otherType} is null, if either type is {@link #other},
         * or if the two types are identical.
         */
        public boolean worksWith(CasingType otherType) {
            return otherType == null
                || this == otherType
                || this == CasingType.other
                || otherType == CasingType.other;
        }
    }

    /**
     * Pairs an expected {@link CasingType} with a flag saying whether a mismatch is
     * reported as an error rather than a warning.
     */
    public enum CasingTypeAndErrFlag {
        titlecase_mismatchWarn(CasingType.titlecase, false),
        titlecase_mismatchErr(CasingType.titlecase, true),
        lowercase_mismatchWarn(CasingType.lowercase, false),
        lowercase_mismatchErr(CasingType.lowercase, true),
        other_mismatchWarn(CasingType.other, false),
        other_mismatchErr(CasingType.other, true);

        private final CasingType type;
        private final boolean flag; // force error instead of warning for mismatch

        private CasingTypeAndErrFlag(CasingType type, boolean flag) {
            this.type = type;
            this.flag = flag;
        }

        /** @return the expected casing type */
        public CasingType type() {
            return type;
        }

        /** @return true when a mismatch must be reported as an error instead of a warning */
        public boolean flag() {
            return flag;
        }
    }

    /**
     * Maps path patterns to categories. Within each group the more specific pattern is
     * listed before the catch-all pattern for the same element.
     */
    static final RegexLookup<Category> pathToBucket = new RegexLookup<Category>()
        .add("//ldml/localeDisplayNames/languages/language", Category.language)
        .add("//ldml/localeDisplayNames/scripts/script", Category.script)
        .add("//ldml/localeDisplayNames/territories/territory", Category.territory)
        .add("//ldml/localeDisplayNames/variants/variant", Category.variant)
        .add("//ldml/localeDisplayNames/keys/key", Category.key)
        .add("//ldml/localeDisplayNames/types/type", Category.keyValue)
        .add("//ldml/dates/calendars/calendar.*/months.*narrow", Category.month_narrow)
        .add("//ldml/dates/calendars/calendar.*/months.*format", Category.month_format_except_narrow)
        .add("//ldml/dates/calendars/calendar.*/months", Category.month_standalone_except_narrow)
        .add("//ldml/dates/calendars/calendar.*/days.*narrow", Category.day_narrow)
        .add("//ldml/dates/calendars/calendar.*/days.*format", Category.day_format_except_narrow)
        .add("//ldml/dates/calendars/calendar.*/days", Category.day_standalone_except_narrow)
        .add("//ldml/dates/calendars/calendar.*/eras/eraNarrow", Category.era_narrow)
        .add("//ldml/dates/calendars/calendar.*/eras/eraAbbr", Category.era_abbr)
        .add("//ldml/dates/calendars/calendar.*/eras/", Category.era_name)
        .add("//ldml/dates/calendars/calendar.*/quarters.*narrow", Category.quarter_narrow)
        .add("//ldml/dates/calendars/calendar.*/quarters.*abbreviated", Category.quarter_abbreviated)
        .add("//ldml/dates/calendars/calendar.*/quarters.*format", Category.quarter_format_wide)
        .add("//ldml/dates/calendars/calendar.*/quarters", Category.quarter_standalone_wide)
        .add("//ldml/.*/relative", Category.relative)
        .add("//ldml/dates/fields", Category.calendar_field)
        .add("//ldml/dates/timeZoneNames/zone.*/exemplarCity", Category.zone_exemplarCity)
        .add("//ldml/dates/timeZoneNames/zone.*/short", Category.zone_short)
        .add("//ldml/dates/timeZoneNames/zone", Category.zone_long)
        .add("//ldml/dates/timeZoneNames/metazone.*/commonlyUsed", Category.NOT_USED) // just to remove them from the other cases
        // Short metazone names get their own category, mirroring zone_short / zone_long.
        // This entry previously pointed at metazone_long, which left metazone_short unreachable.
        .add("//ldml/dates/timeZoneNames/metazone.*/short", Category.metazone_short)
        .add("//ldml/dates/timeZoneNames/metazone", Category.metazone_long)
        .add("//ldml/numbers/currencies/currency.*/symbol", Category.symbol)
        .add("//ldml/numbers/currencies/currency.*/displayName.*@count", Category.currencyName_count)
        .add("//ldml/numbers/currencies/currency.*/displayName", Category.currencyName)
        .add("//ldml/units/unit.*/unitPattern.*(past|future)", Category.relative)
        .add("//ldml/units/unit.*/unitPattern", Category.unit_pattern)
    // ldml/localeDisplayNames/keys/key[@type=".*"]
    // ldml/localeDisplayNames/measurementSystemNames/measurementSystemName[@type=".*"]
    // ldml/localeDisplayNames/transformNames/transformName[@type=".*"]
    ;

    /** Expected casing per category for the current locale; replaced by setCldrFileToCheck. */
    Map<Category, CasingTypeAndErrFlag> types = new EnumMap<Category, CasingTypeAndErrFlag>(Category.class);

    /** The buckets of paths whose values are expected to share a casing. */
    public enum Category {
        language,
        script,
        territory,
        variant,
        keyValue,
        month_narrow,
        month_format_except_narrow,
        month_standalone_except_narrow,
        day_narrow,
        day_format_except_narrow,
        day_standalone_except_narrow,
        era_narrow,
        era_abbr,
        era_name,
        quarter_narrow,
        quarter_abbreviated,
        quarter_format_wide,
        quarter_standalone_wide,
        calendar_field,
        zone_exemplarCity,
        zone_short,
        zone_long,
        NOT_USED,
        metazone_short,
        metazone_long,
        symbol,
        currencyName_count,
        currencyName,
        relative,
        unit_pattern,
        key;
    }

    // //ldml/numbers/currencies/currency[@type="ADP"]/displayName
    // //ldml/numbers/currencies/currency[@type="RON"]/displayName[@count="other"]
    // //ldml/numbers/currencies/currency[@type="BYB"]/symbol

    /**
     * Looks up the category of a path in {@link #pathToBucket}.
     *
     * @param path the path to classify
     * @return the lookup result; callers in this class treat null as "no category"
     */
    static Category getCategory(String path) {
        return pathToBucket.get(path);
    }

    /**
     * Calculates casing information using data from the specified CLDRFile.
     *
     * <p>For every path that has a category and a non-empty value, the casing of the value is
     * counted. Paths whose winning path differs from the path itself are skipped, and for
     * non-root locales so are paths whose source locale is "root" or "code-fallback". A
     * category is then assigned lowercase or titlecase only when that case outnumbers the
     * opposite case by at least {@code MIN_FACTOR} and is at least as frequent as "other";
     * in every other situation the category is assigned "other".
     *
     * @param resolved
     *            the resolved CLDRFile to calculate casing information from
     * @return a map from every category except {@link Category#NOT_USED} to the casing type
     *         calculated for it
     */
    public static Map<Category, CasingType> getSamples(CLDRFile resolved) {
        // Use EnumMap instead of an array for type safety.
        Map<Category, Counter<CasingType>> counters = new EnumMap<Category, Counter<CasingType>>(Category.class);

        for (Category category : Category.values()) {
            counters.put(category, new Counter<CasingType>());
        }
        PathStarrer starrer = new PathStarrer();
        boolean isRoot = "root".equals(resolved.getLocaleID());
        // Only collected (and only non-null) in DEBUG mode.
        Set<String> missing = !DEBUG ? null : new TreeSet<String>();

        for (String path : resolved) {
            if (!isRoot) {
                String locale2 = resolved.getSourceLocaleID(path, null);
                // Constant-first comparison so a null source locale cannot throw.
                if ("root".equals(locale2) || "code-fallback".equals(locale2)) {
                    continue;
                }
            }
            String winningPath = resolved.getWinningPath(path);
            if (!path.equals(winningPath)) {
                continue;
            }
            Category category = getCategory(path);
            if (category != null) {
                String value = resolved.getStringValue(path);
                if (value == null || value.length() == 0) {
                    continue;
                }
                CasingType ft = CasingType.from(value);
                counters.get(category).add(ft, 1);
            } else if (DEBUG) {
                String starred = starrer.set(path);
                missing.add(starred);
            }
        }

        Map<Category, CasingType> info = new EnumMap<Category, CasingType>(Category.class);
        for (Category category : Category.values()) {
            if (category == Category.NOT_USED) {
                continue;
            }
            Counter<CasingType> counter = counters.get(category);
            long countLower = counter.getCount(CasingType.lowercase);
            long countUpper = counter.getCount(CasingType.titlecase);
            long countOther = counter.getCount(CasingType.other);
            CasingType type;
            if (countLower + countUpper == 0) {
                type = CasingType.other;
            } else if (countLower >= countUpper * MIN_FACTOR && countLower >= countOther) {
                type = CasingType.lowercase;
            } else if (countUpper >= countLower * MIN_FACTOR && countUpper >= countOther) {
                type = CasingType.titlecase;
            } else {
                type = CasingType.other;
            }
            info.put(category, type);
        }
        if (DEBUG && !missing.isEmpty()) {
            System.out.println("Paths skipped:\n" + CollectionUtilities.join(missing, "\n"));
        }
        return info;
    }

    private static final String CASE_WARNING = "The first letter of 〈{0}〉 is {1}, which differs from what is expected " +
        "for the {2} category: that almost all values be {3}.\n\n" +
        "For guidance, see http://cldr.org/translation/capitalization. " +
        "If this warning is wrong, please file a ticket at http://unicode.org/cldr/trac/.";

    /**
     * Compares the casing of {@code value} with the casing expected for {@code category} and
     * appends a status to {@code result} when the two do not work together. A category with
     * no entry in {@link #types} is treated as {@link CasingTypeAndErrFlag#other_mismatchWarn},
     * which never produces a mismatch.
     */
    private void checkConsistentCasing(Category category, String path, String fullPath, String value,
        Options options, List<CheckStatus> result) {
        // Avoid NPE
        if (types == null) {
            return;
        }
        CasingType ft = CasingType.from(value);
        CasingTypeAndErrFlag typeAndFlagFromCat = types.get(category);
        if (typeAndFlagFromCat == null) {
            typeAndFlagFromCat = CasingTypeAndErrFlag.other_mismatchWarn;
        }
        if (!ft.worksWith(typeAndFlagFromCat.type())) {
            result.add(new CheckStatus().setCause(this)
                // typically warningType or errorType
                .setMainType(typeAndFlagFromCat.flag() ? CheckStatus.errorType : CheckStatus.warningType)
                .setSubtype(Subtype.incorrectCasing)
                // the message; can be MessageFormat with arguments
                .setMessage(CASE_WARNING, value, ft, category, typeAndFlagFromCat.type()));
        }
    }
}