1 package org.unicode.cldr.test; 2 3 import java.util.Collections; 4 import java.util.EnumMap; 5 import java.util.List; 6 import java.util.Map; 7 import java.util.Set; 8 import java.util.TreeSet; 9 import java.util.regex.Matcher; 10 11 import org.unicode.cldr.draft.ScriptMetadata; 12 import org.unicode.cldr.draft.ScriptMetadata.Info; 13 import org.unicode.cldr.draft.ScriptMetadata.Trinary; 14 import org.unicode.cldr.test.CheckCLDR.CheckStatus.Subtype; 15 import org.unicode.cldr.tool.LikelySubtags; 16 import org.unicode.cldr.util.CLDRFile; 17 import org.unicode.cldr.util.CldrUtility; 18 import org.unicode.cldr.util.Counter; 19 import org.unicode.cldr.util.Factory; 20 import org.unicode.cldr.util.PathStarrer; 21 import org.unicode.cldr.util.PatternCache; 22 import org.unicode.cldr.util.RegexLookup; 23 24 import com.google.common.base.Joiner; 25 import com.ibm.icu.lang.UCharacter; 26 import com.ibm.icu.text.BreakIterator; 27 import com.ibm.icu.util.ULocale; 28 29 public class CheckConsistentCasing extends FactoryCheckCLDR { 30 31 private static final boolean DEBUG = CldrUtility.getProperty("DEBUG", false); 32 33 private static final double MIN_FACTOR = 2.5; 34 // remember to add this class to the list in CheckCLDR.getCheckAll 35 // to run just this test, on just locales starting with 'nl', use CheckCLDR with -fnl.* -t.*Currencies.* 36 37 ULocale uLocale = null; 38 BreakIterator breaker = null; 39 private String locale; 40 CasingInfo casingInfo; 41 private boolean hasCasingInfo; 42 CheckConsistentCasing(Factory factory)43 public CheckConsistentCasing(Factory factory) { 44 super(factory); 45 casingInfo = new CasingInfo(factory); 46 } 47 48 @Override setCldrFileToCheck(CLDRFile cldrFileToCheck, Options options, List<CheckStatus> possibleErrors)49 public CheckCLDR setCldrFileToCheck(CLDRFile cldrFileToCheck, Options options, 50 List<CheckStatus> possibleErrors) { 51 if (cldrFileToCheck == null) return this; 52 super.setCldrFileToCheck(cldrFileToCheck, options, possibleErrors); 53 locale = cldrFileToCheck.getLocaleID(); 54 // get info about casing; note that this is done in two steps since 55 // ScriptMetadata.getInfo() returns null, in some instances. 56 // OLD: Info localeInfo = ScriptMetadata.getInfo(locale); 57 String script = new LikelySubtags().getLikelyScript(locale); 58 Info localeInfo = ScriptMetadata.getInfo(script); 59 60 if (localeInfo != null && localeInfo.hasCase == Trinary.YES) { 61 // this script has casing info, so we can request it here 62 try { 63 types = casingInfo.getLocaleCasing(locale); 64 } catch (Exception e) { 65 types = Collections.emptyMap(); 66 } 67 } else { 68 // no casing info - since the types Map is global, and null checks aren't done, 69 // we are better off with an empty map here 70 types = Collections.emptyMap(); 71 } 72 if (types == null || types.isEmpty()) { 73 possibleErrors.add(new CheckStatus().setCause(this) 74 .setMainType(CheckStatus.warningType) 75 .setSubtype(Subtype.incorrectCasing) 76 .setMessage("Could not load casing info for {0}", locale)); 77 } 78 // types may be null, avoid NPE 79 hasCasingInfo = (types == null) ? false : types.size() > 0; 80 return this; 81 } 82 83 // If you don't need any file initialization or postprocessing, you only need this one routine 84 @Override handleCheck(String path, String fullPath, String value, Options options, List<CheckStatus> result)85 public CheckCLDR handleCheck(String path, String fullPath, String value, Options options, 86 List<CheckStatus> result) { 87 // it helps performance to have a quick reject of most paths 88 if (fullPath == null) return this; // skip paths that we don't have 89 if (!hasCasingInfo) return this; 90 91 String locale2 = getCldrFileToCheck().getSourceLocaleID(path, null); 92 if (locale2.equals(locale) && value != null && value.length() > 0) { 93 Category category = getCategory(path); 94 if (category != null) { 95 checkConsistentCasing(category, path, fullPath, value, options, result); 96 } 97 } 98 return this; 99 } 100 101 static final Matcher placeholder = PatternCache.get("\\{\\d+\\}").matcher(""); 102 103 /** 104 * The casing type of a given string. 105 */ 106 public enum CasingType { 107 titlecase, lowercase, other; from(String s)108 public static CasingType from(String s) { 109 if (s == null || s.length() == 0) { 110 return other; 111 } 112 int cp; 113 // Look for the first meaningful character in the string to determine case. 114 for (int i = 0; i < s.length(); i += Character.charCount(cp)) { 115 cp = s.codePointAt(i); 116 // used to skip the placeholders, but works better to have them be 'other' 117 // if (cp == '{') { 118 // if (placeholder.reset(s).region(i,s.length()).lookingAt()) { 119 // i = placeholder.end() - 1; // skip 120 // continue; 121 // } 122 // } 123 int type = UCharacter.getType(cp); 124 switch (type) { 125 126 case UCharacter.LOWERCASE_LETTER: 127 return lowercase; 128 129 case UCharacter.UPPERCASE_LETTER: 130 case UCharacter.TITLECASE_LETTER: 131 return titlecase; 132 133 // for other letters / numbers / symbols, return other 134 case UCharacter.OTHER_LETTER: 135 case UCharacter.DECIMAL_DIGIT_NUMBER: 136 case UCharacter.LETTER_NUMBER: 137 case UCharacter.OTHER_NUMBER: 138 case UCharacter.MATH_SYMBOL: 139 case UCharacter.CURRENCY_SYMBOL: 140 case UCharacter.MODIFIER_SYMBOL: 141 case UCharacter.OTHER_SYMBOL: 142 return other; 143 // ignore everything else (whitespace, punctuation, etc) and keep going 144 } 145 } 146 return other; 147 } 148 149 /** 150 * Return true if either is other, or they are identical. 151 */ worksWith(CasingType otherType)152 public boolean worksWith(CasingType otherType) { 153 return otherType == null || this == otherType || this == CasingType.other || otherType == CasingType.other; 154 } 155 } 156 157 public enum CasingTypeAndErrFlag { 158 titlecase_mismatchWarn(CasingType.titlecase, false), titlecase_mismatchErr(CasingType.titlecase, true), lowercase_mismatchWarn(CasingType.lowercase, 159 false), lowercase_mismatchErr(CasingType.lowercase, true), other_mismatchWarn(CasingType.other, false), other_mismatchErr(CasingType.other, true); 160 161 private final CasingType type; 162 private final boolean flag; // force error instead of warning for mismatch 163 CasingTypeAndErrFlag(CasingType type, boolean flag)164 private CasingTypeAndErrFlag(CasingType type, boolean flag) { 165 this.type = type; 166 this.flag = flag; 167 } 168 type()169 public CasingType type() { 170 return type; 171 } 172 flag()173 public boolean flag() { 174 return flag; 175 } 176 } 177 178 static final RegexLookup<Category> pathToBucket = new RegexLookup<Category>() 179 .add("//ldml/localeDisplayNames/languages/language", Category.language) 180 .add("//ldml/localeDisplayNames/scripts/script", Category.script) 181 .add("//ldml/localeDisplayNames/territories/territory", Category.territory) 182 .add("//ldml/localeDisplayNames/variants/variant", Category.variant) 183 .add("//ldml/localeDisplayNames/keys/key", Category.key) 184 .add("//ldml/localeDisplayNames/types/type", Category.keyValue) 185 .add("//ldml/dates/calendars/calendar.*/months.*narrow", Category.month_narrow) 186 .add("//ldml/dates/calendars/calendar.*/months.*format", Category.month_format_except_narrow) 187 .add("//ldml/dates/calendars/calendar.*/months", Category.month_standalone_except_narrow) 188 .add("//ldml/dates/calendars/calendar.*/days.*narrow", Category.day_narrow) 189 .add("//ldml/dates/calendars/calendar.*/days.*format", Category.day_format_except_narrow) 190 .add("//ldml/dates/calendars/calendar.*/days", Category.day_standalone_except_narrow) 191 .add("//ldml/dates/calendars/calendar.*/eras/eraNarrow", Category.era_narrow) 192 .add("//ldml/dates/calendars/calendar.*/eras/eraAbbr", Category.era_abbr) 193 .add("//ldml/dates/calendars/calendar.*/eras/", Category.era_name) 194 .add("//ldml/dates/calendars/calendar.*/quarters.*narrow", Category.quarter_narrow) 195 .add("//ldml/dates/calendars/calendar.*/quarters.*abbreviated", Category.quarter_abbreviated) 196 .add("//ldml/dates/calendars/calendar.*/quarters.*format", Category.quarter_format_wide) 197 .add("//ldml/dates/calendars/calendar.*/quarters", Category.quarter_standalone_wide) 198 .add("//ldml/.*/relative", Category.relative) 199 .add("//ldml/dates/fields", Category.calendar_field) 200 .add("//ldml/dates/timeZoneNames/zone.*/exemplarCity", Category.zone_exemplarCity) 201 .add("//ldml/dates/timeZoneNames/zone.*/short", Category.zone_short) 202 .add("//ldml/dates/timeZoneNames/zone", Category.zone_long) 203 .add("//ldml/dates/timeZoneNames/metazone.*/commonlyUsed", Category.NOT_USED) // just to remove them from the other cases 204 .add("//ldml/dates/timeZoneNames/metazone.*/short", Category.metazone_long) 205 .add("//ldml/dates/timeZoneNames/metazone", Category.metazone_long) 206 .add("//ldml/numbers/currencies/currency.*/symbol", Category.symbol) 207 .add("//ldml/numbers/currencies/currency.*/displayName.*@count", Category.currencyName_count) 208 .add("//ldml/numbers/currencies/currency.*/displayName", Category.currencyName) 209 .add("//ldml/units/unit.*/unitPattern.*(past|future)", Category.relative) 210 .add("//ldml/units/unit.*/unitPattern", Category.unit_pattern) 211 // ldml/localeDisplayNames/keys/key[@type=".*"] 212 // ldml/localeDisplayNames/measurementSystemNames/measurementSystemName[@type=".*"] 213 // ldml/localeDisplayNames/transformNames/transformName[@type=".*"] 214 ; 215 216 Map<Category, CasingTypeAndErrFlag> types = new EnumMap<>(Category.class); 217 218 public enum Category { 219 language, script, territory, variant, keyValue, month_narrow, month_format_except_narrow, month_standalone_except_narrow, day_narrow, day_format_except_narrow, day_standalone_except_narrow, era_narrow, era_abbr, era_name, quarter_narrow, quarter_abbreviated, quarter_format_wide, quarter_standalone_wide, calendar_field, zone_exemplarCity, zone_short, zone_long, NOT_USED, metazone_short, metazone_long, symbol, currencyName_count, currencyName, relative, unit_pattern, key; 220 } 221 222 // //ldml/numbers/currencies/currency[@type="ADP"]/displayName 223 // //ldml/numbers/currencies/currency[@type="RON"]/displayName[@count="other"] 224 // //ldml/numbers/currencies/currency[@type="BYB"]/symbol 225 getCategory(String path)226 static Category getCategory(String path) { 227 return pathToBucket.get(path); 228 } 229 230 /** 231 * Calculates casing information using data from the specified CLDRFile. 232 * 233 * @param resolved 234 * the resolved CLDRFile to calculate casing information from 235 * @return 236 */ getSamples(CLDRFile resolved)237 public static Map<Category, CasingType> getSamples(CLDRFile resolved) { 238 // Use EnumMap instead of an array for type safety. 239 Map<Category, Counter<CasingType>> counters = new EnumMap<>(Category.class); 240 241 for (Category category : Category.values()) { 242 counters.put(category, new Counter<CasingType>()); 243 } 244 PathStarrer starrer = new PathStarrer(); 245 boolean isRoot = "root".equals(resolved.getLocaleID()); 246 Set<String> missing = !DEBUG ? null : new TreeSet<>(); 247 248 for (String path : resolved) { 249 if (!isRoot) { 250 String locale2 = resolved.getSourceLocaleID(path, null); 251 if (locale2.equals("root") || locale2.equals("code-fallback")) { 252 continue; 253 } 254 } 255 String winningPath = resolved.getWinningPath(path); 256 if (!winningPath.equals(path)) { 257 continue; 258 } 259 Category category = getCategory(path); 260 if (category != null) { 261 String value = resolved.getStringValue(path); 262 if (value == null || value.length() == 0) continue; 263 CasingType ft = CasingType.from(value); 264 counters.get(category).add(ft, 1); 265 } else if (DEBUG) { 266 String starred = starrer.set(path); 267 missing.add(starred); 268 } 269 } 270 271 Map<Category, CasingType> info = new EnumMap<>(Category.class); 272 for (Category category : Category.values()) { 273 if (category == Category.NOT_USED) continue; 274 Counter<CasingType> counter = counters.get(category); 275 long countLower = counter.getCount(CasingType.lowercase); 276 long countUpper = counter.getCount(CasingType.titlecase); 277 long countOther = counter.getCount(CasingType.other); 278 CasingType type; 279 if (countLower + countUpper == 0) { 280 type = CasingType.other; 281 } else if (countLower >= countUpper * MIN_FACTOR && countLower >= countOther) { 282 type = CasingType.lowercase; 283 } else if (countUpper >= countLower * MIN_FACTOR && countUpper >= countOther) { 284 type = CasingType.titlecase; 285 } else { 286 type = CasingType.other; 287 } 288 info.put(category, type); 289 } 290 if (DEBUG && missing.size() != 0) { 291 System.out.println("Paths skipped:\n" + Joiner.on("\n").join(missing)); 292 } 293 return info; 294 } 295 296 private static final String CASE_WARNING = "The first letter of 〈{0}〉 is {1}, which differs from what is expected " + 297 "for the {2} category: that almost all values be {3}.\n\n" + 298 "For guidance, see http://cldr.org/translation/capitalization. " + 299 "If this warning is wrong, please file a ticket at http://unicode.org/cldr/trac/."; 300 checkConsistentCasing(Category category, String path, String fullPath, String value, Options options, List<CheckStatus> result)301 private void checkConsistentCasing(Category category, String path, String fullPath, String value, 302 Options options, List<CheckStatus> result) { 303 // Avoid NPE 304 if (types != null) { 305 CasingType ft = CasingType.from(value); 306 CasingTypeAndErrFlag typeAndFlagFromCat = types.get(category); 307 if (typeAndFlagFromCat == null) { 308 typeAndFlagFromCat = CasingTypeAndErrFlag.other_mismatchWarn; 309 } 310 if (!ft.worksWith(typeAndFlagFromCat.type())) { 311 result.add(new CheckStatus().setCause(this) 312 .setMainType(typeAndFlagFromCat.flag() ? CheckStatus.errorType : CheckStatus.warningType) 313 .setSubtype(Subtype.incorrectCasing) // typically warningType or errorType 314 .setMessage(CASE_WARNING, value, ft, category, typeAndFlagFromCat.type())); // the message; can be MessageFormat with arguments 315 } 316 } 317 } 318 }