/*
 ******************************************************************************
 * Copyright (C) 2005-2012, International Business Machines Corporation and   *
 * others. All Rights Reserved.                                               *
 ******************************************************************************
 */
package org.unicode.cldr.test;

import java.util.BitSet;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.unicode.cldr.test.CheckCLDR.CheckStatus.Subtype;
import org.unicode.cldr.util.CLDRConfig;
import org.unicode.cldr.util.CLDRFile;
import org.unicode.cldr.util.CLDRFile.Status;
import org.unicode.cldr.util.Factory;
import org.unicode.cldr.util.InternalCldrException;
import org.unicode.cldr.util.LocaleIDParser;
import org.unicode.cldr.util.PatternCache;
import org.unicode.cldr.util.PatternPlaceholders;
import org.unicode.cldr.util.PatternPlaceholders.PlaceholderInfo;
import org.unicode.cldr.util.PatternPlaceholders.PlaceholderStatus;
import org.unicode.cldr.util.SupplementalDataInfo;
import org.unicode.cldr.util.SupplementalDataInfo.BasicLanguageData;
import org.unicode.cldr.util.SupplementalDataInfo.BasicLanguageData.Type;
import org.unicode.cldr.util.SupplementalDataInfo.CurrencyDateInfo;
import org.unicode.cldr.util.UnicodeSetPrettyPrinter;
import org.unicode.cldr.util.XMLSource;
import org.unicode.cldr.util.XPathParts;

import com.google.common.base.Joiner;
import com.google.common.collect.Multiset;
import com.google.common.collect.Multiset.Entry;
import com.google.common.collect.TreeMultiset;
import com.ibm.icu.impl.Relation;
import com.ibm.icu.lang.UScript;
import com.ibm.icu.text.Collator;
import com.ibm.icu.text.DateTimePatternGenerator;
import com.ibm.icu.text.Normalizer2;
import com.ibm.icu.text.PluralRules;
import com.ibm.icu.text.Transform;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.text.PluralRules.PluralType;
import com.ibm.icu.util.ULocale;

/**
 * Checks that the characters used in a locale's values are covered by that locale's exemplar
 * character sets (main, auxiliary, optionally punctuation and digits), and performs a few related
 * checks on the way: placeholder counts and numbering, digits in date patterns, letters in range
 * patterns, parentheses in units, and leading/trailing whitespace.
 *
 * <p>An instance carries per-file state set up by {@link #setCldrFileToCheck} (and reusable
 * matchers/buffers), so it is not written for concurrent use.
 */
public class CheckForExemplars extends FactoryCheckCLDR {
    /** Bidi control characters; added to the allowed set when the exemplars contain RTL characters. */
    private static final UnicodeSet RTL_CONTROLS = new UnicodeSet("[\\u061C\\u200E\\u200F\\u202A-\\u202D\\u2066-\\u2069]").freeze();

    /** Characters with bidi class AL or R; used to detect right-to-left locales from their exemplars. */
    private static final UnicodeSet RTL = new UnicodeSet("[[:bc=AL:][:bc=R:]]").freeze();

    /** Replacement text substituted for placeholders and date-pattern fields before checking characters. */
    private static final String STAND_IN = "#";

    // private final UnicodeSet commonAndInherited = new UnicodeSet(CheckExemplars.Allowed).complement();
    // "[[:script=common:][:script=inherited:][:alphabetic=false:]]");

    /** Paths containing any of these substrings are not checked at all. */
    static String[] EXEMPLAR_SKIPS = {
        "/currencySpacing",
        "/exemplarCharacters",
        // "/pattern",
        "/localizedPatternChars",
        "/segmentations",
        "/references",
        "/localeDisplayNames/variants/",
        "/commonlyUsed",
        "/defaultNumberingSystem",
        "/otherNumberingSystems",
        "/exponential",
        "/nan",
        "/scientificFormats",
        "/inText",
        "/orientation",
        "/symbol[@alt=\"narrow\"]",
        "/characters/parseLenients"
    };

    /** Paths containing any of these substrings hold date/time patterns; only their literal text is checked. */
    static String[] DATE_PARTS = {
        "/hourFormat",
        "/dateFormatItem",
        "/intervalFormatItem",
        "/dateFormatLength",
        "timeFormatLength"
    };

    static final UnicodeSet START_PAREN = new UnicodeSet("[[:Ps:]]").freeze();
    static final UnicodeSet END_PAREN = new UnicodeSet("[[:Pe:]]").freeze();
    static final UnicodeSet ALL_CURRENCY_SYMBOLS = new UnicodeSet("[[:Sc:]]").freeze();
    static final UnicodeSet LETTER = new UnicodeSet("[[A-Za-z]]").freeze();
    static final UnicodeSet NUMBERS = new UnicodeSet("[[:N:]]").freeze();
    static final UnicodeSet DISALLOWED_HOUR_FORMAT = new UnicodeSet("[[:letter:]]").remove('H').remove('m').freeze();
    static final UnicodeSet DISALLOWED_IN_RANGE = new UnicodeSet("[:L:]").freeze();

    /** Characters allowed outside parentheses; built in {@link #setCldrFileToCheck}. */
    private UnicodeSet exemplars;
    /** {@link #exemplars} plus printable ASCII; the set allowed inside parentheses. */
    private UnicodeSet exemplarsPlusAscii;
    //private static final UnicodeSet DISALLOWED_IN_scriptRegionExemplars = new UnicodeSet("[()();,;,]").freeze();
    //private static final UnicodeSet DISALLOWED_IN_scriptRegionExemplarsWithParens = new UnicodeSet("[;,;,]").freeze();

    // Hack until cldrbug 6566 is fixed. TODO
    private static final Pattern IGNORE_PLACEHOLDER_PARENTHESES = PatternCache.get("\\p{Ps}#\\p{Pe}");
    // For the following: traditional placeholders just have {0}, {1}, {2}, ...
    // But personName namePattern placeHolders start with [a-z], then continue with [0-9a-zA-Z-]+
    // They need to be distinguished from non-placeholder patterns using {} in UnicodeSets
    public static final Pattern PLACEHOLDER = PatternCache.get("\\{[0-9a-zA-Z-]+\\}");

    // private UnicodeSet currencySymbolExemplars;
    /** True while no usable exemplar set is available (root locale, or setup failed); disables handleCheck. */
    private boolean skip;
    private Collator col;
    // NOTE(review): spaceCol is configured in setCldrFileToCheck but never read in this file;
    // the pretty printer's space comparator is given col instead. Confirm which was intended.
    private Collator spaceCol;
    UnicodeSetPrettyPrinter prettyPrint;
    private Status otherPathStatus = new Status();
    private Matcher patternMatcher = PLACEHOLDER.matcher("");
    private boolean errorDefaultOption;

    // for extracting date pattern text
    private DateTimePatternGenerator.FormatParser formatParser = new DateTimePatternGenerator.FormatParser();
    StringBuilder justText = new StringBuilder();

    // public static final Pattern SUPPOSED_TO_BE_MESSAGE_FORMAT_PATTERN = PatternCache.get("/(" +
    // "codePattern" +
    // "|dateRangePattern" +
    // "|dateTimeFormat[^/]*?/pattern" +
    // "|appendItem" +
    // "|intervalFormatFallback" +
    // "|hoursFormat" +
    // "|gmtFormat" +
    // "|regionFormat" +
    // "|fallbackFormat" +
    // "|unitPattern.*@count=\"(zero|one|two|few|many|other)\"" +
    // "|localePattern" +
    // "|localeKeyTypePattern" +
    // "|listPatternPart" +
    // "|ellipsis" +
    // "|monthPattern" +
    // ")");
    // private Matcher supposedToBeMessageFormat = SUPPOSED_TO_BE_MESSAGE_FORMAT_PATTERN.matcher("");

    /** Paths matching this pattern may legitimately start or end with whitespace. */
    public static final Pattern LEAD_OR_TRAIL_WHITESPACE_OK = PatternCache.get("/(" +
        "references/reference" +
        "|insertBetween" +
        ")");
    private Matcher leadOrTrailWhitespaceOk = LEAD_OR_TRAIL_WHITESPACE_OK.matcher("");

    private static final UnicodeSet ASCII = new UnicodeSet("[\\u0020-\\u007F]").freeze();

    private PatternPlaceholders patternPlaceholders = PatternPlaceholders.getInstance();
    private SupplementalDataInfo sdi;
    /** Lazily built by {@link #getCurrenciesForScript}: script code to currently valid legal-tender currencies. */
    private Relation scriptToCurrencies;

    /**
     * Creates the check for the given factory and obtains the shared supplemental data.
     *
     * @param factory passed through to the superclass
     */
    public CheckForExemplars(Factory factory) {
        super(factory);
        // patternPlaceholders = RegexLookup.of(new PlaceholderTransform())
        // .loadFromFile(PatternPlaceholders.class, "data/Placeholders.txt");
        sdi = SupplementalDataInfo.getInstance();
    }

    /**
     * Splits a source string on semicolon-plus-whitespace into parts and collects the text before
     * the first '=' of each part (trimmed), in encounter order.
     * Adapted from GenerateXMB.MapTransform
     *
     * @author jchye
     *
     */
    static class PlaceholderTransform implements Transform<String, Set<String>> {
        /**
         * @param source parts separated by ';' followed by whitespace; each part is expected to
         *               contain '=' (a part without one makes {@code substring} throw)
         * @return the ordered set of names found before the '=' in each part
         */
        @Override
        public Set<String> transform(String source) {
            Set<String> placeholders = new LinkedHashSet<>();
            String[] parts = source.split(";\\s+");
            for (String part : parts) {
                int equalsPos = part.indexOf('=');
                String placeholder = part.substring(0, equalsPos).trim();
                placeholders.add(placeholder);
            }
            return placeholders;
        }
    }

    /**
     * Prepares the allowed-character sets for the given file. Nothing is set up for a null file
     * or for root; if the main exemplar set cannot be obtained an error is added to
     * {@code possibleErrors} and checking stays disabled for this file.
     *
     * @param cldrFile       the file about to be checked; may be null
     * @param options        check options; the presence of {@code exemplarErrors} upgrades
     *                       exemplar problems from warnings to errors
     * @param possibleErrors receives any setup errors
     * @return this
     */
    @Override
    public CheckCLDR setCldrFileToCheck(CLDRFile cldrFile, Options options, List<CheckStatus> possibleErrors) {
        if (cldrFile == null) return this;
        skip = true; // stays true unless setup below completes
        super.setCldrFileToCheck(cldrFile, options, possibleErrors);
        if (cldrFile.getLocaleID().equals("root")) {
            return this;
        }

        errorDefaultOption = options.get(Options.Option.exemplarErrors) != null;

        String locale = cldrFile.getLocaleID();
        col = Collator.getInstance(new ULocale(locale));
        spaceCol = Collator.getInstance(new ULocale(locale));
        spaceCol.setStrength(Collator.PRIMARY);

        CLDRFile resolvedFile = getResolvedCldrFileToCheck();
        boolean[] ok = new boolean[1];
        exemplars = safeGetExemplars("", possibleErrors, resolvedFile, ok);

        if (exemplars == null) {
            CheckStatus item = new CheckStatus().setCause(this).setMainType(CheckStatus.errorType)
                .setSubtype(Subtype.noExemplarCharacters)
                .setMessage("No Exemplar Characters: {0}", new Object[] { this.getClass().getName() });
            possibleErrors.add(item);
            return this;
        } else if (!ok[0]) {
            exemplars = new UnicodeSet();
        } else {
            exemplars = new UnicodeSet(exemplars); // modifiable copy
        }

        boolean isRTL = RTL.containsSome(exemplars);
        if (isRTL) {
            exemplars.addAll(RTL_CONTROLS);
        }
        // UnicodeSet temp = resolvedFile.getExemplarSet("standard");
        // if (temp != null) exemplars.addAll(temp);
        UnicodeSet auxiliary = safeGetExemplars("auxiliary", possibleErrors, resolvedFile, ok);
        if (auxiliary != null) {
            exemplars.addAll(auxiliary);
        }

        if (CheckExemplars.USE_PUNCTUATION) {
            UnicodeSet punctuation = safeGetExemplars("punctuation", possibleErrors, resolvedFile, ok);
            if (punctuation != null) {
                exemplars.addAll(punctuation);
            }

            UnicodeSet numbers = getNumberSystemExemplars();
            exemplars.addAll(numbers);

            // TODO fix replacement character
            exemplars.add(STAND_IN);
        }

        exemplars.addAll(CheckExemplars.AlwaysOK).freeze();
        exemplarsPlusAscii = new UnicodeSet(exemplars).addAll(ASCII).freeze();

        skip = false;
        // Note the precedence below: setStrength2 applies only to the ROOT fallback collator,
        // not to col.
        prettyPrint = new UnicodeSetPrettyPrinter()
            .setOrdering(col != null ? col : Collator.getInstance(ULocale.ROOT))
            .setSpaceComparator(col != null ? col : Collator.getInstance(ULocale.ROOT)
                .setStrength2(Collator.PRIMARY))
            .setCompressRanges(true);
        return this;
    }

    /**
     * Returns the digits of the numbering system named by the checked file's
     * {@code //ldml/numbers/defaultNumberingSystem} value, as a new set.
     */
    private UnicodeSet getNumberSystemExemplars() {
        String numberSystem = getCldrFileToCheck().getStringValue("//ldml/numbers/defaultNumberingSystem");
        String digits = sdi.getDigits(numberSystem);
        return new UnicodeSet().addAll(digits);
    }

    /**
     * Fetches the winning exemplar set of the given type, converting an
     * {@link IllegalArgumentException} into a {@code couldNotAccessExemplars} error.
     *
     * @param type           exemplar type as understood by {@code CLDRFile.getExemplarSet}
     *                       ("" is used for the main set)
     * @param possibleErrors receives an error entry if the lookup throws
     * @param resolvedFile   file to read from
     * @param ok             single-element out parameter: true if the lookup did not throw
     * @return the set from the file, or null if the lookup threw or returned null
     */
    private UnicodeSet safeGetExemplars(String type, List<CheckStatus> possibleErrors, CLDRFile resolvedFile,
        boolean[] ok) {
        UnicodeSet result = null;
        try {
            result = resolvedFile.getExemplarSet(type, CLDRFile.WinningChoice.WINNING);
            ok[0] = true;
        } catch (IllegalArgumentException iae) {
            possibleErrors.add(new CheckStatus()
                .setCause(this).setMainType(CheckStatus.errorType).setSubtype(Subtype.couldNotAccessExemplars)
                .setMessage("Could not get exemplar set: " + iae.toString()));
            ok[0] = false;
        }
        return result;
    }

    /**
     * Checks one path/value pair and appends any problems to {@code result}.
     *
     * @param path     distinguishing path; must not be null once the earlier guards pass
     * @param fullPath full path; a null value means the item is absent and is skipped
     * @param value    the value to check; null is skipped
     * @param options  unused here
     * @param result   receives the problems found
     * @return this
     * @throws InternalCldrException    if {@code path} is null or no file has been set
     * @throws IllegalArgumentException from the exemplarCity branch if no likely subtag is
     *                                  available to determine the locale's script
     */
    @Override
    public CheckCLDR handleCheck(String path, String fullPath, String value,
        Options options, List<CheckStatus> result) {
        if (fullPath == null) return this; // skip paths that we don't have
        if (value == null) return this; // skip values that we don't have ?
        if (skip) return this;
        if (path == null) {
            throw new InternalCldrException("Empty path!");
        } else if (getCldrFileToCheck() == null) {
            throw new InternalCldrException("no file to check!");
        }
        String sourceLocale = getResolvedCldrFileToCheck().getSourceLocaleID(path, otherPathStatus);

        // if we are an alias to another path, then skip
        // if (!path.equals(otherPathStatus.pathWhereFound)) {
        // return this;
        // }

        // now check locale source
        if (XMLSource.CODE_FALLBACK_ID.equals(sourceLocale)) {
            return this;
            // } else if ("root".equals(sourceLocale)) {
            // // skip eras for non-gregorian
            // if (true) return this;
            // if (path.indexOf("/calendar") >= 0 && path.indexOf("gregorian") <= 0) return this;
        }

        if (containsPart(path, EXEMPLAR_SKIPS)) {
            return this;
        }

        // Only values that come from the checked locale itself can be errors; inherited ones warn.
        CheckStatus.Type errorOption = errorDefaultOption && sourceLocale.equals(getResolvedCldrFileToCheck().getLocaleID())
            ? CheckStatus.errorType : CheckStatus.warningType;

        // From here on, value has its placeholders replaced by STAND_IN.
        value = checkAndReplacePlaceholders(path, value, result);
        if (path.startsWith("//ldml/numbers/miscPatterns") && path.contains("[@type=\"range\"]")) {
            if (DISALLOWED_IN_RANGE.containsSome(value)) {
                result
                    .add(new CheckStatus()
                        .setCause(this)
                        .setMainType(CheckStatus.errorType)
                        .setSubtype(Subtype.illegalCharactersInPattern)
                        .setMessage(
                            "Range patterns should not have letters.",
                            new Object[] {}));
            }
        }
        // Now handle date patterns.
        if (containsPart(path, DATE_PARTS)) {
            if (!extractDatePatternText(value, STAND_IN, justText)) {
                return this; // we are done, no text.
            }
            value = justText.toString();
            if (NUMBERS.containsSome(value)) {
                UnicodeSet digitsFound = new UnicodeSet().addAll(value).retainAll(NUMBERS);
                addMissingMessage(digitsFound, CheckStatus.errorType,
                    Subtype.patternCannotContainDigits,
                    Subtype.patternCannotContainDigits,
                    "cannot occur in date or time patterns", result);
            }
            if (path.endsWith("/hourFormat")) {
                UnicodeSet badHourChars = new UnicodeSet().addAll(value)
                    .retainAll(DISALLOWED_HOUR_FORMAT);
                if (!badHourChars.isEmpty()) {
                    addMissingMessage(badHourChars, CheckStatus.errorType,
                        Subtype.patternContainsInvalidCharacters,
                        Subtype.patternContainsInvalidCharacters,
                        "cannot occur in the hour format", result);
                }
            }
        }

        if (path.startsWith("//ldml/posix/messages")) return this;

        UnicodeSet disallowed;

        if (path.contains("/currency") && path.contains("/symbol")) {
            if (null != (disallowed = containsAllCountingParens(exemplars, exemplarsPlusAscii, value))) {
                disallowed.removeAll(ALL_CURRENCY_SYMBOLS);
                disallowed.removeAll(LETTER); // Allow ASCII A-Z in currency symbols
                if (disallowed.size() > 0) {
                    // && asciiNotAllowed(getCldrFileToCheck().getLocaleID(), currency)) {
                    addNotInExemplarsMessage(disallowed, errorOption, result);
                }
            }
        } else if (path.contains("/gmtFormat") || path.contains("/gmtZeroFormat")) {
            if (null != (disallowed = containsAllCountingParens(exemplars, exemplarsPlusAscii, value))) {
                disallowed.removeAll(LETTER); // Allow ASCII A-Z in gmtFormat and gmtZeroFormat
                if (disallowed.size() > 0) {
                    addNotInExemplarsMessage(disallowed, errorOption, result);
                }
            }
        } else if (path.contains("/months") || path.contains("/quarters")) {
            if (null != (disallowed = containsAllCountingParens(exemplars, exemplarsPlusAscii, value))) {
                disallowed.removeAll("IVXivx"); // Allow Roman-numeral letters in month or quarter names
                if (path.contains("/calendar[@type=\"generic\"]/months")) {
                    disallowed.removeAll("M"); // Generic-calendar month names contain 'M' and do not get modified
                }
                if (disallowed.size() > 0) {
                    addNotInExemplarsMessage(disallowed, errorOption, result);
                }
            }
        } else if (path.contains("/localeDisplayNames") && !path.contains("/localeDisplayPattern")) {
            // test first for outside of the set.
            if (null != (disallowed = containsAllCountingParens(exemplars, exemplarsPlusAscii, value))) {
                if (path.contains("[@type=\"iso8601\"]")) {
                    disallowed.removeAll("ISO"); // Name of ISO8601 calendar may contain "ISO" regardless of native script
                }
                if (disallowed.size() > 0) {
                    addNotInExemplarsMessage(disallowed, errorOption, result);
                }
            }
            if (path.contains("/codePatterns")) {
                disallowed = new UnicodeSet().addAll(value).retainAll(NUMBERS);
                if (!disallowed.isEmpty()) {
                    addMissingMessage(disallowed, CheckStatus.errorType,
                        Subtype.patternCannotContainDigits,
                        Subtype.patternCannotContainDigits,
                        "cannot occur in locale fields", result);
                }
            }
        } else if (path.contains("/units")) {
            // Parentheses that merely wrap a placeholder are tolerated; any others are reported.
            String noValidParentheses = IGNORE_PLACEHOLDER_PARENTHESES.matcher(value).replaceAll("");
            disallowed = new UnicodeSet().addAll(START_PAREN).addAll(END_PAREN)
                .retainAll(noValidParentheses);
            if (!disallowed.isEmpty()) {
                addMissingMessage(disallowed, CheckStatus.errorType,
                    Subtype.parenthesesNotAllowed,
                    Subtype.parenthesesNotAllowed,
                    "cannot occur in units", result);
            }
        } else if (path.endsWith("/exemplarCity")) {
            disallowed = containsAllCountingParens(exemplars, exemplarsPlusAscii, value);
            if (disallowed != null) {
                if ("root".equals(sourceLocale)) {
                    return this;
                }
                // Characters in the source locale's own script are accepted even if not exemplars.
                int myscript = UScript.getCodeFromName(getScriptForLocale(sourceLocale));
                UnicodeSet toRemove = new UnicodeSet();
                for (int i = 0; i < disallowed.size(); i++) {
                    int c = disallowed.charAt(i);
                    if (UScript.getScript(c) == myscript) {
                        toRemove.add(c);
                    }
                }
                disallowed.removeAll(toRemove);
                if (disallowed.size() > 0) {
                    addNotInExemplarsMessage(disallowed, errorOption, result);
                }
            }
        } else if (path.contains("/annotations") && !path.contains("[@type")) {
            // Annotation problems are always warnings, regardless of errorOption.
            if (null != (disallowed = containsAllCountingParens(exemplars, exemplarsPlusAscii, value))) {
                addNotInExemplarsMessage(disallowed, CheckStatus.warningType, result);
            }
        } else {
            if (null != (disallowed = containsAllCountingParens(exemplars, exemplarsPlusAscii, value))) {
                addNotInExemplarsMessage(disallowed, errorOption, result);
            }
        }

        // check for spaces

        if (!value.equals(value.trim()) && !path.contains("/foreignSpaceReplacement")) { // foreignSpaceReplacement value can be just space
            if (!leadOrTrailWhitespaceOk.reset(path).find()) {
                result.add(new CheckStatus().setCause(this).setMainType(CheckStatus.errorType)
                    .setSubtype(Subtype.mustNotStartOrEndWithSpace)
                    .setMessage("This item must not start or end with whitespace, or be empty."));
            }
        }
        // if (value.contains(" ")) {
        // result.add(new
        // CheckStatus().setCause(this).setMainType(CheckStatus.errorType).setSubtype(Subtype.mustNotStartOrEndWithSpace)
        // .setMessage("This item must not contain two space characters in a row."));
        // }
        return this;
    }

    /** Reports {@code disallowed} with the standard "not in the exemplar characters" subtypes and wording. */
    private void addNotInExemplarsMessage(UnicodeSet disallowed, CheckStatus.Type type, List<CheckStatus> result) {
        addMissingMessage(disallowed, type,
            Subtype.charactersNotInMainOrAuxiliaryExemplars,
            Subtype.asciiCharactersNotInMainOrAuxiliaryExemplars,
            "are not in the exemplar characters", result);
    }

    /**
     * Returns the script subtag of {@code localeID}, consulting the likely-subtags data (first for
     * the full ID, then for its language) when the ID carries no explicit script.
     *
     * @throws IllegalArgumentException if neither lookup yields a likely-subtags entry
     */
    private String getScriptForLocale(String localeID) {
        LocaleIDParser parser = new LocaleIDParser().set(localeID);
        String script = parser.getScript();
        if (script.length() == 0) {
            String likely = sdi.getLikelySubtags().get(localeID);
            if (likely == null) {
                likely = sdi.getLikelySubtags().get(parser.getLanguage());
                if (likely == null) {
                    throw new IllegalArgumentException(
                        "A likely subtag for " + parser.getLanguage() +
                            " is required to get its script.");
                }
            }
            script = parser.set(likely).getScript();
        }
        return script;
    }

    /**
     * Validates the placeholders in {@code value} against what {@link PatternPlaceholders}
     * expects for {@code path} (numbering gaps, duplicates, too few, too many) and returns the
     * value with every placeholder replaced by {@link #STAND_IN}.
     *
     * @param path   path used to look up the expected placeholders
     * @param value  the pattern text
     * @param result receives any placeholder problems; warnings in the BUILD phase, errors otherwise
     * @return {@code value} unchanged if it has no placeholders, otherwise with them replaced
     */
    private String checkAndReplacePlaceholders(String path, String value, List<CheckStatus> result) {
        CheckStatus.Type statusType = getPhase() == Phase.BUILD ? CheckStatus.warningType : CheckStatus.errorType; // new errors, so get past the tests.

        // Get information about what should be there
        PlaceholderStatus placeholderStatus = patternPlaceholders.getStatus(path);
        Map<String, PlaceholderInfo> placeholderInfo = patternPlaceholders.get(path);

        int minimum = placeholderInfo.size();
        int maximum = placeholderInfo.size();

        if (placeholderStatus == PlaceholderStatus.LOCALE_DEPENDENT || placeholderStatus == PlaceholderStatus.MULTIPLE) {
            // if locale dependent, it is because of count= or ordinal=. Figure out what the values are, and whether we are allowed to have none or one
            XPathParts parts = XPathParts.getFrozenInstance(path);
            PluralRules.PluralType ptype = PluralType.CARDINAL;
            String keyword = parts.getAttributeValue(-1, "count");
            if (keyword == null) {
                keyword = parts.getAttributeValue(-1, "ordinal");
                ptype = PluralType.ORDINAL;
            }
            SupplementalDataInfo supplementalData = CLDRConfig.getInstance().getSupplementalDataInfo();
            PluralRules rules = supplementalData.getPluralRules(new ULocale(getCldrFileToCheck().getLocaleID()), ptype);
            if (rules != null) {
                try {
                    // A keyword that stands for exactly one number may spell it out, so no placeholder is required.
                    if (rules.getUniqueKeywordValue(keyword) != PluralRules.NO_UNIQUE_VALUE) {
                        minimum = 0;
                    }
                } catch (Exception ignored) {
                    // internal error, skip: deliberately best-effort, the minimum stays as it was
                }
            }
        } else if (placeholderStatus == PlaceholderStatus.OPTIONAL) {
            minimum = 1;
        }

        // TODO: move these tests to CheckPlaceholder

        // Now see what is there, and see if they match
        Matcher matcher = patternMatcher.reset(value);
        Multiset<String> matchList = TreeMultiset.create(); // Look for duplicate values.
        while (matcher.find()) {
            matchList.add(matcher.group());
        }
        final Set<String> distinctPlaceholders = matchList.elementSet();
        int countDistinctPlaceholders = distinctPlaceholders.size();

        // OPTIONAL is excluded here; the numeric parse below only works for {0}, {1}, ... style placeholders.
        if (countDistinctPlaceholders > 0 && placeholderStatus != PlaceholderStatus.OPTIONAL) {
            // Verify that all placeholders are monotonically increasing from zero.
            int expected = 0;
            for (String element : distinctPlaceholders) {
                // int elementValue = Integer.parseInt(element, 1, element.length()-1, 10);
                int elementValue = Integer.parseInt(element.substring(1, element.length() - 1), 10);
                if (elementValue != expected) {
                    result.add(new CheckStatus().setCause(this).setMainType(statusType)
                        .setSubtype(Subtype.gapsInPlaceholderNumbers)
                        .setMessage("Placeholders {0} should be strictly increasing, starting at zero.", distinctPlaceholders));
                    break;
                }
                ++expected;
            }
        }

        // Check if duplicates are allowed
        if (matchList.size() > countDistinctPlaceholders && placeholderStatus != PlaceholderStatus.MULTIPLE) {
            Set<String> errors = new LinkedHashSet<>();
            for (Entry<String> entry : matchList.entrySet()) {
                if (entry.getCount() > 1) {
                    errors.add(entry.getElement());
                }
            }
            result.add(new CheckStatus().setCause(this).setMainType(statusType)
                .setSubtype(Subtype.duplicatePlaceholders)
                .setMessage("Duplicate placeholders: {0}.", Joiner.on(", ").join(errors)));
        }

        // Now see if the number we have is within bounds

        if (countDistinctPlaceholders < minimum) {
            result.add(new CheckStatus().setCause(this).setMainType(statusType)
                .setSubtype(Subtype.missingPlaceholders)
                .setMessage("Need at least {0} placeholder(s), but only have {1}. Placeholders are: {2}", minimum, countDistinctPlaceholders, placeholderInfo));
        } else {
            if (countDistinctPlaceholders > maximum) {
                // {0} is the allowed maximum, {1} is how many were actually found.
                result.add(new CheckStatus().setCause(this).setMainType(statusType)
                    .setSubtype(Subtype.extraPlaceholders)
                    .setMessage("Need no more than {0} placeholders, but have too many with {1}.", maximum, countDistinctPlaceholders));
            }
        }
        // Return the pattern with placeholders replaced
        return matchList.isEmpty() ? value : patternMatcher.replaceAll(STAND_IN);
    }

    /**
     * Checks if ASCII characters are allowed in a currency symbol in the specified locale.
     * Currently not called from this class (the call site in handleCheck is commented out).
     *
     * @param localeID the locale ID that the currency is in
     * @param currency the currency to be checked
     * @return true if ASCII is not allowed
     * @throws IllegalArgumentException if the locale's script cannot be determined
     */
    private boolean asciiNotAllowed(String localeID, String currency) {
        // Don't allow ascii at all for bidi scripts.
        String charOrientation = getResolvedCldrFileToCheck().getStringValue(
            "//ldml/layout/orientation/characterOrder");
        // Constant-first equals: a missing value is treated as "not right-to-left" instead of throwing.
        if ("right-to-left".equals(charOrientation)) {
            return true;
        }

        // Get script of locale. if Latn, quit.
        String script = getScriptForLocale(localeID);
        if (script.equals("Latn")) {
            return false;
        }

        // Enforce checking of for other non-Latin scripts, for all currencies
        // whose countries use that script, e.g. Russian should have Cyrillic
        // currency symbols for modern currencies of countries with official
        // languages whose script is Cyrillic (Bulgaria, Serbia, ...).
        Set<String> currencies = getCurrenciesForScript(script);
        return currencies != null && currencies.contains(currency);
    }

    /**
     * Returns the currencies that are legal tender, with an end date after now, in any territory
     * listed for a language whose primary data includes {@code script}. The script-to-currency
     * table is built on first use and cached in {@link #scriptToCurrencies}.
     *
     * @param script script code to look up
     * @return the currencies for that script as stored in the relation (may be null if absent)
     */
    @SuppressWarnings({ "rawtypes", "unchecked" }) // Relation is used as a raw type throughout this class
    private Set<String> getCurrenciesForScript(String script) {
        if (scriptToCurrencies != null) return scriptToCurrencies.get(script);

        // Get mapping of scripts to the territories that use that script in
        // any of their primary languages.
        Relation scriptToTerritories = new Relation(new HashMap<String, Set<String>>(), HashSet.class);
        for (String lang : sdi.getBasicLanguageDataLanguages()) {
            BasicLanguageData langData = sdi.getBasicLanguageDataMap(lang).get(Type.primary);
            if (langData == null) {
                continue;
            }
            for (String curScript : langData.getScripts()) {
                scriptToTerritories.putAll(curScript, langData.getTerritories());
            }
        }

        // For each territory, get all of its legal tender currencies.
        Date now = new Date();
        scriptToCurrencies = new Relation(new HashMap<String, Set<String>>(), HashSet.class);
        for (Object curScript : scriptToTerritories.keySet()) {
            Set<String> territories = scriptToTerritories.get(curScript);
            Set<String> currencies = new HashSet<>();
            for (String territory : territories) {
                Set<CurrencyDateInfo> currencyInfo = sdi.getCurrencyDateInfo(territory);
                for (CurrencyDateInfo info : currencyInfo) {
                    if (info.isLegalTender() && info.getEnd().compareTo(now) > 0) {
                        currencies.add(info.getCurrency());
                    }
                }
            }
            scriptToCurrencies.putAll(curScript, currencies);
        }
        return scriptToCurrencies.get(script);
    }

    /**
     * Extracts just the literal text from a date pattern into {@code justText}, replacing each
     * variable field by {@code variableReplacement} (if that is non-empty).
     *
     * @param value               the date pattern
     * @param variableReplacement text to append for each variable field; null or empty appends nothing
     * @param justText            output buffer; cleared and refilled when the pattern parses
     * @return true if the pattern contained at least one literal-text item; false if it contained
     *         none or could not be parsed (a different test will find that error)
     */
    public boolean extractDatePatternText(String value, String variableReplacement, StringBuilder justText) {
        boolean haveText = false;
        try {
            formatParser.set(value);
        } catch (Exception e) {
            return false; // give up, it is illegal
        }
        boolean doReplacement = variableReplacement != null && variableReplacement.length() > 0;
        justText.setLength(0);
        for (Object item : formatParser.getItems()) {
            if (item instanceof String) {
                justText.append(item);
                haveText = true;
            } else {
                if (doReplacement) {
                    justText.append(variableReplacement);
                }
            }
        }
        return haveText;
    }

    /**
     * Tests whether {@code source} contains any of the given segments.
     *
     * @param source   the string to search (a path, in this class)
     * @param segments substrings to look for
     * @return true if at least one segment occurs anywhere in {@code source}, including at its start
     */
    public boolean containsPart(String source, String... segments) {
        for (String segment : segments) {
            if (source.contains(segment)) {
                return true;
            }
        }
        return false;
    }

    static final String TEST = "؉";

    /**
     * Adds one status entry describing a set of problem characters. The message lists the
     * characters (pretty-printed), the names of their scripts other than Common/Inherited, and
     * the given qualifier, followed by a link to the exemplar help page.
     *
     * @param missing        the offending characters
     * @param warningVsError main type of the entry
     * @param subtype        subtype used when {@code missing} is not entirely printable ASCII
     * @param subtypeAscii   subtype used when every character of {@code missing} is printable ASCII
     * @param qualifier      phrase completing "The characters ... {qualifier}."
     * @param result         list the entry is appended to
     */
    private void addMissingMessage(UnicodeSet missing, CheckStatus.Type warningVsError, Subtype subtype,
        Subtype subtypeAscii,
        String qualifier, List<CheckStatus> result) {
        String fixedMissing = prettyPrint.format(missing);
        BitSet scripts = new BitSet();
        for (String s : missing) {
            final int script = UScript.getScript(s.codePointAt(0));
            if (script == UScript.INHERITED || script == UScript.COMMON) {
                continue;
            }
            scripts.set(script);
        }
        StringBuilder scriptString = new StringBuilder();
        if (!scripts.isEmpty()) {
            scriptString.append("{");
            for (int i = scripts.nextSetBit(0); i >= 0; i = scripts.nextSetBit(i + 1)) {
                if (scriptString.length() > 1) { // something beyond the opening brace is already there
                    scriptString.append(", ");
                }
                scriptString.append(UScript.getName(i));
            }
            scriptString.append("}");
        }
        final String helpUrl = "http://cldr.unicode.org/translation/-core-data/exemplars#TOC-Handling-Warnings-in-Exemplar-characters";
        final String message = "The characters \u200E{0}\u200E {1} {2}. "
            + "For what to do, see <i>Handling Warnings</i> in <a target='CLDR-ST-DOCS' href='"
            + helpUrl
            + "'>Exemplar Characters</a>.";
        result.add(new CheckStatus()
            .setCause(this)
            .setMainType(warningVsError)
            .setSubtype(ASCII.containsAll(missing) ? subtypeAscii : subtype)
            .setMessage(message, new Object[] { fixedMissing, scriptString, qualifier }));
    }

    static final Normalizer2 NFC = Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.COMPOSE);

    /**
     * Checks {@code value} against the exemplars, being more lenient inside brackets: text outside
     * any opening/closing punctuation pair must be in {@code exemplarSet}, text inside may also
     * use {@code exemplarSetPlusASCII}. The bracket characters themselves are not checked, and
     * mixed bracket types are not distinguished. The value is NFC-normalized before the
     * segment-by-segment check.
     *
     * @param exemplarSet          characters allowed outside brackets
     * @param exemplarSetPlusASCII characters allowed inside brackets
     * @param value                the text to check
     * @return null if ok, otherwise a new UnicodeSet of the bad characters
     */
    private UnicodeSet containsAllCountingParens(UnicodeSet exemplarSet, UnicodeSet exemplarSetPlusASCII, String value) {
        UnicodeSet result = null;
        if (exemplarSet.containsAll(value)) {
            return result;
        }

        // Normalize
        value = NFC.normalize(value);

        // if we failed, then check that everything outside of () is ok.
        // and everything inside parens is either ASCII or in the set
        int lastPos = 0;
        while (true) {
            int start = START_PAREN.findIn(value, lastPos, false);
            String outside = value.substring(lastPos, start);
            result = addDisallowedItems(exemplarSet, outside, result);
            if (start == value.length()) {
                break; // all done
            }
            ++start; // step over the opening bracket
            int end = END_PAREN.findIn(value, start, false);
            // don't worry about mixed brackets
            String inside = value.substring(start, end);
            result = addDisallowedItems(exemplarSetPlusASCII, inside, result);
            if (end == value.length()) {
                break; // all done
            }
            lastPos = end + 1; // step over the closing bracket
        }
        return result;
    }

    /**
     * Adds to {@code result} every character of {@code outside} that is not in {@code exemplarSet}.
     *
     * @param exemplarSet the allowed characters
     * @param outside     the text segment to test
     * @param result      accumulator so far; may be null
     * @return {@code result}, created on demand; still null if nothing has been disallowed yet
     */
    private UnicodeSet addDisallowedItems(UnicodeSet exemplarSet, String outside, UnicodeSet result) {
        if (!exemplarSet.containsAll(outside)) {
            if (result == null) {
                result = new UnicodeSet();
            }
            result.addAll(new UnicodeSet().addAll(outside).removeAll(exemplarSet));
        }
        return result;
    }
}