1 /* 2 ****************************************************************************** 3 * Copyright (C) 2005-2012, International Business Machines Corporation and * 4 * others. All Rights Reserved. * 5 ****************************************************************************** 6 */ 7 package org.unicode.cldr.test; 8 9 import java.util.BitSet; 10 import java.util.Date; 11 import java.util.HashMap; 12 import java.util.HashSet; 13 import java.util.LinkedHashSet; 14 import java.util.List; 15 import java.util.Map; 16 import java.util.Set; 17 import java.util.regex.Matcher; 18 import java.util.regex.Pattern; 19 20 import org.unicode.cldr.test.CheckCLDR.CheckStatus.Subtype; 21 import org.unicode.cldr.util.CLDRFile; 22 import org.unicode.cldr.util.CLDRFile.Status; 23 import org.unicode.cldr.util.Factory; 24 import org.unicode.cldr.util.InternalCldrException; 25 import org.unicode.cldr.util.LocaleIDParser; 26 import org.unicode.cldr.util.PatternCache; 27 import org.unicode.cldr.util.PatternPlaceholders; 28 import org.unicode.cldr.util.PatternPlaceholders.PlaceholderInfo; 29 import org.unicode.cldr.util.PatternPlaceholders.PlaceholderStatus; 30 import org.unicode.cldr.util.SupplementalDataInfo; 31 import org.unicode.cldr.util.SupplementalDataInfo.BasicLanguageData; 32 import org.unicode.cldr.util.SupplementalDataInfo.BasicLanguageData.Type; 33 import org.unicode.cldr.util.SupplementalDataInfo.CurrencyDateInfo; 34 import org.unicode.cldr.util.UnicodeSetPrettyPrinter; 35 import org.unicode.cldr.util.XMLSource; 36 import org.unicode.cldr.util.XPathParts; 37 38 import com.google.common.base.Joiner; 39 import com.google.common.collect.Multiset; 40 import com.google.common.collect.Multiset.Entry; 41 import com.google.common.collect.TreeMultiset; 42 import com.ibm.icu.impl.Relation; 43 import com.ibm.icu.lang.UScript; 44 import com.ibm.icu.text.Collator; 45 import com.ibm.icu.text.DateTimePatternGenerator; 46 import com.ibm.icu.text.Normalizer2; 47 import com.ibm.icu.text.PluralRules; 48 import com.ibm.icu.text.Transform; 49 import com.ibm.icu.text.UnicodeSet; 50 import com.ibm.icu.util.ULocale; 51 52 public class CheckForExemplars extends FactoryCheckCLDR { 53 private static final UnicodeSet RTL_CONTROLS = new UnicodeSet("[\\u061C\\u200E\\u200F\\u202A-\\u202D\\u2066-\\u2069]"); 54 55 private static final UnicodeSet RTL = new UnicodeSet("[[:bc=AL:][:bc=R:]]"); 56 57 private static final String STAND_IN = "#"; 58 59 // private final UnicodeSet commonAndInherited = new UnicodeSet(CheckExemplars.Allowed).complement(); 60 // "[[:script=common:][:script=inherited:][:alphabetic=false:]]"); 61 static String[] EXEMPLAR_SKIPS = { 62 "/currencySpacing", 63 "/exemplarCharacters", 64 // "/pattern", 65 "/localizedPatternChars", 66 "/segmentations", 67 "/references", 68 "/localeDisplayNames/variants/", 69 "/commonlyUsed", 70 "/defaultNumberingSystem", 71 "/otherNumberingSystems", 72 "/exponential", 73 "/nan", 74 "/scientificFormats", 75 "/inText", 76 "/orientation", 77 "/symbol[@alt=\"narrow\"]", 78 "/characters/parseLenients" 79 }; 80 81 static String[] DATE_PARTS = { 82 "/hourFormat", 83 "/dateFormatItem", 84 "/intervalFormatItem", 85 "/dateFormatLength", 86 "timeFormatLength" 87 }; 88 89 static final UnicodeSet START_PAREN = new UnicodeSet("[[:Ps:]]").freeze(); 90 static final UnicodeSet END_PAREN = new UnicodeSet("[[:Pe:]]").freeze(); 91 static final UnicodeSet ALL_CURRENCY_SYMBOLS = new UnicodeSet("[[:Sc:]]").freeze(); 92 static final UnicodeSet LETTER = new UnicodeSet("[[A-Za-z]]").freeze(); 93 static final UnicodeSet NUMBERS = new UnicodeSet("[[:N:]]").freeze(); 94 static final UnicodeSet DISALLOWED_HOUR_FORMAT = new UnicodeSet("[[:letter:]]").remove('H').remove('m').freeze(); 95 static final UnicodeSet DISALLOWED_IN_RANGE = new UnicodeSet("[:L:]").freeze(); 96 97 private UnicodeSet exemplars; 98 private UnicodeSet exemplarsPlusAscii; 99 //private static final UnicodeSet DISALLOWED_IN_scriptRegionExemplars = new UnicodeSet("[()();,;,]").freeze(); 100 //private static final UnicodeSet DISALLOWED_IN_scriptRegionExemplarsWithParens = new UnicodeSet("[;,;,]").freeze(); 101 102 // Hack until cldrbug 6566 is fixed. TODO 103 private static final Pattern IGNORE_PLACEHOLDER_PARENTHESES = PatternCache.get("\\p{Ps}#\\p{Pe}"); 104 105 // private UnicodeSet currencySymbolExemplars; 106 private boolean skip; 107 private Collator col; 108 private Collator spaceCol; 109 UnicodeSetPrettyPrinter prettyPrint; 110 private Status otherPathStatus = new Status(); 111 private Matcher patternMatcher = ExampleGenerator.PARAMETER.matcher(""); 112 private boolean errorDefaultOption; 113 114 // for extracting date pattern text 115 private DateTimePatternGenerator.FormatParser formatParser = new DateTimePatternGenerator.FormatParser(); 116 StringBuilder justText = new StringBuilder(); 117 118 // public static final Pattern SUPPOSED_TO_BE_MESSAGE_FORMAT_PATTERN = PatternCache.get("/(" + 119 // "codePattern" + 120 // "|dateRangePattern" + 121 // "|dateTimeFormat[^/]*?/pattern" + 122 // "|appendItem" + 123 // "|intervalFormatFallback" + 124 // "|hoursFormat" + 125 // "|gmtFormat" + 126 // "|regionFormat" + 127 // "|fallbackFormat" + 128 // "|unitPattern.*@count=\"(zero|one|two|few|many|other)\"" + 129 // "|localePattern" + 130 // "|localeKeyTypePattern" + 131 // "|listPatternPart" + 132 // "|ellipsis" + 133 // "|monthPattern" + 134 // ")"); 135 // private Matcher supposedToBeMessageFormat = SUPPOSED_TO_BE_MESSAGE_FORMAT_PATTERN.matcher(""); 136 137 public static final Pattern LEAD_OR_TRAIL_WHITESPACE_OK = PatternCache.get("/(" + 138 "references/reference" + 139 "|insertBetween" + 140 ")"); 141 private Matcher leadOrTrailWhitespaceOk = LEAD_OR_TRAIL_WHITESPACE_OK.matcher(""); 142 143 private static UnicodeSet ASCII = new UnicodeSet("[\\u0020-\\u007F]").freeze(); 144 145 private PatternPlaceholders patternPlaceholders = PatternPlaceholders.getInstance(); 146 private SupplementalDataInfo sdi; 147 private Relation scriptToCurrencies; 148 CheckForExemplars(Factory factory)149 public CheckForExemplars(Factory factory) { 150 super(factory); 151 // patternPlaceholders = RegexLookup.of(new PlaceholderTransform()) 152 // .loadFromFile(PatternPlaceholders.class, "data/Placeholders.txt"); 153 sdi = SupplementalDataInfo.getInstance(); 154 } 155 156 /** 157 * Adapted from GenerateXMB.MapTransform 158 * 159 * @author jchye 160 * 161 */ 162 static class PlaceholderTransform implements Transform<String, Set<String>> { 163 @Override transform(String source)164 public Set<String> transform(String source) { 165 Set<String> placeholders = new LinkedHashSet<>(); 166 String[] parts = source.split(";\\s+"); 167 for (String part : parts) { 168 int equalsPos = part.indexOf('='); 169 String placeholder = part.substring(0, equalsPos).trim(); 170 placeholders.add(placeholder); 171 } 172 return placeholders; 173 } 174 } 175 176 @Override setCldrFileToCheck(CLDRFile cldrFile, Options options, List<CheckStatus> possibleErrors)177 public CheckCLDR setCldrFileToCheck(CLDRFile cldrFile, Options options, List<CheckStatus> possibleErrors) { 178 if (cldrFile == null) return this; 179 skip = true; 180 super.setCldrFileToCheck(cldrFile, options, possibleErrors); 181 if (cldrFile.getLocaleID().equals("root")) { 182 return this; 183 } 184 185 errorDefaultOption = options.get(Options.Option.exemplarErrors) != null; 186 187 String locale = cldrFile.getLocaleID(); 188 col = Collator.getInstance(new ULocale(locale)); 189 spaceCol = Collator.getInstance(new ULocale(locale)); 190 spaceCol.setStrength(Collator.PRIMARY); 191 192 CLDRFile resolvedFile = getResolvedCldrFileToCheck(); 193 boolean[] ok = new boolean[1]; 194 exemplars = safeGetExemplars("", possibleErrors, resolvedFile, ok); 195 196 if (exemplars == null) { 197 CheckStatus item = new CheckStatus().setCause(this).setMainType(CheckStatus.errorType) 198 .setSubtype(Subtype.noExemplarCharacters) 199 .setMessage("No Exemplar Characters: {0}", new Object[] { this.getClass().getName() }); 200 possibleErrors.add(item); 201 return this; 202 } else if (!ok[0]) { 203 exemplars = new UnicodeSet(); 204 } else { 205 exemplars = new UnicodeSet(exemplars); // modifiable copy 206 } 207 208 boolean isRTL = RTL.containsSome(exemplars); 209 if (isRTL) { 210 exemplars.addAll(RTL_CONTROLS); 211 } 212 // UnicodeSet temp = resolvedFile.getExemplarSet("standard"); 213 // if (temp != null) exemplars.addAll(temp); 214 UnicodeSet auxiliary = safeGetExemplars("auxiliary", possibleErrors, resolvedFile, ok); // resolvedFile.getExemplarSet("auxiliary", 215 // CLDRFile.WinningChoice.WINNING); 216 if (auxiliary != null) { 217 exemplars.addAll(auxiliary); 218 } 219 220 if (CheckExemplars.USE_PUNCTUATION) { 221 UnicodeSet punctuation = safeGetExemplars("punctuation", possibleErrors, resolvedFile, ok); // resolvedFile.getExemplarSet("auxiliary", 222 if (punctuation != null) { 223 exemplars.addAll(punctuation); 224 } 225 226 UnicodeSet numbers = getNumberSystemExemplars(); 227 exemplars.addAll(numbers); 228 229 // TODO fix replacement character 230 exemplars.add(STAND_IN); 231 } 232 233 exemplars.addAll(CheckExemplars.AlwaysOK).freeze(); 234 exemplarsPlusAscii = new UnicodeSet(exemplars).addAll(ASCII).freeze(); 235 236 skip = false; 237 prettyPrint = new UnicodeSetPrettyPrinter() 238 .setOrdering(col != null ? col : Collator.getInstance(ULocale.ROOT)) 239 .setSpaceComparator(col != null ? col : Collator.getInstance(ULocale.ROOT) 240 .setStrength2(Collator.PRIMARY)) 241 .setCompressRanges(true); 242 return this; 243 } 244 getNumberSystemExemplars()245 private UnicodeSet getNumberSystemExemplars() { 246 String numberSystem = getCldrFileToCheck().getStringValue("//ldml/numbers/defaultNumberingSystem"); 247 String digits = sdi.getDigits(numberSystem); 248 return new UnicodeSet().addAll(digits); 249 } 250 safeGetExemplars(String type, List<CheckStatus> possibleErrors, CLDRFile resolvedFile, boolean[] ok)251 private UnicodeSet safeGetExemplars(String type, List<CheckStatus> possibleErrors, CLDRFile resolvedFile, 252 boolean[] ok) { 253 UnicodeSet result = null; 254 try { 255 result = resolvedFile.getExemplarSet(type, CLDRFile.WinningChoice.WINNING); 256 ok[0] = true; 257 } catch (IllegalArgumentException iae) { 258 possibleErrors.add(new CheckStatus() 259 .setCause(this).setMainType(CheckStatus.errorType).setSubtype(Subtype.couldNotAccessExemplars) 260 .setMessage("Could not get exemplar set: " + iae.toString())); 261 ok[0] = false; 262 } 263 return result; 264 } 265 266 @Override handleCheck(String path, String fullPath, String value, Options options, List<CheckStatus> result)267 public CheckCLDR handleCheck(String path, String fullPath, String value, 268 Options options, List<CheckStatus> result) { 269 if (fullPath == null) return this; // skip paths that we don't have 270 if (value == null) return this; // skip values that we don't have ? 271 if (skip) return this; 272 if (path == null) { 273 throw new InternalCldrException("Empty path!"); 274 } else if (getCldrFileToCheck() == null) { 275 throw new InternalCldrException("no file to check!"); 276 } 277 String sourceLocale = getResolvedCldrFileToCheck().getSourceLocaleID(path, otherPathStatus); 278 279 // if we are an alias to another path, then skip 280 // if (!path.equals(otherPathStatus.pathWhereFound)) { 281 // return this; 282 // } 283 284 // now check locale source 285 if (XMLSource.CODE_FALLBACK_ID.equals(sourceLocale)) { 286 return this; 287 // } else if ("root".equals(sourceLocale)) { 288 // // skip eras for non-gregorian 289 // if (true) return this; 290 // if (path.indexOf("/calendar") >= 0 && path.indexOf("gregorian") <= 0) return this; 291 } 292 293 if (containsPart(path, EXEMPLAR_SKIPS)) { 294 return this; 295 } 296 297 CheckStatus.Type errorOption = errorDefaultOption & sourceLocale.equals(getResolvedCldrFileToCheck().getLocaleID()) 298 ? CheckStatus.errorType : CheckStatus.warningType; 299 300 value = checkAndReplacePlaceholders(path, value, result); 301 if (path.startsWith("//ldml/numbers/miscPatterns") && path.contains("[@type=\"range\"]")) { 302 if (DISALLOWED_IN_RANGE.containsSome(value)) { 303 result 304 .add(new CheckStatus() 305 .setCause(this) 306 .setMainType(CheckStatus.errorType) 307 .setSubtype(Subtype.illegalCharactersInPattern) 308 .setMessage( 309 "Range patterns should not have letters.", 310 new Object[] {})); 311 } 312 } 313 // Now handle date patterns. 314 if (containsPart(path, DATE_PARTS)) { 315 if (!extractDatePatternText(value, STAND_IN, justText)) { 316 return this; // we are done, no text. 317 } 318 value = justText.toString(); 319 if (NUMBERS.containsSome(value)) { 320 UnicodeSet disallowed = new UnicodeSet().addAll(value).retainAll(NUMBERS); 321 addMissingMessage(disallowed, CheckStatus.errorType, 322 Subtype.patternCannotContainDigits, 323 Subtype.patternCannotContainDigits, 324 "cannot occur in date or time patterns", result); 325 } 326 if (path.endsWith("/hourFormat")) { 327 UnicodeSet disallowed = new UnicodeSet().addAll(value) 328 .retainAll(DISALLOWED_HOUR_FORMAT); 329 if (!disallowed.isEmpty()) { 330 addMissingMessage(disallowed, CheckStatus.errorType, 331 Subtype.patternContainsInvalidCharacters, 332 Subtype.patternContainsInvalidCharacters, 333 "cannot occur in the hour format", result); 334 } 335 } 336 } 337 338 if (path.startsWith("//ldml/posix/messages")) return this; 339 340 UnicodeSet disallowed; 341 342 if (path.contains("/currency") && path.contains("/symbol")) { 343 if (null != (disallowed = containsAllCountingParens(exemplars, exemplarsPlusAscii, value))) { 344 disallowed.removeAll(ALL_CURRENCY_SYMBOLS); 345 disallowed.removeAll(LETTER); // Allow ASCII A-Z in currency symbols 346 if (disallowed.size() > 0) { 347 // && asciiNotAllowed(getCldrFileToCheck().getLocaleID(), currency)) { 348 addMissingMessage(disallowed, errorOption, 349 Subtype.charactersNotInMainOrAuxiliaryExemplars, 350 Subtype.asciiCharactersNotInMainOrAuxiliaryExemplars, "are not in the exemplar characters", 351 result); 352 } 353 } 354 } else if (path.contains("/gmtFormat") || path.contains("/gmtZeroFormat")) { 355 if (null != (disallowed = containsAllCountingParens(exemplars, exemplarsPlusAscii, value))) { 356 disallowed.removeAll(LETTER); // Allow ASCII A-Z in gmtFormat and gmtZeroFormat 357 if (disallowed.size() > 0) { 358 addMissingMessage(disallowed, errorOption, 359 Subtype.charactersNotInMainOrAuxiliaryExemplars, 360 Subtype.asciiCharactersNotInMainOrAuxiliaryExemplars, "are not in the exemplar characters", 361 result); 362 } 363 } 364 } else if (path.contains("/months") || path.contains("/quarters")) { 365 if (null != (disallowed = containsAllCountingParens(exemplars, exemplarsPlusAscii, value))) { 366 disallowed.removeAll("IVXivx"); // Allow Roman-numeral letters in month or quarter names 367 if (path.contains("/calendar[@type=\"generic\"]/months")) { 368 disallowed.removeAll("M"); // Generic-calendar month names contain 'M' and do not get modified 369 } 370 if (disallowed.size() > 0) { 371 addMissingMessage(disallowed, errorOption, 372 Subtype.charactersNotInMainOrAuxiliaryExemplars, 373 Subtype.asciiCharactersNotInMainOrAuxiliaryExemplars, "are not in the exemplar characters", 374 result); 375 } 376 } 377 } else if (path.contains("/localeDisplayNames") && !path.contains("/localeDisplayPattern")) { 378 // test first for outside of the set. 379 if (null != (disallowed = containsAllCountingParens(exemplars, exemplarsPlusAscii, value))) { 380 if (path.contains("[@type=\"iso8601\"]")) { 381 disallowed.removeAll("ISO"); // Name of ISO8601 calendar may contain "ISO" regardless of native script 382 } 383 if (disallowed.size() > 0) { 384 addMissingMessage(disallowed, errorOption, Subtype.charactersNotInMainOrAuxiliaryExemplars, 385 Subtype.asciiCharactersNotInMainOrAuxiliaryExemplars, "are not in the exemplar characters", result); 386 } 387 } 388 if (path.contains("/codePatterns")) { 389 disallowed = new UnicodeSet().addAll(value).retainAll(NUMBERS); 390 if (!disallowed.isEmpty()) { 391 addMissingMessage(disallowed, CheckStatus.errorType, 392 Subtype.patternCannotContainDigits, 393 Subtype.patternCannotContainDigits, 394 "cannot occur in locale fields", result); 395 } 396 } 397 } else if (path.contains("/units")) { 398 String noValidParentheses = IGNORE_PLACEHOLDER_PARENTHESES.matcher(value).replaceAll(""); 399 disallowed = new UnicodeSet().addAll(START_PAREN).addAll(END_PAREN) 400 .retainAll(noValidParentheses); 401 if (!disallowed.isEmpty()) { 402 addMissingMessage(disallowed, CheckStatus.errorType, 403 Subtype.parenthesesNotAllowed, 404 Subtype.parenthesesNotAllowed, 405 "cannot occur in units", result); 406 } 407 } else if (path.endsWith("/exemplarCity")) { 408 disallowed = containsAllCountingParens(exemplars, exemplarsPlusAscii, value); 409 if (disallowed != null) { 410 if ("root".equals(sourceLocale)) { 411 return this; 412 } 413 // Get script of locale. 414 LocaleIDParser parser = new LocaleIDParser().set(sourceLocale); 415 String script = parser.getScript(); 416 if (script.length() == 0) { 417 String localeID = sdi.getLikelySubtags().get(sourceLocale); 418 if (localeID == null) { 419 localeID = sdi.getLikelySubtags().get(parser.getLanguage()); 420 if (localeID == null) { 421 throw new IllegalArgumentException( 422 "A likely subtag for " + parser.getLanguage() + 423 " is required to get its script."); 424 } 425 } 426 script = parser.set(localeID).getScript(); 427 } 428 int myscript = UScript.getCodeFromName(script); 429 UnicodeSet toRemove = new UnicodeSet(); 430 for (int i = 0; i < disallowed.size(); i++) { 431 int c = disallowed.charAt(i); 432 if (UScript.getScript(c) == myscript) { 433 toRemove.add(c); 434 } 435 } 436 disallowed.removeAll(toRemove); 437 if (disallowed.size() > 0) { 438 addMissingMessage(disallowed, errorOption, Subtype.charactersNotInMainOrAuxiliaryExemplars, 439 Subtype.asciiCharactersNotInMainOrAuxiliaryExemplars, "are not in the exemplar characters", result); 440 } 441 } 442 } else if (path.contains("/annotations") && !path.contains("[@type")) { 443 if (null != (disallowed = containsAllCountingParens(exemplars, exemplarsPlusAscii, value))) { 444 addMissingMessage(disallowed, CheckStatus.warningType, Subtype.charactersNotInMainOrAuxiliaryExemplars, 445 Subtype.asciiCharactersNotInMainOrAuxiliaryExemplars, "are not in the exemplar characters", result); 446 } 447 } else { 448 if (null != (disallowed = containsAllCountingParens(exemplars, exemplarsPlusAscii, value))) { 449 addMissingMessage(disallowed, errorOption, Subtype.charactersNotInMainOrAuxiliaryExemplars, 450 Subtype.asciiCharactersNotInMainOrAuxiliaryExemplars, "are not in the exemplar characters", result); 451 } 452 } 453 454 // check for spaces 455 456 if (!value.equals(value.trim())) { 457 if (!leadOrTrailWhitespaceOk.reset(path).find()) { 458 result.add(new CheckStatus().setCause(this).setMainType(CheckStatus.errorType) 459 .setSubtype(Subtype.mustNotStartOrEndWithSpace) 460 .setMessage("This item must not start or end with whitespace, or be empty.")); 461 } 462 } 463 // if (value.contains(" ")) { 464 // result.add(new 465 // CheckStatus().setCause(this).setMainType(CheckStatus.errorType).setSubtype(Subtype.mustNotStartOrEndWithSpace) 466 // .setMessage("This item must not contain two space characters in a row.")); 467 // } 468 return this; 469 } 470 checkAndReplacePlaceholders(String path, String value, List<CheckStatus> result)471 private String checkAndReplacePlaceholders(String path, String value, List<CheckStatus> result) { 472 CheckStatus.Type statusType = getPhase() == Phase.BUILD ? CheckStatus.warningType : CheckStatus.errorType; // new errors, so get past the tests. 473 474 // Get information about what should be there 475 PlaceholderStatus placeholderStatus = patternPlaceholders.getStatus(path); 476 Map<String, PlaceholderInfo> placeholderInfo = patternPlaceholders.get(path); 477 478 int minimum = placeholderInfo.size(); 479 int maximum = placeholderInfo.size(); 480 481 if (placeholderStatus == PlaceholderStatus.LOCALE_DEPENDENT || placeholderStatus == PlaceholderStatus.MULTIPLE) { 482 // if locale dependent, it is because of count= or ordinal=. Figure out what the values are, and whether we are allowed to have none or one 483 PluralRules rules = PluralRules.forLocale(new ULocale(getCldrFileToCheck().getLocaleID())); 484 if (rules != null) { 485 XPathParts parts = XPathParts.getFrozenInstance(path); 486 String keyword = parts.getAttributeValue(-1, "count"); 487 if (keyword == null) { 488 keyword = parts.getAttributeValue(-1, "ordinal"); 489 } 490 try { 491 if (rules.getUniqueKeywordValue(keyword) != PluralRules.NO_UNIQUE_VALUE) { 492 minimum = 0; 493 } 494 } catch (Exception e) { 495 // internal error, skip 496 } 497 } 498 } 499 500 // TODO: move these tests to CheckPlaceholder 501 502 // Now see what is there, and see if they match 503 Matcher matcher = patternMatcher.reset(value); 504 Multiset<String> matchList = TreeMultiset.create(); // Look for duplicate values. 505 while (matcher.find()) { 506 matchList.add(matcher.group()); 507 } 508 final Set<String> distinctPlaceholders = matchList.elementSet(); 509 int countDistinctPlaceholders = distinctPlaceholders.size(); 510 511 if (countDistinctPlaceholders > 0) { 512 // Verify that all placeholders are monotonically increasing from zero. 513 int expected = 0; 514 for (String element : distinctPlaceholders) { 515 // int elementValue = Integer.parseInt(element, 1, element.length()-1, 10); 516 int elementValue = Integer.parseInt(element.substring(1, element.length()-1), 10); 517 if (elementValue != expected) { 518 result.add(new CheckStatus().setCause(this).setMainType(statusType) 519 .setSubtype(Subtype.gapsInPlaceholderNumbers) 520 .setMessage("Placeholders {0} should be strictly increasing, starting at zero.", distinctPlaceholders)); 521 break; 522 } 523 ++expected; 524 } 525 } 526 527 // Check if duplicates are allowed 528 if (matchList.size() > countDistinctPlaceholders && placeholderStatus != PlaceholderStatus.MULTIPLE) { 529 Set<String> errors = new LinkedHashSet<>(); 530 for (Entry<String> entry : matchList.entrySet()) { 531 if (entry.getCount() > 1) { 532 errors.add(entry.getElement()); 533 } 534 } 535 result.add(new CheckStatus().setCause(this).setMainType(statusType) 536 .setSubtype(Subtype.duplicatePlaceholders) 537 .setMessage("Duplicate placeholders: {0}.", Joiner.on(", ").join(errors))); 538 } 539 540 // Now see if the number we have is within bounds 541 542 if (countDistinctPlaceholders < minimum) { 543 result.add(new CheckStatus().setCause(this).setMainType(statusType) 544 .setSubtype(Subtype.missingPlaceholders) 545 .setMessage("Need at least {0} placeholder(s), but only have {1}. Placeholders are: {2}", minimum, countDistinctPlaceholders, placeholderInfo)); 546 } else { 547 if (countDistinctPlaceholders > maximum) { 548 result.add(new CheckStatus().setCause(this).setMainType(statusType) 549 .setSubtype(Subtype.extraPlaceholders) 550 .setMessage("Need no more than {0} placeholders, but have too many with {1}.", countDistinctPlaceholders, minimum)); 551 } 552 } 553 // Return the pattern with placeholders replaced 554 return matchList.isEmpty() ? value : patternMatcher.replaceAll(STAND_IN); 555 } 556 557 /** 558 * Checks if ASCII characters are allowed in a currency symbol in the specified locale. 559 * @param localeID the locale ID that the currency is in 560 * @param currency the currency to be checked 561 * @return true if ASCII is not allowed 562 */ asciiNotAllowed(String localeID, String currency)563 private boolean asciiNotAllowed(String localeID, String currency) { 564 // Don't allow ascii at all for bidi scripts. 565 String charOrientation = getResolvedCldrFileToCheck().getStringValue( 566 "//ldml/layout/orientation/characterOrder"); 567 if (charOrientation.equals("right-to-left")) { 568 return true; 569 } 570 571 // Get script of locale. if Latn, quit. 572 LocaleIDParser parser = new LocaleIDParser().set(localeID); 573 String script = parser.getScript(); 574 if (script.length() == 0) { 575 localeID = sdi.getLikelySubtags().get(localeID); 576 if (localeID == null) { 577 localeID = sdi.getLikelySubtags().get(parser.getLanguage()); 578 if (localeID == null) { 579 throw new IllegalArgumentException( 580 "A likely subtag for " + parser.getLanguage() + 581 " is required to get its script."); 582 } 583 } 584 script = parser.set(localeID).getScript(); 585 } 586 if (script.equals("Latn")) { 587 return false; 588 } 589 590 // Enforce checking of for other non-Latin scripts, for all currencies 591 // whose countries use that script, e.g. Russian should have Cyrillic 592 // currency symbols for modern currencies of countries with official 593 // languages whose script is Cyrillic (Bulgaria, Serbia, ...). 594 Set<String> currencies = getCurrenciesForScript(script); 595 return currencies != null && currencies.contains(currency); 596 } 597 getCurrenciesForScript(String script)598 private Set<String> getCurrenciesForScript(String script) { 599 if (scriptToCurrencies != null) return scriptToCurrencies.get(script); 600 601 // Get mapping of scripts to the territories that use that script in 602 // any of their primary languages. 603 Relation scriptToTerritories = new Relation(new HashMap<String, Set<String>>(), HashSet.class); 604 for (String lang : sdi.getBasicLanguageDataLanguages()) { 605 BasicLanguageData langData = sdi.getBasicLanguageDataMap(lang).get(Type.primary); 606 if (langData == null) { 607 continue; 608 } 609 for (String curScript : langData.getScripts()) { 610 scriptToTerritories.putAll(curScript, langData.getTerritories()); 611 } 612 } 613 614 // For each territory, get all of its legal tender currencies. 615 Date now = new Date(System.currentTimeMillis()); 616 scriptToCurrencies = new Relation(new HashMap<String, Set<String>>(), HashSet.class); 617 for (Object curScript : scriptToTerritories.keySet()) { 618 Set<String> territories = scriptToTerritories.get(curScript); 619 Set<String> currencies = new HashSet<>(); 620 for (String territory : territories) { 621 Set<CurrencyDateInfo> currencyInfo = sdi.getCurrencyDateInfo(territory); 622 for (CurrencyDateInfo info : currencyInfo) { 623 if (info.isLegalTender() && info.getEnd().compareTo(now) > 0) { 624 currencies.add(info.getCurrency()); 625 } 626 } 627 } 628 scriptToCurrencies.putAll(curScript, currencies); 629 } 630 return scriptToCurrencies.get(script); 631 } 632 633 /** 634 * Extracts just the text from a date field, replacing all the variable fields by variableReplacement. Return null 635 * if 636 * there is an error (a different test will find that error). 637 */ extractDatePatternText(String value, String variableReplacement, StringBuilder justText)638 public boolean extractDatePatternText(String value, String variableReplacement, StringBuilder justText) { 639 boolean haveText = false; 640 try { 641 formatParser.set(value); 642 } catch (Exception e) { 643 return false; // give up, it is illegal 644 } 645 boolean doReplacement = variableReplacement != null && variableReplacement.length() > 0; 646 justText.setLength(0); 647 for (Object item : formatParser.getItems()) { 648 if (item instanceof String) { 649 justText.append(item); 650 haveText = true; 651 } else { 652 if (doReplacement) { 653 justText.append(variableReplacement); 654 } 655 } 656 } 657 return haveText; 658 } 659 containsPart(String source, String... segments)660 public boolean containsPart(String source, String... segments) { 661 for (int i = 0; i < segments.length; ++i) { 662 if (source.indexOf(segments[i]) > 0) { 663 return true; 664 } 665 } 666 return false; 667 } 668 669 static final String TEST = "؉"; 670 addMissingMessage(UnicodeSet missing, CheckStatus.Type warningVsError, Subtype subtype, Subtype subtypeAscii, String qualifier, List<CheckStatus> result)671 private void addMissingMessage(UnicodeSet missing, CheckStatus.Type warningVsError, Subtype subtype, 672 Subtype subtypeAscii, 673 String qualifier, List<CheckStatus> result) { 674 String fixedMissing = prettyPrint.format(missing); 675 BitSet scripts = new BitSet(); 676 for (String s : missing) { 677 final int script = UScript.getScript(s.codePointAt(0)); 678 if (script == UScript.INHERITED || script == UScript.COMMON) { 679 continue; 680 } 681 scripts.set(script); 682 } 683 StringBuilder scriptString = new StringBuilder(); 684 if (!scripts.isEmpty()) { 685 scriptString.append("{"); 686 for (int i = scripts.nextSetBit(0); i >= 0; i = scripts.nextSetBit(i + 1)) { 687 if (scriptString.length() > 1) { 688 scriptString.append(", "); 689 } 690 scriptString.append(UScript.getName(i)); 691 } 692 scriptString.append("}"); 693 } 694 result 695 .add(new CheckStatus() 696 .setCause(this) 697 .setMainType(warningVsError) 698 .setSubtype(ASCII.containsAll(missing) ? subtypeAscii : subtype) 699 .setMessage( 700 "The characters \u200E{0}\u200E {1} {2}. " 701 + 702 "For what to do, see <i>Handling Warnings</i> in <a target='CLDR-ST-DOCS' href='http://cldr.org/translation/characters#TOC-Handing-Warnings'>Characters</a>.", 703 new Object[] { fixedMissing, scriptString, qualifier })); 704 } 705 706 static final Normalizer2 NFC = Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.COMPOSE); 707 708 /** 709 * Return null if ok, otherwise UnicodeSet of bad characters 710 * 711 * @param exemplarSet 712 * @param value 713 * @return 714 */ containsAllCountingParens(UnicodeSet exemplarSet, UnicodeSet exemplarSetPlusASCII, String value)715 private UnicodeSet containsAllCountingParens(UnicodeSet exemplarSet, UnicodeSet exemplarSetPlusASCII, String value) { 716 UnicodeSet result = null; 717 if (exemplarSet.containsAll(value)) { 718 return result; 719 } 720 721 // Normalize 722 value = NFC.normalize(value); 723 724 // if we failed, then check that everything outside of () is ok. 725 // and everything inside parens is either ASCII or in the set 726 int lastPos = 0; 727 while (true) { 728 int start = START_PAREN.findIn(value, lastPos, false); 729 String outside = value.substring(lastPos, start); 730 result = addDisallowedItems(exemplarSet, outside, result); 731 if (start == value.length()) { 732 break; // all done 733 } 734 ++start; 735 int end = END_PAREN.findIn(value, start, false); 736 // don't worry about mixed brackets 737 String inside = value.substring(start, end); 738 result = addDisallowedItems(exemplarSetPlusASCII, inside, result); 739 if (end == value.length()) { 740 break; // all done 741 } 742 lastPos = end + 1; 743 } 744 return result; 745 } 746 addDisallowedItems(UnicodeSet exemplarSet, String outside, UnicodeSet result)747 private UnicodeSet addDisallowedItems(UnicodeSet exemplarSet, String outside, UnicodeSet result) { 748 if (!exemplarSet.containsAll(outside)) { 749 if (result == null) { 750 result = new UnicodeSet(); 751 } 752 result.addAll(new UnicodeSet().addAll(outside).removeAll(exemplarSet)); 753 } 754 return result; 755 } 756 } 757