1 /* Copyright (C) 2007-2013 Google and others. All Rights Reserved. */ 2 /* Copyright (C) 2007-2013 IBM Corp. and others. All Rights Reserved. */ 3 4 package org.unicode.cldr.test; 5 6 import java.util.Arrays; 7 import java.util.HashMap; 8 import java.util.HashSet; 9 import java.util.List; 10 import java.util.Map; 11 import java.util.Set; 12 import java.util.TreeSet; 13 import java.util.regex.Matcher; 14 import java.util.regex.Pattern; 15 16 import org.unicode.cldr.test.CheckExemplars.ExemplarType; 17 import org.unicode.cldr.util.Builder; 18 import org.unicode.cldr.util.CLDRFile; 19 import org.unicode.cldr.util.CLDRLocale; 20 import org.unicode.cldr.util.CldrUtility; 21 import org.unicode.cldr.util.DateTimeCanonicalizer; 22 import org.unicode.cldr.util.DateTimeCanonicalizer.DateTimePatternType; 23 import org.unicode.cldr.util.Emoji; 24 import org.unicode.cldr.util.ICUServiceBuilder; 25 import org.unicode.cldr.util.PatternCache; 26 import org.unicode.cldr.util.UnicodeSetPrettyPrinter; 27 import org.unicode.cldr.util.With; 28 import org.unicode.cldr.util.XPathParts; 29 30 import com.google.common.base.Joiner; 31 import com.google.common.base.Splitter; 32 import com.google.myanmartools.ZawgyiDetector; 33 import com.ibm.icu.lang.UCharacter; 34 import com.ibm.icu.text.Collator; 35 import com.ibm.icu.text.DateIntervalInfo; 36 import com.ibm.icu.text.DateTimePatternGenerator; 37 import com.ibm.icu.text.DecimalFormat; 38 import com.ibm.icu.text.Normalizer; 39 import com.ibm.icu.text.RuleBasedCollator; 40 import com.ibm.icu.text.Transform; 41 import com.ibm.icu.text.Transliterator; 42 import com.ibm.icu.text.UnicodeSet; 43 import com.ibm.icu.text.UnicodeSetIterator; 44 import com.ibm.icu.util.ULocale; 45 46 /** 47 * Class for processing the input and output of CLDR data for use in the 48 * Survey Tool and other tools. 
49 */ 50 public class DisplayAndInputProcessor { 51 52 private static final boolean FIX_YEARS = true; 53 54 public static final boolean DEBUG_DAIP = CldrUtility.getProperty("DEBUG_DAIP", false); 55 56 public static final UnicodeSet RTL = new UnicodeSet("[[:Bidi_Class=Arabic_Letter:][:Bidi_Class=Right_To_Left:]]") 57 .freeze(); 58 59 public static final UnicodeSet TO_QUOTE = new UnicodeSet( 60 "[[:Cn:]" + 61 "[:Default_Ignorable_Code_Point:]" + 62 "[:patternwhitespace:]" + 63 "[:Me:][:Mn:]]" // add non-spacing marks 64 ).freeze(); 65 66 public static final Pattern NUMBER_FORMAT_XPATH = Pattern 67 .compile("//ldml/numbers/.*Format\\[@type=\"standard\"]/pattern.*"); 68 69 public static final Pattern NUMBER_SEPARATOR_PATTERN = Pattern 70 .compile("//ldml/numbers/symbols.*/(decimal|group)"); 71 72 private static final Pattern APOSTROPHE_SKIP_PATHS = PatternCache.get("//ldml/(" 73 + "localeDisplayNames/languages/language\\[@type=\"mic\"].*|" 74 + "characters/.*|" 75 + "delimiters/.*|" 76 + "dates/.+/(pattern|intervalFormatItem|dateFormatItem).*|" 77 + "units/.+/unitPattern.*|" 78 + "units/.+/durationUnitPattern.*|" 79 + "numbers/symbols.*|" 80 + "numbers/miscPatterns.*|" 81 + "numbers/(decimal|currency|percent|scientific)Formats.+/(decimal|currency|percent|scientific)Format.*)"); 82 private static final Pattern INTERVAL_FORMAT_PATHS = PatternCache.get("//ldml/dates/.+/intervalFormatItem.*"); 83 private static final Pattern NON_DECIMAL_PERIOD = PatternCache.get("(?<![0#'])\\.(?![0#'])"); 84 private static final Pattern WHITESPACE_NO_NBSP_TO_NORMALIZE = PatternCache.get("\\s+"); // string of whitespace not 85 // including NBSP, i.e. [ 86 // \t\n\r]+ 87 private static final Pattern WHITESPACE_AND_NBSP_TO_NORMALIZE = PatternCache.get("[\\s\\u00A0]+"); // string of 88 // whitespace 89 // including NBSP, 90 // i.e. 
[ 91 // \u00A0\t\n\r]+ 92 private static final UnicodeSet UNICODE_WHITESPACE = new UnicodeSet("[:whitespace:]").freeze(); 93 94 private static final CLDRLocale MALAYALAM = CLDRLocale.getInstance("ml"); 95 private static final CLDRLocale ROMANIAN = CLDRLocale.getInstance("ro"); 96 private static final CLDRLocale CATALAN = CLDRLocale.getInstance("ca"); 97 private static final CLDRLocale NGOMBA = CLDRLocale.getInstance("jgo"); 98 private static final CLDRLocale KWASIO = CLDRLocale.getInstance("nmg"); 99 private static final CLDRLocale HEBREW = CLDRLocale.getInstance("he"); 100 private static final CLDRLocale MYANMAR = CLDRLocale.getInstance("my"); 101 private static final CLDRLocale KYRGYZ = CLDRLocale.getInstance("ky"); 102 private static final CLDRLocale URDU = CLDRLocale.getInstance("ur"); 103 private static final CLDRLocale PASHTO = CLDRLocale.getInstance("ps"); 104 private static final CLDRLocale FARSI = CLDRLocale.getInstance("fa"); 105 private static final CLDRLocale GERMAN_SWITZERLAND = CLDRLocale.getInstance("de_CH"); 106 private static final CLDRLocale SWISS_GERMAN = CLDRLocale.getInstance("gsw"); 107 private static final CLDRLocale FF_ADLAM = CLDRLocale.getInstance("ff_Adlm"); 108 public static final Set<String> LANGUAGES_USING_MODIFIER_APOSTROPHE = new HashSet<>( 109 Arrays.asList("br", "bss", "cad", "cic", "cch", "gn", "ha", "ha_Latn", "lkt", "mgo", "moh", "mus", "nnh", "qu", "quc", "uk", "uz", "uz_Latn")); 110 111 // Ş ş Ţ ţ => Ș ș Ț ț 112 private static final char[][] ROMANIAN_CONVERSIONS = { 113 { '\u015E', '\u0218' }, { '\u015F', '\u0219' }, { '\u0162', '\u021A' }, 114 { '\u0163', '\u021B' } }; 115 116 private static final char[][] CATALAN_CONVERSIONS = { 117 { '\u013F', '\u004C', '\u00B7' }, // Ŀ -> L· 118 { '\u0140', '\u006C', '\u00B7' } }; // ŀ -> l· 119 120 private static final char[][] NGOMBA_CONVERSIONS = { 121 { '\u0251', '\u0061' }, { '\u0261', '\u0067' }, // ɑ -> a , ɡ -> g , See ticket #5691 122 { '\u2019', '\uA78C' }, { '\u02BC', '\uA78C' } 
}; // Saltillo, see ticket #6805 123 124 private static final char[][] KWASIO_CONVERSIONS = { 125 { '\u0306', '\u030C' }, // See ticket #6571, use caron instead of breve 126 { '\u0103', '\u01CE' }, { '\u0102', '\u01CD' }, // a-breve -> a-caron 127 { '\u0115', '\u011B' }, { '\u011A', '\u01CD' }, // e-breve -> e-caron 128 { '\u012D', '\u01D0' }, { '\u012C', '\u01CF' }, // i-breve -> i-caron 129 { '\u014F', '\u01D2' }, { '\u014E', '\u01D1' }, // o-breve -> o-caron 130 { '\u016D', '\u01D4' }, { '\u016C', '\u01D3' } // u-breve -> u-caron 131 }; 132 133 private static final char[][] HEBREW_CONVERSIONS = { 134 { '\'', '\u05F3' }, { '"', '\u05F4' } }; // ' -> geresh " -> gershayim 135 136 private static final char[][] KYRGYZ_CONVERSIONS = { 137 { 'ӊ', 'ң' }, { 'Ӊ', 'Ң' } }; // right modifier 138 139 private static final char[][] URDU_PLUS_CONVERSIONS = { 140 { '\u0643', '\u06A9' }}; // wrong char 141 142 private static final ZawgyiDetector detector = new ZawgyiDetector(); 143 private static final Transliterator zawgyiUnicodeTransliterator = 144 Transliterator.getInstance("Zawgyi-my"); 145 146 private Collator col; 147 148 private Collator spaceCol; 149 150 private UnicodeSetPrettyPrinter pp = null; 151 152 final private CLDRLocale locale; 153 private boolean isPosix; 154 155 /** 156 * Constructor, taking cldrFile. 
157 * 158 * @param cldrFileToCheck 159 */ DisplayAndInputProcessor(CLDRFile cldrFileToCheck, boolean needsCollator)160 public DisplayAndInputProcessor(CLDRFile cldrFileToCheck, boolean needsCollator) { 161 init(this.locale = CLDRLocale.getInstance(cldrFileToCheck.getLocaleID()), needsCollator); 162 } 163 DisplayAndInputProcessor(CLDRFile cldrFileToCheck)164 public DisplayAndInputProcessor(CLDRFile cldrFileToCheck) { 165 init(this.locale = CLDRLocale.getInstance(cldrFileToCheck.getLocaleID()), true); 166 } 167 init(CLDRLocale locale, boolean needsCollator)168 void init(CLDRLocale locale, boolean needsCollator) { 169 isPosix = locale.toString().indexOf("POSIX") >= 0; 170 if (needsCollator) { 171 ICUServiceBuilder isb = null; 172 try { 173 isb = ICUServiceBuilder.forLocale(locale); 174 } catch (Exception e) { 175 } 176 177 if (isb != null) { 178 try { 179 col = isb.getRuleBasedCollator(); 180 } catch (Exception e) { 181 col = Collator.getInstance(ULocale.ROOT); 182 } 183 } else { 184 col = Collator.getInstance(ULocale.ROOT); 185 } 186 187 spaceCol = Collator.getInstance(locale.toULocale()); 188 if (spaceCol instanceof RuleBasedCollator) { 189 ((RuleBasedCollator) spaceCol).setAlternateHandlingShifted(false); 190 } 191 pp = new UnicodeSetPrettyPrinter().setOrdering(Collator.getInstance(ULocale.ROOT)) 192 .setSpaceComparator(Collator.getInstance(ULocale.ROOT).setStrength2(Collator.PRIMARY)) 193 .setCompressRanges(true) 194 .setToQuote(new UnicodeSet(TO_QUOTE)) 195 .setOrdering(col) 196 .setSpaceComparator(spaceCol); 197 } 198 } 199 getPrettyPrinter()200 public UnicodeSetPrettyPrinter getPrettyPrinter() { 201 return pp; 202 } 203 204 /** 205 * Constructor, taking ULocale and boolean. 
206 * 207 * @param locale the ULocale 208 * @param needsCollator true or false 209 * 210 * Called by getProcessor, with locale = SurveyMain.TRANS_HINT_LOCALE 211 */ DisplayAndInputProcessor(ULocale locale, boolean needsCollator)212 public DisplayAndInputProcessor(ULocale locale, boolean needsCollator) { 213 init(this.locale = CLDRLocale.getInstance(locale), needsCollator); 214 } 215 216 /** 217 * Constructor, taking ULocale. 218 * 219 * @param locale the ULocale 220 */ DisplayAndInputProcessor(ULocale locale)221 public DisplayAndInputProcessor(ULocale locale) { 222 init(this.locale = CLDRLocale.getInstance(locale), true /* needsCollator */); 223 } 224 225 /** 226 * Constructor, taking CLDRLocale and boolean. 227 * 228 * @param locale the CLDRLocale 229 * @param needsCollator true or false 230 */ DisplayAndInputProcessor(CLDRLocale locale, boolean needsCollator)231 public DisplayAndInputProcessor(CLDRLocale locale, boolean needsCollator) { 232 init(this.locale = locale, needsCollator); 233 } 234 235 /** 236 * Constructor, taking locale. 237 * 238 * @param locale 239 */ DisplayAndInputProcessor(CLDRLocale locale)240 public DisplayAndInputProcessor(CLDRLocale locale) { 241 init(this.locale = locale, true); 242 } 243 244 /** 245 * Process the value for display. The result is a string for display in the 246 * Survey tool or similar program. 247 * 248 * @param path 249 * @param value 250 * @param fullPath 251 * @return 252 */ processForDisplay(String path, String value)253 public synchronized String processForDisplay(String path, String value) { 254 value = Normalizer.compose(value, false); // Always normalize all text to NFC. 255 if (hasUnicodeSetValue(path)) { 256 value = displayUnicodeSet(value); 257 } else if (path.contains("stopword")) { 258 return value.trim().isEmpty() ? 
"NONE" : value; 259 } else { 260 NumericType numericType = NumericType.getNumericType(path); 261 if (numericType != NumericType.NOT_NUMERIC) { 262 // Canonicalize existing values that aren't canonicalized yet. 263 // New values will be canonicalized on input using processInput(). 264 try { 265 value = getCanonicalPattern(value, numericType, isPosix); 266 } catch (IllegalArgumentException e) { 267 if (DEBUG_DAIP) System.err.println("Illegal pattern: " + value); 268 } 269 if (numericType != NumericType.CURRENCY && numericType != NumericType.CURRENCY_ABBREVIATED) { 270 value = value.replace("'", ""); 271 } 272 } 273 } 274 // Fix up any apostrophes in number symbols 275 if (NUMBER_SEPARATOR_PATTERN.matcher(path).matches()) { 276 value = value.replace('\'', '\u2019'); 277 } 278 // Fix up any apostrophes as appropriate (Don't do so for things like date patterns... 279 if (!APOSTROPHE_SKIP_PATHS.matcher(path).matches()) { 280 value = normalizeApostrophes(value); 281 } 282 // Fix up hyphens, replacing with N-dash as appropriate 283 if (INTERVAL_FORMAT_PATHS.matcher(path).matches()) { 284 value = normalizeIntervalHyphens(value); 285 } else { 286 value = normalizeHyphens(value); 287 } 288 return value; 289 } 290 hasUnicodeSetValue(String path)291 private boolean hasUnicodeSetValue(String path) { 292 return path.startsWith("//ldml/characters/exemplarCharacters") || path.startsWith("//ldml/characters/parseLenients"); 293 } 294 295 static final UnicodeSet WHITESPACE = new UnicodeSet("[:whitespace:]").freeze(); 296 static final DateTimeCanonicalizer dtc = new DateTimeCanonicalizer(FIX_YEARS); 297 298 public static final Splitter SPLIT_BAR = Splitter.on(Pattern.compile("(\\||\\s+l\\s+)")).trimResults().omitEmptyStrings(); 299 static final Splitter SPLIT_SPACE = Splitter.on(' ').trimResults().omitEmptyStrings(); 300 static final Joiner JOIN_BAR = Joiner.on(" | "); 301 302 /** 303 * Process the value for input. The result is a cleaned-up value. 
For example, 304 * an exemplar set is modified to be in the normal format, and any missing [ ] 305 * are added (a common omission on entry). If there are any failures then the 306 * original value is returned, so that the proper error message can be given. 307 * 308 * @param path 309 * @param value 310 * @param internalException 311 * TODO 312 * @param fullPath 313 * @return 314 */ processInput(String path, String value, Exception[] internalException)315 public synchronized String processInput(String path, String value, Exception[] internalException) { 316 String original = value; 317 value = stripProblematicControlCharacters(value); 318 value = Normalizer.compose(value, false); // Always normalize all input to NFC. 319 if (internalException != null) { 320 internalException[0] = null; 321 } 322 // skip processing for inheritance marker 323 if (CldrUtility.INHERITANCE_MARKER.equals(value)) { 324 return value; // Reference: https://unicode.org/cldr/trac/ticket/11261 325 } 326 // for root annotations 327 if (CLDRLocale.ROOT.equals(locale) && path.contains("/annotations")) { 328 return value; // Reference: https://unicode.org/cldr/trac/ticket/11261 329 } 330 331 try { 332 // Normalise Malayalam characters. 
333 boolean isUnicodeSet = hasUnicodeSetValue(path); 334 if (locale.childOf(MALAYALAM)) { 335 String newvalue = normalizeMalayalam(value); 336 if (DEBUG_DAIP) System.out.println("DAIP: Normalized Malayalam '" + value + "' to '" + newvalue + "'"); 337 value = newvalue; 338 } else if (locale.childOf(ROMANIAN) && !isUnicodeSet) { 339 value = standardizeRomanian(value); 340 } else if (locale.childOf(CATALAN) && !isUnicodeSet) { 341 value = standardizeCatalan(value); 342 } else if (locale.childOf(NGOMBA) && !isUnicodeSet) { 343 value = standardizeNgomba(value); 344 } else if (locale.childOf(KWASIO) && !isUnicodeSet) { 345 value = standardizeKwasio(value); 346 } else if (locale.childOf(HEBREW) && !APOSTROPHE_SKIP_PATHS.matcher(path).matches()) { 347 value = replaceChars(path, value, HEBREW_CONVERSIONS, false); 348 } else if ((locale.childOf(SWISS_GERMAN) || locale.childOf(GERMAN_SWITZERLAND)) && !isUnicodeSet) { 349 value = standardizeSwissGerman(value); 350 } else if (locale.childOf(MYANMAR) && !isUnicodeSet) { 351 value = standardizeMyanmar(value); 352 } else if (locale.childOf(KYRGYZ)) { 353 value = replaceChars(path, value, KYRGYZ_CONVERSIONS, false); 354 } else if (locale.childOf(URDU) || locale.childOf(PASHTO) || locale.childOf(FARSI)) { 355 value = replaceChars(path, value, URDU_PLUS_CONVERSIONS, true); 356 } else if (locale.childOf(FF_ADLAM) && !isUnicodeSet) { 357 value = fixAdlamNasalization(value); 358 } 359 360 if (UNICODE_WHITESPACE.containsSome(value)) { 361 value = normalizeWhitespace(path, value); 362 } 363 364 // all of our values should not have leading or trailing spaces, except insertBetween 365 if (!path.contains("/insertBetween") && !isUnicodeSet) { 366 value = value.trim(); 367 } 368 369 // fix grouping separator if space 370 if (path.startsWith("//ldml/numbers/symbols") && !path.contains("/alias")) { 371 if (value.isEmpty()) { 372 value = "\u00A0"; 373 } 374 value = value.replace(' ', '\u00A0'); 375 } 376 377 // fix date patterns 378 
DateTimePatternType datetimePatternType = DateTimePatternType.fromPath(path); 379 if (DateTimePatternType.STOCK_AVAILABLE_INTERVAL_PATTERNS.contains(datetimePatternType)) { 380 try { 381 value = dtc.getCanonicalDatePattern(path, value, datetimePatternType); 382 } catch (IllegalArgumentException ex) { 383 return value; 384 } 385 } 386 387 if (path.startsWith("//ldml/numbers/currencies/currency") && path.contains("displayName")) { 388 value = normalizeCurrencyDisplayName(value); 389 } 390 NumericType numericType = NumericType.getNumericType(path); 391 if (numericType != NumericType.NOT_NUMERIC) { 392 if (numericType == NumericType.CURRENCY) { 393 value = value.replaceAll(" ", "\u00A0"); 394 if (numericType == NumericType.CURRENCY_ABBREVIATED) { 395 value = value.replaceAll("0\\.0+", "0"); 396 } 397 } else { 398 value = value.replaceAll("([%\u00A4]) ", "$1\u00A0") 399 .replaceAll(" ([%\u00A4])", "\u00A0$1"); 400 value = replace(NON_DECIMAL_PERIOD, value, "'.'"); 401 if (numericType == NumericType.DECIMAL_ABBREVIATED) { 402 value = value.replaceAll("0\\.0+", "0"); 403 } 404 } 405 value = getCanonicalPattern(value, numericType, isPosix); 406 } 407 408 // fix [,] 409 if (path.startsWith("//ldml/localeDisplayNames/languages/language") 410 || path.startsWith("//ldml/localeDisplayNames/scripts/script") 411 || path.startsWith("//ldml/localeDisplayNames/territories/territory") 412 || path.startsWith("//ldml/localeDisplayNames/variants/variant") 413 || path.startsWith("//ldml/localeDisplayNames/keys/key") 414 || path.startsWith("//ldml/localeDisplayNames/types/type")) { 415 value = value.replace('[', '(').replace(']', ')').replace('[', '(').replace(']', ')'); 416 } 417 418 // Normalize two single quotes for the inches symbol. 
419 if (path.contains("/units")) { 420 value = value.replace("''", "″"); 421 } 422 423 // check specific cases 424 if (isUnicodeSet) { 425 value = inputUnicodeSet(path, value); 426 } else if (path.contains("stopword")) { 427 if (value.equals("NONE")) { 428 value = ""; 429 } 430 } 431 432 // Normalize ellipsis data. 433 if (path.startsWith("//ldml/characters/ellipsis")) { 434 value = value.replace("...", "…"); 435 } 436 437 // Replace Arabic presentation forms with their nominal counterparts 438 value = replaceArabicPresentationForms(value); 439 440 // Fix up any apostrophes as appropriate (Don't do so for things like date patterns... 441 if (!APOSTROPHE_SKIP_PATHS.matcher(path).matches()) { 442 value = normalizeApostrophes(value); 443 } 444 // Fix up any apostrophes in number symbols 445 if (NUMBER_SEPARATOR_PATTERN.matcher(path).matches()) { 446 value = value.replace('\'', '\u2019'); 447 } 448 // Fix up hyphens, replacing with N-dash as appropriate 449 if (INTERVAL_FORMAT_PATHS.matcher(path).matches()) { 450 value = normalizeIntervalHyphens(value); 451 } else if (!isUnicodeSet) { 452 value = normalizeHyphens(value); 453 } 454 455 if (path.startsWith("//ldml/annotations/annotation")) { 456 if (path.contains(Emoji.TYPE_TTS)) { 457 // The row has something like " -name" in the first column. Cf. namePath, getNamePaths. 458 // Normally the value is like "zebra" or "unicorn face", without "|". 459 // If the user enters a value with "|", discard anything after "|"; e.g., change "a | b | c" to "a". 460 value = SPLIT_BAR.split(value).iterator().next(); 461 } else { 462 // The row has something like " –keywords" in the first column. Cf. keywordPath, getKeywordPaths. 463 // Normally the value is like "stripe | zebra", with "|". 
464 value = annotationsForDisplay(value); 465 } 466 } 467 468 return value; 469 } catch (RuntimeException e) { 470 if (internalException != null) { 471 internalException[0] = e; 472 } 473 return original; 474 } 475 } 476 477 /** 478 * Strip out all code points less than U+0020 except for U+0009 tab, 479 * U+000A line feed, and U+000D carriage return. 480 * 481 * @param s the string 482 * @return the resulting string 483 */ stripProblematicControlCharacters(String s)484 private String stripProblematicControlCharacters(String s) { 485 if (s == null || s.isEmpty()) { 486 return s; 487 } 488 return s.codePoints() 489 .filter(c -> (c >= 0x20 || c == 9 || c == 0xA || c == 0xD)) 490 .collect(StringBuilder::new, StringBuilder::appendCodePoint, StringBuilder::append) 491 .toString(); 492 } 493 494 private static final boolean REMOVE_COVERED_KEYWORDS = true; 495 496 /** 497 * Produce a modification of the given annotation by sorting its components and filtering covered keywords. 498 * 499 * Examples: Given "b | a", return "a | b". Given "bear | panda | panda bear", return "bear | panda". 500 * 501 * @param value the string 502 * @return the possibly modified string 503 */ annotationsForDisplay(String value)504 private static String annotationsForDisplay(String value) { 505 TreeSet<String> sorted = new TreeSet<>(Collator.getInstance(ULocale.ROOT)); 506 sorted.addAll(SPLIT_BAR.splitToList(value)); 507 if (REMOVE_COVERED_KEYWORDS) { 508 filterCoveredKeywords(sorted); 509 } 510 value = JOIN_BAR.join(sorted); 511 return value; 512 } 513 514 /** 515 * Filter from the given set some keywords that include spaces, if they duplicate, 516 * or are "covered by", other keywords in the set. 517 * 518 * For example, if the set is {"bear", "panda", "panda bear"} (annotation was "bear | panda | panda bear"), 519 * then remove "panda bear", treating it as "covered" since the set already includes "panda" and "bear". 
520 * 521 * @param sorted the set from which items may be removed 522 */ filterCoveredKeywords(TreeSet<String> sorted)523 public static void filterCoveredKeywords(TreeSet<String> sorted) { 524 // for now, just do single items 525 HashSet<String> toRemove = new HashSet<>(); 526 527 for (String item : sorted) { 528 List<String> list = SPLIT_SPACE.splitToList(item); 529 if (list.size() < 2) { 530 continue; 531 } 532 if (sorted.containsAll(list)) { 533 toRemove.add(item); 534 } 535 } 536 sorted.removeAll(toRemove); 537 } 538 displayUnicodeSet(String value)539 private String displayUnicodeSet(String value) { 540 if (value.startsWith("[") && value.endsWith("]")) { 541 value = value.substring(1, value.length() - 1); 542 } 543 544 value = replace(NEEDS_QUOTE1, value, "$1\\\\$2$3"); 545 value = replace(NEEDS_QUOTE2, value, "$1\\\\$2$3"); 546 547 // if (RTL.containsSome(value) && value.startsWith("[") && value.endsWith("]")) { 548 // return "\u200E[\u200E" + value.substring(1,value.length()-2) + "\u200E]\u200E"; 549 // } 550 return value; 551 } 552 inputUnicodeSet(String path, String value)553 private String inputUnicodeSet(String path, String value) { 554 // clean up the user's input. 555 // first, fix up the '[' 556 value = value.trim(); 557 558 // remove brackets and trim again before regex 559 if (value.startsWith("[")) { 560 value = value.substring(1); 561 } 562 if (value.endsWith("]") && (!value.endsWith("\\]") || value.endsWith("\\\\]"))) { 563 value = value.substring(0, value.length() - 1); 564 } 565 value = value.trim(); 566 567 value = replace(NEEDS_QUOTE1, value, "$1\\\\$2$3"); 568 value = replace(NEEDS_QUOTE2, value, "$1\\\\$2$3"); 569 570 // re-add brackets. 
571 value = "[" + value + "]"; 572 573 UnicodeSet exemplar = new UnicodeSet(value); 574 XPathParts parts = XPathParts.getFrozenInstance(path); 575 if (parts.getElement(2).equals("parseLenients")) { 576 return exemplar.toPattern(false); 577 } 578 final String type = parts.getAttributeValue(-1, "type"); 579 ExemplarType exemplarType = type == null ? ExemplarType.main : ExemplarType.valueOf(type); 580 value = getCleanedUnicodeSet(exemplar, pp, exemplarType); 581 return value; 582 } 583 normalizeWhitespace(String path, String value)584 private String normalizeWhitespace(String path, String value) { 585 // turn all whitespace sequences (including tab and newline, and NBSP for certain paths) 586 // into a single space or a single NBSP depending on path. 587 if ((path.contains("/dateFormatLength") && path.contains("/pattern")) || 588 path.contains("/availableFormats/dateFormatItem") || 589 (path.startsWith("//ldml/dates/timeZoneNames/metazone") && path.contains("/long")) || 590 path.startsWith("//ldml/dates/timeZoneNames/regionFormat") || 591 path.startsWith("//ldml/localeDisplayNames/codePatterns/codePattern") || 592 path.startsWith("//ldml/localeDisplayNames/languages/language") || 593 path.startsWith("//ldml/localeDisplayNames/territories/territory") || 594 path.startsWith("//ldml/localeDisplayNames/types/type") || 595 (path.startsWith("//ldml/numbers/currencies/currency") && path.contains("/displayName")) || 596 (path.contains("/decimalFormatLength[@type=\"long\"]") && path.contains("/pattern")) || 597 path.startsWith("//ldml/posix/messages") || 598 (path.startsWith("//ldml/units/uni") && path.contains("/unitPattern "))) { 599 value = WHITESPACE_AND_NBSP_TO_NORMALIZE.matcher(value).replaceAll(" "); // replace with regular space 600 } else if ((path.contains("/currencies/currency") && (path.contains("/group") || path.contains("/pattern"))) 601 || 602 (path.contains("/currencyFormatLength") && path.contains("/pattern")) || 603 (path.contains("/currencySpacing") && 
path.contains("/insertBetween")) || 604 (path.contains("/decimalFormatLength") && path.contains("/pattern")) || // i.e. the non-long ones 605 (path.contains("/percentFormatLength") && path.contains("/pattern")) || 606 (path.startsWith("//ldml/numbers/symbols") && (path.contains("/group") || path.contains("/nan")))) { 607 value = WHITESPACE_AND_NBSP_TO_NORMALIZE.matcher(value).replaceAll("\u00A0"); // replace with NBSP 608 } else { 609 // in this case don't normalize away NBSP 610 value = WHITESPACE_NO_NBSP_TO_NORMALIZE.matcher(value).replaceAll(" "); // replace with regular space 611 } 612 return value; 613 } 614 normalizeCurrencyDisplayName(String value)615 private String normalizeCurrencyDisplayName(String value) { 616 StringBuilder result = new StringBuilder(); 617 boolean inParentheses = false; 618 for (int i = 0; i < value.length(); i++) { 619 char c = value.charAt(i); 620 if (c == '(') { 621 inParentheses = true; 622 } else if (c == ')') { 623 inParentheses = false; 624 } 625 if (inParentheses && c == '-' && Character.isDigit(value.charAt(i - 1))) { 626 c = 0x2013; /* Replace hyphen-minus with dash for date ranges */ 627 } 628 result.append(c); 629 } 630 return result.toString(); 631 } 632 normalizeApostrophes(String value)633 private String normalizeApostrophes(String value) { 634 // If our DAIP always had a CLDRFile to work with, then we could just check the exemplar set in it to see. 635 // But since we don't, we just maintain the list internally and use it. 
636 if (LANGUAGES_USING_MODIFIER_APOSTROPHE.contains(locale.getLanguage())) { 637 return value.replace('\'', '\u02bc'); 638 } else { 639 char prev = 0; 640 StringBuilder builder = new StringBuilder(); 641 for (char c : value.toCharArray()) { 642 if (c == '\'') { 643 if (Character.isLetter(prev)) { 644 builder.append('\u2019'); 645 } else { 646 builder.append('\u2018'); 647 } 648 } else { 649 builder.append(c); 650 } 651 prev = c; 652 } 653 return builder.toString(); 654 } 655 } 656 normalizeIntervalHyphens(String value)657 private String normalizeIntervalHyphens(String value) { 658 DateTimePatternGenerator.FormatParser fp = new DateTimePatternGenerator.FormatParser(); 659 fp.set(DateIntervalInfo.genPatternInfo(value, false).getFirstPart()); 660 List<Object> items = fp.getItems(); 661 Object last = items.get(items.size() - 1); 662 if (last instanceof String) { 663 String separator = last.toString(); 664 if (separator.contains("-")) { 665 StringBuilder sb = new StringBuilder(); 666 sb.append(DateIntervalInfo.genPatternInfo(value, false).getFirstPart()); 667 if (sb.lastIndexOf(separator) >= 0) { 668 sb.delete(sb.lastIndexOf(separator), sb.length()); 669 sb.append(separator.replace("-", "\u2013")); 670 sb.append(DateIntervalInfo.genPatternInfo(value, false).getSecondPart()); 671 return sb.toString(); 672 } 673 } 674 } 675 return value; 676 } 677 normalizeHyphens(String value)678 private String normalizeHyphens(String value) { 679 int hyphenLocation = value.indexOf("-"); 680 if (hyphenLocation > 0 && 681 Character.isDigit(value.charAt(hyphenLocation - 1)) && 682 hyphenLocation < value.length() - 1 && 683 Character.isDigit(value.charAt(hyphenLocation + 1))) { 684 StringBuilder sb = new StringBuilder(); 685 sb.append(value.substring(0, hyphenLocation)); 686 sb.append("\u2013"); 687 sb.append(value.substring(hyphenLocation + 1)); 688 return sb.toString(); 689 } 690 return value; 691 } 692 standardizeRomanian(String value)693 private String standardizeRomanian(String value) 
{ 694 StringBuilder builder = new StringBuilder(); 695 for (char c : value.toCharArray()) { 696 for (char[] pair : ROMANIAN_CONVERSIONS) { 697 if (c == pair[0]) { 698 c = pair[1]; 699 break; 700 } 701 } 702 builder.append(c); 703 } 704 return builder.toString(); 705 } 706 standardizeKwasio(String value)707 private String standardizeKwasio(String value) { 708 StringBuilder builder = new StringBuilder(); 709 for (char c : value.toCharArray()) { 710 for (char[] pair : KWASIO_CONVERSIONS) { 711 if (c == pair[0]) { 712 c = pair[1]; 713 break; 714 } 715 } 716 builder.append(c); 717 } 718 return builder.toString(); 719 } 720 721 // Use the myanmar-tools detector. standardizeMyanmar(String value)722 private String standardizeMyanmar(String value) { 723 if (detector.getZawgyiProbability(value) > 0.90) { 724 return zawgyiUnicodeTransliterator.transform(value); 725 } 726 return value; 727 } 728 standardizeNgomba(String value)729 private String standardizeNgomba(String value) { 730 StringBuilder builder = new StringBuilder(); 731 char[] charArray = value.toCharArray(); 732 for (int i = 0; i < charArray.length; i++) { 733 char c = charArray[i]; 734 boolean convertedSaltillo = false; 735 for (char[] pair : NGOMBA_CONVERSIONS) { 736 if (c == pair[0]) { 737 c = pair[1]; 738 if (c == '\uA78C') { 739 convertedSaltillo = true; 740 } 741 break; 742 } 743 } 744 if (convertedSaltillo && 745 ((i > 0 && i < charArray.length - 1 && Character.isUpperCase(charArray[i - 1]) && Character.isUpperCase(charArray[i + 1])) || 746 (i > 1 && Character.isUpperCase(charArray[i - 1]) && Character.isUpperCase(charArray[i - 2])))) { 747 c = '\uA78B'; // UPPER CASE SALTILLO 748 } 749 builder.append(c); 750 } 751 return builder.toString(); 752 } 753 replaceChars(String path, String value, char[][] charsToReplace, boolean skipAuxExemplars)754 private String replaceChars(String path, String value, char[][] charsToReplace, boolean skipAuxExemplars) { 755 if (skipAuxExemplars && 
path.contains("/exemplarCharacters[@type=\"auxiliary\"]")) { 756 return value; 757 } 758 StringBuilder builder = new StringBuilder(); 759 for (char c : value.toCharArray()) { 760 for (char[] pair : charsToReplace) { 761 if (c == pair[0]) { 762 c = pair[1]; 763 break; 764 } 765 } 766 builder.append(c); 767 } 768 return builder.toString(); 769 } 770 standardizeSwissGerman(String value)771 private String standardizeSwissGerman(String value) { 772 return value.replaceAll("\u00DF", "ss"); 773 } 774 standardizeCatalan(String value)775 private String standardizeCatalan(String value) { 776 StringBuilder builder = new StringBuilder(); 777 for (char c : value.toCharArray()) { 778 boolean didSubstitute = false; 779 for (char[] triple : CATALAN_CONVERSIONS) { 780 if (c == triple[0]) { 781 builder.append(triple[1]); 782 builder.append(triple[2]); 783 didSubstitute = true; 784 break; 785 } 786 } 787 if (!didSubstitute) { 788 builder.append(c); 789 } 790 } 791 return builder.toString(); 792 } 793 replace(Pattern pattern, String value, String replacement)794 private String replace(Pattern pattern, String value, String replacement) { 795 String value2 = pattern.matcher(value).replaceAll(replacement); 796 if (DEBUG_DAIP && !value.equals(value2)) { 797 System.out.println("\n" + value + " => " + value2); 798 } 799 return value2; 800 } 801 802 private static Pattern UNNORMALIZED_MALAYALAM = PatternCache.get( 803 "(\u0D23|\u0D28|\u0D30|\u0D32|\u0D33|\u0D15)\u0D4D\u200D"); 804 805 private static Map<Character, Character> NORMALIZING_MAP = Builder.with(new HashMap<Character, Character>()) 806 .put('\u0D23', '\u0D7A').put('\u0D28', '\u0D7B') 807 .put('\u0D30', '\u0D7C').put('\u0D32', '\u0D7D') 808 .put('\u0D33', '\u0D7E').put('\u0D15', '\u0D7F').get(); 809 810 /** 811 * Normalizes the Malayalam characters in the specified input. 
812 * 813 * @param value 814 * the input to be normalized 815 * @return 816 */ normalizeMalayalam(String value)817 private String normalizeMalayalam(String value) { 818 // Normalize Malayalam characters. 819 Matcher matcher = UNNORMALIZED_MALAYALAM.matcher(value); 820 if (matcher.find()) { 821 StringBuffer buffer = new StringBuffer(); 822 int start = 0; 823 do { 824 buffer.append(value.substring(start, matcher.start(0))); 825 char codePoint = matcher.group(1).charAt(0); 826 buffer.append(NORMALIZING_MAP.get(codePoint)); 827 start = matcher.end(0); 828 } while (matcher.find()); 829 buffer.append(value.substring(start)); 830 value = buffer.toString(); 831 } 832 return value; 833 } 834 835 static final Transform<String, String> fixArabicPresentation = Transliterator.getInstance( 836 "[[:block=Arabic_Presentation_Forms_A:][:block=Arabic_Presentation_Forms_B:]] nfkc"); 837 838 /** 839 * Normalizes the Arabic presentation forms characters in the specified input. 840 * 841 * @param value 842 * the input to be normalized 843 * @return 844 */ replaceArabicPresentationForms(String value)845 private String replaceArabicPresentationForms(String value) { 846 value = fixArabicPresentation.transform(value); 847 return value; 848 } 849 850 static Pattern ADLAM_MISNASALIZED = PatternCache.get("([])['’‘]([])"); 851 public static String ADLAM_NASALIZATION = ""; // U+1E94B (Unicode 12.0) 852 fixAdlamNasalization(String fromString)853 public static String fixAdlamNasalization(String fromString) { 854 return ADLAM_MISNASALIZED.matcher(fromString) 855 .replaceAll("$1"+ADLAM_NASALIZATION+"$2"); // replace quote with 856 } 857 858 static Pattern REMOVE_QUOTE1 = PatternCache.get("(\\s)(\\\\[-\\}\\]\\&])()"); 859 static Pattern REMOVE_QUOTE2 = PatternCache.get("(\\\\[\\-\\{\\[\\&])(\\s)"); // ([^\\])([\\-\\{\\[])(\\s) 860 861 static Pattern NEEDS_QUOTE1 = PatternCache.get("(\\s|$)([-\\}\\]\\&])()"); 862 static Pattern NEEDS_QUOTE2 = PatternCache.get("([^\\\\])([\\-\\{\\[\\&])(\\s)"); // 
([^\\])([\\-\\{\\[])(\\s) 863 getCleanedUnicodeSet(UnicodeSet exemplar, UnicodeSetPrettyPrinter prettyPrinter, ExemplarType exemplarType)864 public static String getCleanedUnicodeSet(UnicodeSet exemplar, UnicodeSetPrettyPrinter prettyPrinter, 865 ExemplarType exemplarType) { 866 if (prettyPrinter == null) { 867 return exemplar.toPattern(false); 868 } 869 String value; 870 prettyPrinter.setCompressRanges(exemplar.size() > 300); 871 value = exemplar.toPattern(false); 872 UnicodeSet toAdd = new UnicodeSet(); 873 874 for (UnicodeSetIterator usi = new UnicodeSetIterator(exemplar); usi.next();) { 875 String string = usi.getString(); 876 if (string.equals("ß") || string.equals("İ")) { 877 toAdd.add(string); 878 continue; 879 } 880 switch (string) { 881 case "\u2011": toAdd.add("-"); break; // nobreak hyphen 882 case "-": toAdd.add("\u2011"); break; // nobreak hyphen 883 884 case " ": toAdd.add("\u00a0"); break; // nobreak space 885 case "\u00a0": toAdd.add(" "); break; // nobreak space 886 887 case "\u202F": toAdd.add("\u2009"); break; // nobreak narrow space 888 case "\u2009": toAdd.add("\u202F"); break; // nobreak narrow space 889 } 890 if (exemplarType.convertUppercase) { 891 string = UCharacter.toLowerCase(ULocale.ENGLISH, string); 892 } 893 toAdd.add(string); 894 String composed = Normalizer.compose(string, false); 895 if (!string.equals(composed)) { 896 toAdd.add(composed); 897 } 898 } 899 900 toAdd.removeAll(exemplarType.toRemove); 901 902 if (DEBUG_DAIP && !toAdd.equals(exemplar)) { 903 UnicodeSet oldOnly = new UnicodeSet(exemplar).removeAll(toAdd); 904 UnicodeSet newOnly = new UnicodeSet(toAdd).removeAll(exemplar); 905 System.out.println("Exemplar:\t" + exemplarType + ",\tremoved\t" + oldOnly + ",\tadded\t" + newOnly); 906 } 907 908 String fixedExemplar = prettyPrinter.format(toAdd); 909 UnicodeSet doubleCheck = new UnicodeSet(fixedExemplar); 910 if (!toAdd.equals(doubleCheck)) { 911 // something went wrong, leave as is 912 } else if 
(!value.equals(fixedExemplar)) { // put in this condition just for debugging 913 if (DEBUG_DAIP) { 914 System.out.println(TestMetadata.showDifference( 915 With.codePoints(value), 916 With.codePoints(fixedExemplar), 917 "\n")); 918 } 919 value = fixedExemplar; 920 } 921 return value; 922 } 923 924 /** 925 * @return a canonical numeric pattern, based on the type, and the isPOSIX flag. The latter is set for en_US_POSIX. 926 */ 927 static final Splitter SEMI_SPLITTER = Splitter.on(';').trimResults(); 928 getCanonicalPattern(String inpattern, NumericType type, boolean isPOSIX)929 public static String getCanonicalPattern(String inpattern, NumericType type, boolean isPOSIX) { 930 // TODO fix later to properly handle quoted ; 931 932 DecimalFormat df = new DecimalFormat(inpattern); 933 if (type == NumericType.DECIMAL_ABBREVIATED || type == NumericType.CURRENCY_ABBREVIATED 934 || CldrUtility.INHERITANCE_MARKER.equals(inpattern)) { 935 return inpattern; // TODO fix when ICU bug is fixed 936 // df.setMaximumFractionDigits(df.getMinimumFractionDigits()); 937 // df.setMaximumIntegerDigits(Math.max(1, df.getMinimumIntegerDigits())); 938 } else { 939 // int decimals = type == CURRENCY_TYPE ? 2 : 1; 940 int[] digits = isPOSIX ? type.posixDigitCount : type.digitCount; 941 df.setMinimumIntegerDigits(digits[0]); 942 df.setMinimumFractionDigits(digits[1]); 943 df.setMaximumFractionDigits(digits[2]); 944 } 945 String pattern = df.toPattern(); 946 List<String> parts = SEMI_SPLITTER.splitToList(pattern); 947 String pattern2 = parts.get(0); 948 if (parts.size() > 1) { 949 pattern2 += ";" + parts.get(1); 950 } 951 if (!pattern2.equals(pattern)) { 952 pattern = pattern2; 953 } 954 // int pos = pattern.indexOf(';'); 955 // if (pos < 0) return pattern + ";-" + pattern; 956 return pattern; 957 } 958 959 /* 960 * This tests what type a numeric pattern is. 
/**
 * The kind of numeric pattern an xpath refers to.
 * <p>
 * The non-abbreviated numeric constants carry two three-element digit-count arrays, one
 * for regular locales and one for POSIX; {@code getCanonicalPattern} applies the elements
 * as minimum integer digits, minimum fraction digits and maximum fraction digits. The
 * abbreviated constants and {@link #NOT_NUMERIC} carry no counts (both arrays are null).
 */
public enum NumericType {
    CURRENCY(new int[] { 1, 2, 2 }, new int[] { 1, 2, 2 }),
    CURRENCY_ABBREVIATED(),
    DECIMAL(new int[] { 1, 0, 3 }, new int[] { 1, 0, 6 }),
    DECIMAL_ABBREVIATED(),
    PERCENT(new int[] { 1, 0, 0 }, new int[] { 1, 0, 0 }),
    SCIENTIFIC(new int[] { 0, 0, 0 }, new int[] { 1, 6, 6 }),
    NOT_NUMERIC;

    /** Group 1 is the path segment after {@code numbers/}; group 2 is the format family, when there is one. */
    private static final Pattern NUMBER_PATH = Pattern
        .compile("//ldml/numbers/((currency|decimal|percent|scientific)Formats|currencies/currency).*");

    private final int[] digitCount;
    private final int[] posixDigitCount;

    /** For constants that have no digit counts. */
    private NumericType() {
        this(null, null);
    }

    private NumericType(int[] digitCount, int[] posixDigitCount) {
        this.digitCount = digitCount;
        this.posixDigitCount = posixDigitCount;
    }

    /**
     * Determines the numeric type of an xpath.
     *
     * @param xpath the path to classify
     * @return {@link #NOT_NUMERIC} when the path has no {@code /pattern} part or is not a
     *         number-format or currency path; {@link #CURRENCY} for currency paths;
     *         otherwise the constant named after the format family, switched to its
     *         abbreviated variant when the path contains {@code ="1000}
     * @throws IllegalArgumentException if the path contains {@code ="1000} but the format
     *         family is neither decimal nor currency
     */
    public static NumericType getNumericType(String xpath) {
        // Cheap substring test first; the matcher is only needed for pattern paths.
        if (xpath.indexOf("/pattern") < 0) {
            return NOT_NUMERIC;
        }
        Matcher matcher = NUMBER_PATH.matcher(xpath);
        if (!matcher.matches()) {
            return NOT_NUMERIC;
        }
        if (matcher.group(1).equals("currencies/currency")) {
            return CURRENCY;
        }
        // Locale.ROOT keeps the constant lookup independent of the default locale; with a
        // Turkish default, "decimal".toUpperCase() yields a dotted capital I and valueOf fails.
        NumericType type = NumericType.valueOf(matcher.group(2).toUpperCase(java.util.Locale.ROOT));
        if (xpath.contains("=\"1000")) {
            if (type == DECIMAL) {
                type = DECIMAL_ABBREVIATED;
            } else if (type == CURRENCY) {
                type = CURRENCY_ABBREVIATED;
            } else {
                throw new IllegalArgumentException("Internal Error");
            }
        }
        return type;
    }

    /**
     * @return the digit counts for regular locales, or null for constants without counts.
     *         This is the internal array, not a copy.
     */
    public int[] getDigitCount() {
        return digitCount;
    }

    /**
     * @return the digit counts for POSIX, or null for constants without counts.
     *         This is the internal array, not a copy.
     */
    public int[] getPosixDigitCount() {
        return posixDigitCount;
    }
}