1 /* Copyright (C) 2007-2013 Google and others. All Rights Reserved. */ 2 /* Copyright (C) 2007-2013 IBM Corp. and others. All Rights Reserved. */ 3 4 package org.unicode.cldr.test; 5 6 import java.util.Arrays; 7 import java.util.HashMap; 8 import java.util.HashSet; 9 import java.util.List; 10 import java.util.Map; 11 import java.util.Set; 12 import java.util.TreeSet; 13 import java.util.regex.Matcher; 14 import java.util.regex.Pattern; 15 16 import org.unicode.cldr.test.CheckExemplars.ExemplarType; 17 import org.unicode.cldr.util.Builder; 18 import org.unicode.cldr.util.CLDRFile; 19 import org.unicode.cldr.util.CLDRLocale; 20 import org.unicode.cldr.util.CldrUtility; 21 import org.unicode.cldr.util.DateTimeCanonicalizer; 22 import org.unicode.cldr.util.DateTimeCanonicalizer.DateTimePatternType; 23 import org.unicode.cldr.util.Emoji; 24 import org.unicode.cldr.util.ICUServiceBuilder; 25 import org.unicode.cldr.util.PatternCache; 26 import org.unicode.cldr.util.UnicodeSetPrettyPrinter; 27 import org.unicode.cldr.util.With; 28 import org.unicode.cldr.util.XPathParts; 29 30 import com.google.common.base.Joiner; 31 import com.google.common.base.Splitter; 32 import com.google.myanmartools.ZawgyiDetector; 33 import com.ibm.icu.lang.UCharacter; 34 import com.ibm.icu.text.Collator; 35 import com.ibm.icu.text.DateIntervalInfo; 36 import com.ibm.icu.text.DateTimePatternGenerator; 37 import com.ibm.icu.text.DecimalFormat; 38 import com.ibm.icu.text.Normalizer; 39 import com.ibm.icu.text.RuleBasedCollator; 40 import com.ibm.icu.text.Transform; 41 import com.ibm.icu.text.Transliterator; 42 import com.ibm.icu.text.UnicodeSet; 43 import com.ibm.icu.text.UnicodeSetIterator; 44 import com.ibm.icu.util.ULocale; 45 46 /** 47 * Class for processing the input and output of CLDR data for use in the 48 * Survey Tool and other tools. 
49 */ 50 public class DisplayAndInputProcessor { 51 52 private static final boolean FIX_YEARS = true; 53 54 public static final boolean DEBUG_DAIP = CldrUtility.getProperty("DEBUG_DAIP", false); 55 56 public static final UnicodeSet RTL = new UnicodeSet("[[:Bidi_Class=Arabic_Letter:][:Bidi_Class=Right_To_Left:]]") 57 .freeze(); 58 59 public static final UnicodeSet TO_QUOTE = (UnicodeSet) new UnicodeSet( 60 "[[:Cn:]" + 61 "[:Default_Ignorable_Code_Point:]" + 62 "[:patternwhitespace:]" + 63 "[:Me:][:Mn:]]" // add non-spacing marks 64 ).freeze(); 65 66 public static final Pattern NUMBER_FORMAT_XPATH = Pattern 67 .compile("//ldml/numbers/.*Format\\[@type=\"standard\"]/pattern.*"); 68 69 public static final Pattern NUMBER_SEPARATOR_PATTERN = Pattern 70 .compile("//ldml/numbers/symbols.*/(decimal|group)"); 71 72 private static final Pattern APOSTROPHE_SKIP_PATHS = PatternCache.get("//ldml/(" 73 + "localeDisplayNames/languages/language\\[@type=\"mic\"].*|" 74 + "characters/.*|" 75 + "delimiters/.*|" 76 + "dates/.+/(pattern|intervalFormatItem|dateFormatItem).*|" 77 + "units/.+/unitPattern.*|" 78 + "units/.+/durationUnitPattern.*|" 79 + "numbers/symbols.*|" 80 + "numbers/miscPatterns.*|" 81 + "numbers/(decimal|currency|percent|scientific)Formats.+/(decimal|currency|percent|scientific)Format.*)"); 82 private static final Pattern INTERVAL_FORMAT_PATHS = PatternCache.get("//ldml/dates/.+/intervalFormatItem.*"); 83 private static final Pattern NON_DECIMAL_PERIOD = PatternCache.get("(?<![0#'])\\.(?![0#'])"); 84 private static final Pattern WHITESPACE_NO_NBSP_TO_NORMALIZE = PatternCache.get("\\s+"); // string of whitespace not 85 // including NBSP, i.e. [ 86 // \t\n\r]+ 87 private static final Pattern WHITESPACE_AND_NBSP_TO_NORMALIZE = PatternCache.get("[\\s\\u00A0]+"); // string of 88 // whitespace 89 // including NBSP, 90 // i.e. 
[ 91 // \u00A0\t\n\r]+ 92 private static final UnicodeSet UNICODE_WHITESPACE = new UnicodeSet("[:whitespace:]").freeze(); 93 94 private static final CLDRLocale MALAYALAM = CLDRLocale.getInstance("ml"); 95 private static final CLDRLocale ROMANIAN = CLDRLocale.getInstance("ro"); 96 private static final CLDRLocale CATALAN = CLDRLocale.getInstance("ca"); 97 private static final CLDRLocale NGOMBA = CLDRLocale.getInstance("jgo"); 98 private static final CLDRLocale KWASIO = CLDRLocale.getInstance("nmg"); 99 private static final CLDRLocale HEBREW = CLDRLocale.getInstance("he"); 100 private static final CLDRLocale MYANMAR = CLDRLocale.getInstance("my"); 101 private static final CLDRLocale KYRGYZ = CLDRLocale.getInstance("ky"); 102 private static final CLDRLocale URDU = CLDRLocale.getInstance("ur"); 103 private static final CLDRLocale PASHTO = CLDRLocale.getInstance("ps"); 104 private static final CLDRLocale FARSI = CLDRLocale.getInstance("fa"); 105 private static final CLDRLocale GERMAN_SWITZERLAND = CLDRLocale.getInstance("de_CH"); 106 private static final CLDRLocale SWISS_GERMAN = CLDRLocale.getInstance("gsw"); 107 public static final Set<String> LANGUAGES_USING_MODIFIER_APOSTROPHE = new HashSet<String>( 108 Arrays.asList("br", "bss", "cch", "gn", "ha", "ha_Latn", "lkt", "mgo", "moh", "nnh", "qu", "quc", "uk", "uz", "uz_Latn")); 109 110 // Ş ş Ţ ţ => Ș ș Ț ț 111 private static final char[][] ROMANIAN_CONVERSIONS = { 112 { '\u015E', '\u0218' }, { '\u015F', '\u0219' }, { '\u0162', '\u021A' }, 113 { '\u0163', '\u021B' } }; 114 115 private static final char[][] CATALAN_CONVERSIONS = { 116 { '\u013F', '\u004C', '\u00B7' }, // Ŀ -> L· 117 { '\u0140', '\u006C', '\u00B7' } }; // ŀ -> l· 118 119 private static final char[][] NGOMBA_CONVERSIONS = { 120 { '\u0251', '\u0061' }, { '\u0261', '\u0067' }, // ɑ -> a , ɡ -> g , See ticket #5691 121 { '\u2019', '\uA78C' }, { '\u02BC', '\uA78C' } }; // Saltillo, see ticket #6805 122 123 private static final char[][] KWASIO_CONVERSIONS = { 
124 { '\u0306', '\u030C' }, // See ticket #6571, use caron instead of breve 125 { '\u0103', '\u01CE' }, { '\u0102', '\u01CD' }, // a-breve -> a-caron 126 { '\u0115', '\u011B' }, { '\u011A', '\u01CD' }, // e-breve -> e-caron 127 { '\u012D', '\u01D0' }, { '\u012C', '\u01CF' }, // i-breve -> i-caron 128 { '\u014F', '\u01D2' }, { '\u014E', '\u01D1' }, // o-breve -> o-caron 129 { '\u016D', '\u01D4' }, { '\u016C', '\u01D3' } // u-breve -> u-caron 130 }; 131 132 private static final char[][] HEBREW_CONVERSIONS = { 133 { '\'', '\u05F3' }, { '"', '\u05F4' } }; // ' -> geresh " -> gershayim 134 135 private static final char[][] KYRGYZ_CONVERSIONS = { 136 { 'ӊ', 'ң' }, { 'Ӊ', 'Ң' } }; // right modifier 137 138 private static final char[][] URDU_PLUS_CONVERSIONS = { 139 { '\u0643', '\u06A9' }}; // wrong char 140 141 private static final ZawgyiDetector detector = new ZawgyiDetector(); 142 private static final Transliterator zawgyiUnicodeTransliterator = 143 Transliterator.getInstance("Zawgyi-my"); 144 145 private Collator col; 146 147 private Collator spaceCol; 148 149 private UnicodeSetPrettyPrinter pp = null; 150 151 final private CLDRLocale locale; 152 private boolean isPosix; 153 154 /** 155 * Constructor, taking cldrFile. 
156 * 157 * @param cldrFileToCheck 158 */ DisplayAndInputProcessor(CLDRFile cldrFileToCheck, boolean needsCollator)159 public DisplayAndInputProcessor(CLDRFile cldrFileToCheck, boolean needsCollator) { 160 init(this.locale = CLDRLocale.getInstance(cldrFileToCheck.getLocaleID()), needsCollator); 161 } 162 DisplayAndInputProcessor(CLDRFile cldrFileToCheck)163 public DisplayAndInputProcessor(CLDRFile cldrFileToCheck) { 164 init(this.locale = CLDRLocale.getInstance(cldrFileToCheck.getLocaleID()), true); 165 } 166 init(CLDRLocale locale, boolean needsCollator)167 void init(CLDRLocale locale, boolean needsCollator) { 168 isPosix = locale.toString().indexOf("POSIX") >= 0; 169 if (needsCollator) { 170 ICUServiceBuilder isb = null; 171 try { 172 isb = ICUServiceBuilder.forLocale(locale); 173 } catch (Exception e) { 174 } 175 176 if (isb != null) { 177 try { 178 col = isb.getRuleBasedCollator(); 179 } catch (Exception e) { 180 col = Collator.getInstance(ULocale.ROOT); 181 } 182 } else { 183 col = Collator.getInstance(ULocale.ROOT); 184 } 185 186 spaceCol = Collator.getInstance(locale.toULocale()); 187 if (spaceCol instanceof RuleBasedCollator) { 188 ((RuleBasedCollator) spaceCol).setAlternateHandlingShifted(false); 189 } 190 pp = new UnicodeSetPrettyPrinter().setOrdering(Collator.getInstance(ULocale.ROOT)) 191 .setSpaceComparator(Collator.getInstance(ULocale.ROOT).setStrength2(Collator.PRIMARY)) 192 .setCompressRanges(true) 193 .setToQuote(new UnicodeSet(TO_QUOTE)) 194 .setOrdering(col) 195 .setSpaceComparator(spaceCol); 196 } 197 } 198 getPrettyPrinter()199 public UnicodeSetPrettyPrinter getPrettyPrinter() { 200 return pp; 201 } 202 203 /** 204 * Constructor, taking locale. 205 * 206 * @param locale 207 */ DisplayAndInputProcessor(ULocale locale, boolean needsCollator)208 public DisplayAndInputProcessor(ULocale locale, boolean needsCollator) { 209 init(this.locale = CLDRLocale.getInstance(locale), needsCollator); 210 } 211 212 /** 213 * Constructor, taking locale. 
214 * 215 * @param locale 216 */ DisplayAndInputProcessor(ULocale locale)217 public DisplayAndInputProcessor(ULocale locale) { 218 init(this.locale = CLDRLocale.getInstance(locale), true); 219 } 220 221 /** 222 * Constructor, taking locale. 223 * 224 * @param locale 225 */ DisplayAndInputProcessor(CLDRLocale locale, boolean needsCollator)226 public DisplayAndInputProcessor(CLDRLocale locale, boolean needsCollator) { 227 init(this.locale = locale, needsCollator); 228 } 229 230 /** 231 * Constructor, taking locale. 232 * 233 * @param locale 234 */ DisplayAndInputProcessor(CLDRLocale locale)235 public DisplayAndInputProcessor(CLDRLocale locale) { 236 init(this.locale = locale, true); 237 } 238 239 /** 240 * Process the value for display. The result is a string for display in the 241 * Survey tool or similar program. 242 * 243 * @param path 244 * @param value 245 * @param fullPath 246 * @return 247 */ processForDisplay(String path, String value)248 public synchronized String processForDisplay(String path, String value) { 249 value = Normalizer.compose(value, false); // Always normalize all text to NFC. 250 if (hasUnicodeSetValue(path)) { 251 value = displayUnicodeSet(value); 252 } else if (path.contains("stopword")) { 253 return value.trim().isEmpty() ? "NONE" : value; 254 } else { 255 NumericType numericType = NumericType.getNumericType(path); 256 if (numericType != NumericType.NOT_NUMERIC) { 257 // Canonicalize existing values that aren't canonicalized yet. 258 // New values will be canonicalized on input using processInput(). 
259 try { 260 value = getCanonicalPattern(value, numericType, isPosix); 261 } catch (IllegalArgumentException e) { 262 if (DEBUG_DAIP) System.err.println("Illegal pattern: " + value); 263 } 264 if (numericType != NumericType.CURRENCY && numericType != NumericType.CURRENCY_ABBREVIATED) { 265 value = value.replace("'", ""); 266 } 267 } 268 } 269 // Fix up any apostrophes in number symbols 270 if (NUMBER_SEPARATOR_PATTERN.matcher(path).matches()) { 271 value = value.replace('\'', '\u2019'); 272 } 273 // Fix up any apostrophes as appropriate (Don't do so for things like date patterns... 274 if (!APOSTROPHE_SKIP_PATHS.matcher(path).matches()) { 275 value = normalizeApostrophes(value); 276 } 277 // Fix up hyphens, replacing with N-dash as appropriate 278 if (INTERVAL_FORMAT_PATHS.matcher(path).matches()) { 279 value = normalizeIntervalHyphens(value); 280 } else { 281 value = normalizeHyphens(value); 282 } 283 return value; 284 } 285 hasUnicodeSetValue(String path)286 private boolean hasUnicodeSetValue(String path) { 287 return path.startsWith("//ldml/characters/exemplarCharacters") || path.startsWith("//ldml/characters/parseLenients"); 288 } 289 290 static final UnicodeSet WHITESPACE = new UnicodeSet("[:whitespace:]").freeze(); 291 static final DateTimeCanonicalizer dtc = new DateTimeCanonicalizer(FIX_YEARS); 292 293 public static final Splitter SPLIT_BAR = Splitter.on('|').trimResults().omitEmptyStrings(); 294 static final Splitter SPLIT_SPACE = Splitter.on(' ').trimResults().omitEmptyStrings(); 295 static final Joiner JOIN_BAR = Joiner.on(" | "); 296 297 /** 298 * Process the value for input. The result is a cleaned-up value. For example, 299 * an exemplar set is modified to be in the normal format, and any missing [ ] 300 * are added (a common omission on entry). If there are any failures then the 301 * original value is returned, so that the proper error message can be given. 
302 * 303 * @param path 304 * @param value 305 * @param internalException 306 * TODO 307 * @param fullPath 308 * @return 309 */ processInput(String path, String value, Exception[] internalException)310 public synchronized String processInput(String path, String value, Exception[] internalException) { 311 String original = value; 312 value = Normalizer.compose(value, false); // Always normalize all input to NFC. 313 if (internalException != null) { 314 internalException[0] = null; 315 } 316 try { 317 // Normalise Malayalam characters. 318 boolean isUnicodeSet = hasUnicodeSetValue(path); 319 if (locale.childOf(MALAYALAM)) { 320 String newvalue = normalizeMalayalam(value); 321 if (DEBUG_DAIP) System.out.println("DAIP: Normalized Malayalam '" + value + "' to '" + newvalue + "'"); 322 value = newvalue; 323 } else if (locale.childOf(ROMANIAN) && !isUnicodeSet) { 324 value = standardizeRomanian(value); 325 } else if (locale.childOf(CATALAN) && !isUnicodeSet) { 326 value = standardizeCatalan(value); 327 } else if (locale.childOf(NGOMBA) && !isUnicodeSet) { 328 value = standardizeNgomba(value); 329 } else if (locale.childOf(KWASIO) && !isUnicodeSet) { 330 value = standardizeKwasio(value); 331 } else if (locale.childOf(HEBREW) && !APOSTROPHE_SKIP_PATHS.matcher(path).matches()) { 332 value = replaceChars(path, value, HEBREW_CONVERSIONS, false); 333 } else if ((locale.childOf(SWISS_GERMAN) || locale.childOf(GERMAN_SWITZERLAND)) && !isUnicodeSet) { 334 value = standardizeSwissGerman(value); 335 } else if (locale.childOf(MYANMAR) && !isUnicodeSet) { 336 value = standardizeMyanmar(value); 337 } else if (locale.childOf(KYRGYZ)) { 338 value = replaceChars(path, value, KYRGYZ_CONVERSIONS, false); 339 } else if (locale.childOf(URDU) || locale.childOf(PASHTO) || locale.childOf(FARSI)) { 340 value = replaceChars(path, value, URDU_PLUS_CONVERSIONS, true); 341 } 342 343 if (UNICODE_WHITESPACE.containsSome(value)) { 344 value = normalizeWhitespace(path, value); 345 } 346 347 // all of our 
values should not have leading or trailing spaces, except insertBetween 348 if (!path.contains("/insertBetween") && !isUnicodeSet) { 349 value = value.trim(); 350 } 351 352 // fix grouping separator if space 353 if (path.startsWith("//ldml/numbers/symbols") && !path.contains("/alias")) { 354 if (value.isEmpty()) { 355 value = "\u00A0"; 356 } 357 value = value.replace(' ', '\u00A0'); 358 } 359 360 // fix date patterns 361 DateTimePatternType datetimePatternType = DateTimePatternType.fromPath(path); 362 if (DateTimePatternType.STOCK_AVAILABLE_INTERVAL_PATTERNS.contains(datetimePatternType)) { 363 try { 364 value = dtc.getCanonicalDatePattern(path, value, datetimePatternType); 365 } catch (IllegalArgumentException ex) { 366 return value; 367 } 368 } 369 370 if (path.startsWith("//ldml/numbers/currencies/currency") && path.contains("displayName")) { 371 value = normalizeCurrencyDisplayName(value); 372 } 373 NumericType numericType = NumericType.getNumericType(path); 374 if (numericType != NumericType.NOT_NUMERIC) { 375 if (numericType == NumericType.CURRENCY) { 376 value = value.replaceAll(" ", "\u00A0"); 377 if (numericType == NumericType.CURRENCY_ABBREVIATED) { 378 value = value.replaceAll("0\\.0+", "0"); 379 } 380 } else { 381 value = value.replaceAll("([%\u00A4]) ", "$1\u00A0") 382 .replaceAll(" ([%\u00A4])", "\u00A0$1"); 383 value = replace(NON_DECIMAL_PERIOD, value, "'.'"); 384 if (numericType == NumericType.DECIMAL_ABBREVIATED) { 385 value = value.replaceAll("0\\.0+", "0"); 386 } 387 } 388 value = getCanonicalPattern(value, numericType, isPosix); 389 } 390 391 // fix [,] 392 if (path.startsWith("//ldml/localeDisplayNames/languages/language") 393 || path.startsWith("//ldml/localeDisplayNames/scripts/script") 394 || path.startsWith("//ldml/localeDisplayNames/territories/territory") 395 || path.startsWith("//ldml/localeDisplayNames/variants/variant") 396 || path.startsWith("//ldml/localeDisplayNames/keys/key") 397 || 
path.startsWith("//ldml/localeDisplayNames/types/type")) { 398 value = value.replace('[', '(').replace(']', ')').replace('[', '(').replace(']', ')'); 399 } 400 401 // Normalize two single quotes for the inches symbol. 402 if (path.contains("/units")) { 403 value = value.replace("''", "″"); 404 } 405 406 // check specific cases 407 if (isUnicodeSet) { 408 value = inputUnicodeSet(path, value); 409 } else if (path.contains("stopword")) { 410 if (value.equals("NONE")) { 411 value = ""; 412 } 413 } 414 415 // Normalize ellipsis data. 416 if (path.startsWith("//ldml/characters/ellipsis")) { 417 value = value.replace("...", "…"); 418 } 419 420 // Replace Arabic presentation forms with their nominal counterparts 421 value = replaceArabicPresentationForms(value); 422 423 // Fix up any apostrophes as appropriate (Don't do so for things like date patterns... 424 if (!APOSTROPHE_SKIP_PATHS.matcher(path).matches()) { 425 value = normalizeApostrophes(value); 426 } 427 // Fix up any apostrophes in number symbols 428 if (NUMBER_SEPARATOR_PATTERN.matcher(path).matches()) { 429 value = value.replace('\'', '\u2019'); 430 } 431 // Fix up hyphens, replacing with N-dash as appropriate 432 if (INTERVAL_FORMAT_PATHS.matcher(path).matches()) { 433 value = normalizeIntervalHyphens(value); 434 } else if (!isUnicodeSet) { 435 value = normalizeHyphens(value); 436 } 437 438 if (path.startsWith("//ldml/annotations/annotation")) { 439 if (path.contains(Emoji.TYPE_TTS)) { 440 // The row has something like " -name" in the first column. Cf. namePath, getNamePaths. 441 // Normally the value is like "zebra" or "unicorn face", without "|". 442 // If the user enters a value with "|", discard anything after "|"; e.g., change "a | b | c" to "a". 443 value = SPLIT_BAR.split(value).iterator().next(); 444 } else { 445 // The row has something like " –keywords" in the first column. Cf. keywordPath, getKeywordPaths. 446 // Normally the value is like "stripe | zebra", with "|". 
447 value = annotationsForDisplay(value); 448 } 449 } 450 451 return value; 452 } catch (RuntimeException e) { 453 if (internalException != null) { 454 internalException[0] = e; 455 } 456 return original; 457 } 458 } 459 460 private static final boolean REMOVE_COVERED_KEYWORDS = true; 461 462 /** 463 * Produce a modification of the given annotation by sorting its components and filtering covered keywords. 464 * 465 * Examples: Given "b | a", return "a | b". Given "bear | panda | panda bear", return "bear | panda". 466 * 467 * @param value the string 468 * @return the possibly modified string 469 */ annotationsForDisplay(String value)470 private static String annotationsForDisplay(String value) { 471 TreeSet<String> sorted = new TreeSet<>(Collator.getInstance(ULocale.ROOT)); 472 sorted.addAll(SPLIT_BAR.splitToList(value)); 473 if (REMOVE_COVERED_KEYWORDS) { 474 filterCoveredKeywords(sorted); 475 } 476 value = JOIN_BAR.join(sorted); 477 return value; 478 } 479 480 /** 481 * Filter from the given set some keywords that include spaces, if they duplicate, 482 * or are "covered by", other keywords in the set. 483 * 484 * For example, if the set is {"bear", "panda", "panda bear"} (annotation was "bear | panda | panda bear"), 485 * then remove "panda bear", treating it as "covered" since the set already includes "panda" and "bear". 
486 * 487 * @param sorted the set from which items may be removed 488 */ filterCoveredKeywords(TreeSet<String> sorted)489 public static void filterCoveredKeywords(TreeSet<String> sorted) { 490 // for now, just do single items 491 HashSet<String> toRemove = new HashSet<>(); 492 493 for (String item : sorted) { 494 List<String> list = SPLIT_SPACE.splitToList(item); 495 if (list.size() < 2) { 496 continue; 497 } 498 if (sorted.containsAll(list)) { 499 toRemove.add(item); 500 } 501 } 502 sorted.removeAll(toRemove); 503 } 504 displayUnicodeSet(String value)505 private String displayUnicodeSet(String value) { 506 if (value.startsWith("[") && value.endsWith("]")) { 507 value = value.substring(1, value.length() - 1); 508 } 509 510 value = replace(NEEDS_QUOTE1, value, "$1\\\\$2$3"); 511 value = replace(NEEDS_QUOTE2, value, "$1\\\\$2$3"); 512 513 // if (RTL.containsSome(value) && value.startsWith("[") && value.endsWith("]")) { 514 // return "\u200E[\u200E" + value.substring(1,value.length()-2) + "\u200E]\u200E"; 515 // } 516 return value; 517 } 518 inputUnicodeSet(String path, String value)519 private String inputUnicodeSet(String path, String value) { 520 // clean up the user's input. 521 // first, fix up the '[' 522 value = value.trim(); 523 524 // remove brackets and trim again before regex 525 if (value.startsWith("[")) { 526 value = value.substring(1); 527 } 528 if (value.endsWith("]") && (!value.endsWith("\\]") || value.endsWith("\\\\]"))) { 529 value = value.substring(0, value.length() - 1); 530 } 531 value = value.trim(); 532 533 value = replace(NEEDS_QUOTE1, value, "$1\\\\$2$3"); 534 value = replace(NEEDS_QUOTE2, value, "$1\\\\$2$3"); 535 536 // re-add brackets. 
537 value = "[" + value + "]"; 538 539 UnicodeSet exemplar = new UnicodeSet(value); 540 XPathParts parts = XPathParts.getFrozenInstance(path); // new XPathParts().set(path); 541 if (parts.getElement(2).equals("parseLenients")) { 542 return exemplar.toPattern(false); 543 } 544 final String type = parts.getAttributeValue(-1, "type"); 545 ExemplarType exemplarType = type == null ? ExemplarType.main : ExemplarType.valueOf(type); 546 value = getCleanedUnicodeSet(exemplar, pp, exemplarType); 547 return value; 548 } 549 normalizeWhitespace(String path, String value)550 private String normalizeWhitespace(String path, String value) { 551 // turn all whitespace sequences (including tab and newline, and NBSP for certain paths) 552 // into a single space or a single NBSP depending on path. 553 if ((path.contains("/dateFormatLength") && path.contains("/pattern")) || 554 path.contains("/availableFormats/dateFormatItem") || 555 (path.startsWith("//ldml/dates/timeZoneNames/metazone") && path.contains("/long")) || 556 path.startsWith("//ldml/dates/timeZoneNames/regionFormat") || 557 path.startsWith("//ldml/localeDisplayNames/codePatterns/codePattern") || 558 path.startsWith("//ldml/localeDisplayNames/languages/language") || 559 path.startsWith("//ldml/localeDisplayNames/territories/territory") || 560 path.startsWith("//ldml/localeDisplayNames/types/type") || 561 (path.startsWith("//ldml/numbers/currencies/currency") && path.contains("/displayName")) || 562 (path.contains("/decimalFormatLength[@type=\"long\"]") && path.contains("/pattern")) || 563 path.startsWith("//ldml/posix/messages") || 564 (path.startsWith("//ldml/units/uni") && path.contains("/unitPattern "))) { 565 value = WHITESPACE_AND_NBSP_TO_NORMALIZE.matcher(value).replaceAll(" "); // replace with regular space 566 } else if ((path.contains("/currencies/currency") && (path.contains("/group") || path.contains("/pattern"))) 567 || 568 (path.contains("/currencyFormatLength") && path.contains("/pattern")) || 569 
(path.contains("/currencySpacing") && path.contains("/insertBetween")) || 570 (path.contains("/decimalFormatLength") && path.contains("/pattern")) || // i.e. the non-long ones 571 (path.contains("/percentFormatLength") && path.contains("/pattern")) || 572 (path.startsWith("//ldml/numbers/symbols") && (path.contains("/group") || path.contains("/nan")))) { 573 value = WHITESPACE_AND_NBSP_TO_NORMALIZE.matcher(value).replaceAll("\u00A0"); // replace with NBSP 574 } else { 575 // in this case don't normalize away NBSP 576 value = WHITESPACE_NO_NBSP_TO_NORMALIZE.matcher(value).replaceAll(" "); // replace with regular space 577 } 578 return value; 579 } 580 normalizeCurrencyDisplayName(String value)581 private String normalizeCurrencyDisplayName(String value) { 582 StringBuilder result = new StringBuilder(); 583 boolean inParentheses = false; 584 for (int i = 0; i < value.length(); i++) { 585 char c = value.charAt(i); 586 if (c == '(') { 587 inParentheses = true; 588 } else if (c == ')') { 589 inParentheses = false; 590 } 591 if (inParentheses && c == '-' && Character.isDigit(value.charAt(i - 1))) { 592 c = 0x2013; /* Replace hyphen-minus with dash for date ranges */ 593 } 594 result.append(c); 595 } 596 return result.toString(); 597 } 598 normalizeApostrophes(String value)599 private String normalizeApostrophes(String value) { 600 // If our DAIP always had a CLDRFile to work with, then we could just check the exemplar set in it to see. 601 // But since we don't, we just maintain the list internally and use it. 
602 if (LANGUAGES_USING_MODIFIER_APOSTROPHE.contains(locale.getLanguage())) { 603 return value.replace('\'', '\u02bc'); 604 } else { 605 char prev = 0; 606 StringBuilder builder = new StringBuilder(); 607 for (char c : value.toCharArray()) { 608 if (c == '\'') { 609 if (Character.isLetter(prev)) { 610 builder.append('\u2019'); 611 } else { 612 builder.append('\u2018'); 613 } 614 } else { 615 builder.append(c); 616 } 617 prev = c; 618 } 619 return builder.toString(); 620 } 621 } 622 normalizeIntervalHyphens(String value)623 private String normalizeIntervalHyphens(String value) { 624 DateTimePatternGenerator.FormatParser fp = new DateTimePatternGenerator.FormatParser(); 625 fp.set(DateIntervalInfo.genPatternInfo(value, false).getFirstPart()); 626 List<Object> items = fp.getItems(); 627 Object last = items.get(items.size() - 1); 628 if (last instanceof String) { 629 String separator = last.toString(); 630 if (separator.contains("-")) { 631 StringBuilder sb = new StringBuilder(); 632 sb.append(DateIntervalInfo.genPatternInfo(value, false).getFirstPart()); 633 if (sb.lastIndexOf(separator) >= 0) { 634 sb.delete(sb.lastIndexOf(separator), sb.length()); 635 sb.append(separator.replace("-", "\u2013")); 636 sb.append(DateIntervalInfo.genPatternInfo(value, false).getSecondPart()); 637 return sb.toString(); 638 } 639 } 640 } 641 return value; 642 } 643 normalizeHyphens(String value)644 private String normalizeHyphens(String value) { 645 int hyphenLocation = value.indexOf("-"); 646 if (hyphenLocation > 0 && 647 Character.isDigit(value.charAt(hyphenLocation - 1)) && 648 hyphenLocation < value.length() - 1 && 649 Character.isDigit(value.charAt(hyphenLocation + 1))) { 650 StringBuilder sb = new StringBuilder(); 651 sb.append(value.substring(0, hyphenLocation)); 652 sb.append("\u2013"); 653 sb.append(value.substring(hyphenLocation + 1)); 654 return sb.toString(); 655 } 656 return value; 657 } 658 standardizeRomanian(String value)659 private String standardizeRomanian(String value) 
{ 660 StringBuilder builder = new StringBuilder(); 661 for (char c : value.toCharArray()) { 662 for (char[] pair : ROMANIAN_CONVERSIONS) { 663 if (c == pair[0]) { 664 c = pair[1]; 665 break; 666 } 667 } 668 builder.append(c); 669 } 670 return builder.toString(); 671 } 672 standardizeKwasio(String value)673 private String standardizeKwasio(String value) { 674 StringBuilder builder = new StringBuilder(); 675 for (char c : value.toCharArray()) { 676 for (char[] pair : KWASIO_CONVERSIONS) { 677 if (c == pair[0]) { 678 c = pair[1]; 679 break; 680 } 681 } 682 builder.append(c); 683 } 684 return builder.toString(); 685 } 686 687 // Use the myanmar-tools detector. standardizeMyanmar(String value)688 private String standardizeMyanmar(String value) { 689 if (detector.getZawgyiProbability(value) > 0.90) { 690 return zawgyiUnicodeTransliterator.transform(value); 691 } 692 return value; 693 } 694 standardizeNgomba(String value)695 private String standardizeNgomba(String value) { 696 StringBuilder builder = new StringBuilder(); 697 char[] charArray = value.toCharArray(); 698 for (int i = 0; i < charArray.length; i++) { 699 char c = charArray[i]; 700 boolean convertedSaltillo = false; 701 for (char[] pair : NGOMBA_CONVERSIONS) { 702 if (c == pair[0]) { 703 c = pair[1]; 704 if (c == '\uA78C') { 705 convertedSaltillo = true; 706 } 707 break; 708 } 709 } 710 if (convertedSaltillo && 711 ((i > 0 && i < charArray.length - 1 && Character.isUpperCase(charArray[i - 1]) && Character.isUpperCase(charArray[i + 1])) || 712 (i > 1 && Character.isUpperCase(charArray[i - 1]) && Character.isUpperCase(charArray[i - 2])))) { 713 c = '\uA78B'; // UPPER CASE SALTILLO 714 } 715 builder.append(c); 716 } 717 return builder.toString(); 718 } 719 replaceChars(String path, String value, char[][] charsToReplace, boolean skipAuxExemplars)720 private String replaceChars(String path, String value, char[][] charsToReplace, boolean skipAuxExemplars) { 721 if (skipAuxExemplars && 
path.contains("/exemplarCharacters[@type=\"auxiliary\"]")) { 722 return value; 723 } 724 StringBuilder builder = new StringBuilder(); 725 for (char c : value.toCharArray()) { 726 for (char[] pair : charsToReplace) { 727 if (c == pair[0]) { 728 c = pair[1]; 729 break; 730 } 731 } 732 builder.append(c); 733 } 734 return builder.toString(); 735 } 736 standardizeSwissGerman(String value)737 private String standardizeSwissGerman(String value) { 738 return value.replaceAll("\u00DF", "ss"); 739 } 740 standardizeCatalan(String value)741 private String standardizeCatalan(String value) { 742 StringBuilder builder = new StringBuilder(); 743 for (char c : value.toCharArray()) { 744 boolean didSubstitute = false; 745 for (char[] triple : CATALAN_CONVERSIONS) { 746 if (c == triple[0]) { 747 builder.append(triple[1]); 748 builder.append(triple[2]); 749 didSubstitute = true; 750 break; 751 } 752 } 753 if (!didSubstitute) { 754 builder.append(c); 755 } 756 } 757 return builder.toString(); 758 } 759 replace(Pattern pattern, String value, String replacement)760 private String replace(Pattern pattern, String value, String replacement) { 761 String value2 = pattern.matcher(value).replaceAll(replacement); 762 if (DEBUG_DAIP && !value.equals(value2)) { 763 System.out.println("\n" + value + " => " + value2); 764 } 765 return value2; 766 } 767 768 private static Pattern UNNORMALIZED_MALAYALAM = PatternCache.get( 769 "(\u0D23|\u0D28|\u0D30|\u0D32|\u0D33|\u0D15)\u0D4D\u200D"); 770 771 private static Map<Character, Character> NORMALIZING_MAP = Builder.with(new HashMap<Character, Character>()) 772 .put('\u0D23', '\u0D7A').put('\u0D28', '\u0D7B') 773 .put('\u0D30', '\u0D7C').put('\u0D32', '\u0D7D') 774 .put('\u0D33', '\u0D7E').put('\u0D15', '\u0D7F').get(); 775 776 /** 777 * Normalizes the Malayalam characters in the specified input. 
778 * 779 * @param value 780 * the input to be normalized 781 * @return 782 */ normalizeMalayalam(String value)783 private String normalizeMalayalam(String value) { 784 // Normalize Malayalam characters. 785 Matcher matcher = UNNORMALIZED_MALAYALAM.matcher(value); 786 if (matcher.find()) { 787 StringBuffer buffer = new StringBuffer(); 788 int start = 0; 789 do { 790 buffer.append(value.substring(start, matcher.start(0))); 791 char codePoint = matcher.group(1).charAt(0); 792 buffer.append(NORMALIZING_MAP.get(codePoint)); 793 start = matcher.end(0); 794 } while (matcher.find()); 795 buffer.append(value.substring(start)); 796 value = buffer.toString(); 797 } 798 return value; 799 } 800 801 static final Transform<String, String> fixArabicPresentation = Transliterator.getInstance( 802 "[[:block=Arabic_Presentation_Forms_A:][:block=Arabic_Presentation_Forms_B:]] nfkc"); 803 804 /** 805 * Normalizes the Arabic presentation forms characters in the specified input. 806 * 807 * @param value 808 * the input to be normalized 809 * @return 810 */ replaceArabicPresentationForms(String value)811 private String replaceArabicPresentationForms(String value) { 812 value = fixArabicPresentation.transform(value); 813 return value; 814 } 815 816 static Pattern REMOVE_QUOTE1 = PatternCache.get("(\\s)(\\\\[-\\}\\]\\&])()"); 817 static Pattern REMOVE_QUOTE2 = PatternCache.get("(\\\\[\\-\\{\\[\\&])(\\s)"); // ([^\\])([\\-\\{\\[])(\\s) 818 819 static Pattern NEEDS_QUOTE1 = PatternCache.get("(\\s|$)([-\\}\\]\\&])()"); 820 static Pattern NEEDS_QUOTE2 = PatternCache.get("([^\\\\])([\\-\\{\\[\\&])(\\s)"); // ([^\\])([\\-\\{\\[])(\\s) 821 getCleanedUnicodeSet(UnicodeSet exemplar, UnicodeSetPrettyPrinter prettyPrinter, ExemplarType exemplarType)822 public static String getCleanedUnicodeSet(UnicodeSet exemplar, UnicodeSetPrettyPrinter prettyPrinter, 823 ExemplarType exemplarType) { 824 if (prettyPrinter == null) { 825 return exemplar.toPattern(false); 826 } 827 String value; 828 
prettyPrinter.setCompressRanges(exemplar.size() > 300); 829 value = exemplar.toPattern(false); 830 UnicodeSet toAdd = new UnicodeSet(); 831 832 for (UnicodeSetIterator usi = new UnicodeSetIterator(exemplar); usi.next();) { 833 String string = usi.getString(); 834 if (string.equals("ß") || string.equals("İ")) { 835 toAdd.add(string); 836 continue; 837 } 838 if (exemplarType.convertUppercase) { 839 string = UCharacter.toLowerCase(ULocale.ENGLISH, string); 840 } 841 toAdd.add(string); 842 String composed = Normalizer.compose(string, false); 843 if (!string.equals(composed)) { 844 toAdd.add(composed); 845 } 846 } 847 848 toAdd.removeAll(exemplarType.toRemove); 849 850 if (DEBUG_DAIP && !toAdd.equals(exemplar)) { 851 UnicodeSet oldOnly = new UnicodeSet(exemplar).removeAll(toAdd); 852 UnicodeSet newOnly = new UnicodeSet(toAdd).removeAll(exemplar); 853 System.out.println("Exemplar:\t" + exemplarType + ",\tremoved\t" + oldOnly + ",\tadded\t" + newOnly); 854 } 855 856 String fixedExemplar = prettyPrinter.format(toAdd); 857 UnicodeSet doubleCheck = new UnicodeSet(fixedExemplar); 858 if (!toAdd.equals(doubleCheck)) { 859 // something went wrong, leave as is 860 } else if (!value.equals(fixedExemplar)) { // put in this condition just for debugging 861 if (DEBUG_DAIP) { 862 System.out.println(TestMetadata.showDifference( 863 With.codePoints(value), 864 With.codePoints(fixedExemplar), 865 "\n")); 866 } 867 value = fixedExemplar; 868 } 869 return value; 870 } 871 872 /** 873 * @return a canonical numeric pattern, based on the type, and the isPOSIX flag. The latter is set for en_US_POSIX. 
874 */ 875 static final Splitter SEMI_SPLITTER = Splitter.on(';').trimResults(); 876 getCanonicalPattern(String inpattern, NumericType type, boolean isPOSIX)877 public static String getCanonicalPattern(String inpattern, NumericType type, boolean isPOSIX) { 878 // TODO fix later to properly handle quoted ; 879 880 DecimalFormat df = new DecimalFormat(inpattern); 881 if (type == NumericType.DECIMAL_ABBREVIATED || type == NumericType.CURRENCY_ABBREVIATED 882 || CldrUtility.INHERITANCE_MARKER.equals(inpattern)) { 883 return inpattern; // TODO fix when ICU bug is fixed 884 // df.setMaximumFractionDigits(df.getMinimumFractionDigits()); 885 // df.setMaximumIntegerDigits(Math.max(1, df.getMinimumIntegerDigits())); 886 } else { 887 // int decimals = type == CURRENCY_TYPE ? 2 : 1; 888 int[] digits = isPOSIX ? type.posixDigitCount : type.digitCount; 889 df.setMinimumIntegerDigits(digits[0]); 890 df.setMinimumFractionDigits(digits[1]); 891 df.setMaximumFractionDigits(digits[2]); 892 } 893 String pattern = df.toPattern(); 894 List<String> parts = SEMI_SPLITTER.splitToList(pattern); 895 String pattern2 = parts.get(0); 896 if (parts.size() > 1) { 897 pattern2 += ";" + parts.get(1); 898 } 899 if (!pattern2.equals(pattern)) { 900 pattern = pattern2; 901 } 902 // int pos = pattern.indexOf(';'); 903 // if (pos < 0) return pattern + ";-" + pattern; 904 return pattern; 905 } 906 907 /* 908 * This tests what type a numeric pattern is. 
/**
 * Classifies an xpath by the kind of numeric pattern it holds and carries the digit counts
 * used when canonicalizing such a pattern.
 * <p>
 * Each digit-count array has three entries: minimum integer digits, minimum fraction digits
 * and maximum fraction digits. The abbreviated types and {@code NOT_NUMERIC} have no digit
 * counts (both arrays are {@code null}).
 */
public enum NumericType {
    CURRENCY(new int[] { 1, 2, 2 }, new int[] { 1, 2, 2 }),
    CURRENCY_ABBREVIATED(),
    DECIMAL(new int[] { 1, 0, 3 }, new int[] { 1, 0, 6 }),
    DECIMAL_ABBREVIATED(),
    PERCENT(new int[] { 1, 0, 0 }, new int[] { 1, 0, 0 }),
    SCIENTIFIC(new int[] { 0, 0, 0 }, new int[] { 1, 6, 6 }),
    NOT_NUMERIC;

    /** Group 1 is the whole path segment; group 2 is the format family, when there is one. */
    private static final Pattern NUMBER_PATH = Pattern
        .compile("//ldml/numbers/((currency|decimal|percent|scientific)Formats|currencies/currency).*");

    private final int[] digitCount;
    private final int[] posixDigitCount;

    /** Creates a type without digit counts. */
    NumericType() {
        this(null, null);
    }

    /**
     * @param digitCount
     *            regular digit counts
     * @param posixDigitCount
     *            digit counts used for the POSIX variant
     */
    NumericType(int[] digitCount, int[] posixDigitCount) {
        this.digitCount = digitCount;
        this.posixDigitCount = posixDigitCount;
    }

    /**
     * Determines the numeric type of an xpath.
     *
     * @param xpath
     *            the path to classify
     * @return {@code NOT_NUMERIC} when the path does not contain "/pattern" or is not a
     *         numbers format/currency path; {@code CURRENCY} for currencies/currency paths;
     *         otherwise the type named by the format family, switched to its abbreviated
     *         variant when the path contains {@code ="1000}
     * @throws IllegalArgumentException
     *             if the path contains {@code ="1000} but the family is neither decimal nor
     *             currency
     */
    public static NumericType getNumericType(String xpath) {
        // Cheap substring test first, so the regex only runs on candidate paths.
        if (!xpath.contains("/pattern")) {
            return NOT_NUMERIC;
        }
        final Matcher matcher = NUMBER_PATH.matcher(xpath);
        if (!matcher.matches()) {
            return NOT_NUMERIC;
        }
        if (matcher.group(1).equals("currencies/currency")) {
            return CURRENCY;
        }
        // Locale.ROOT keeps the constant lookup independent of the default locale; with a
        // Turkish default, "decimal" would otherwise upper-case with a dotted capital I and
        // valueOf would fail.
        final NumericType type = NumericType.valueOf(matcher.group(2).toUpperCase(java.util.Locale.ROOT));
        if (!xpath.contains("=\"1000")) {
            return type;
        }
        if (type == DECIMAL) {
            return DECIMAL_ABBREVIATED;
        }
        if (type == CURRENCY) {
            return CURRENCY_ABBREVIATED;
        }
        throw new IllegalArgumentException("Internal Error");
    }

    /**
     * @return a copy of the regular digit counts, or {@code null} when this type has none
     */
    public int[] getDigitCount() {
        // Copy, so callers cannot alter the array shared by this enum constant.
        return digitCount == null ? null : digitCount.clone();
    }

    /**
     * @return a copy of the POSIX digit counts, or {@code null} when this type has none
     */
    public int[] getPosixDigitCount() {
        return posixDigitCount == null ? null : posixDigitCount.clone();
    }
}