1 /* Copyright (C) 2007-2013 Google and others. All Rights Reserved. */ 2 /* Copyright (C) 2007-2013 IBM Corp. and others. All Rights Reserved. */ 3 4 package org.unicode.cldr.test; 5 6 import java.util.Arrays; 7 import java.util.HashMap; 8 import java.util.HashSet; 9 import java.util.List; 10 import java.util.Map; 11 import java.util.Set; 12 import java.util.TreeSet; 13 import java.util.regex.Matcher; 14 import java.util.regex.Pattern; 15 16 import org.unicode.cldr.test.CheckExemplars.ExemplarType; 17 import org.unicode.cldr.util.AnnotationUtil; 18 import org.unicode.cldr.util.Builder; 19 import org.unicode.cldr.util.CLDRConfig; 20 import org.unicode.cldr.util.CLDRFile; 21 import org.unicode.cldr.util.CLDRLocale; 22 import org.unicode.cldr.util.CldrUtility; 23 import org.unicode.cldr.util.DateTimeCanonicalizer; 24 import org.unicode.cldr.util.DateTimeCanonicalizer.DateTimePatternType; 25 import org.unicode.cldr.util.Emoji; 26 import org.unicode.cldr.util.ICUServiceBuilder; 27 import org.unicode.cldr.util.PatternCache; 28 import org.unicode.cldr.util.SupplementalDataInfo; 29 import org.unicode.cldr.util.UnicodeSetPrettyPrinter; 30 import org.unicode.cldr.util.With; 31 import org.unicode.cldr.util.XPathParts; 32 33 import com.google.common.base.Joiner; 34 import com.google.common.base.Splitter; 35 import com.google.myanmartools.ZawgyiDetector; 36 import com.ibm.icu.lang.UCharacter; 37 import com.ibm.icu.text.Collator; 38 import com.ibm.icu.text.DateIntervalInfo; 39 import com.ibm.icu.text.DateTimePatternGenerator; 40 import com.ibm.icu.text.DecimalFormat; 41 import com.ibm.icu.text.Normalizer; 42 import com.ibm.icu.text.RuleBasedCollator; 43 import com.ibm.icu.text.Transform; 44 import com.ibm.icu.text.Transliterator; 45 import com.ibm.icu.text.UnicodeSet; 46 import com.ibm.icu.text.UnicodeSetIterator; 47 import com.ibm.icu.util.ULocale; 48 49 /** 50 * Class for processing the input and output of CLDR data for use in the 51 * Survey Tool and other tools. 52 */ 53 public class DisplayAndInputProcessor { 54 55 private static final boolean FIX_YEARS = true; 56 57 public static final boolean DEBUG_DAIP = CldrUtility.getProperty("DEBUG_DAIP", false); 58 59 public static final UnicodeSet RTL = new UnicodeSet("[[:Bidi_Class=Arabic_Letter:][:Bidi_Class=Right_To_Left:]]") 60 .freeze(); 61 62 public static final UnicodeSet TO_QUOTE = new UnicodeSet( 63 "[[:Cn:]" + 64 "[:Default_Ignorable_Code_Point:]" + 65 "[:patternwhitespace:]" + 66 "[:Me:][:Mn:]]" // add non-spacing marks 67 ).freeze(); 68 69 public static final Pattern NUMBER_FORMAT_XPATH = Pattern 70 .compile("//ldml/numbers/.*Format\\[@type=\"standard\"]/pattern.*"); 71 72 public static final Pattern NUMBER_SEPARATOR_PATTERN = Pattern 73 .compile("//ldml/numbers/symbols.*/(decimal|group)"); 74 75 private static final Pattern APOSTROPHE_SKIP_PATHS = PatternCache.get("//ldml/(" 76 + "localeDisplayNames/languages/language\\[@type=\"mic\"].*|" 77 + "characters/.*|" 78 + "delimiters/.*|" 79 + "dates/.+/(pattern|intervalFormatItem|dateFormatItem).*|" 80 + "units/.+/unitPattern.*|" 81 + "units/.+/durationUnitPattern.*|" 82 + "numbers/symbols.*|" 83 + "numbers/miscPatterns.*|" 84 + "numbers/(decimal|currency|percent|scientific)Formats.+/(decimal|currency|percent|scientific)Format.*)"); 85 private static final Pattern INTERVAL_FORMAT_PATHS = PatternCache.get("//ldml/dates/.+/intervalFormat(Item.*|Fallback)"); 86 private static final Pattern NON_DECIMAL_PERIOD = PatternCache.get("(?<![0#'])\\.(?![0#'])"); 87 88 // Pattern to match against paths that might have time formats with h or K (12-hour cycles) 89 private static final Pattern HOUR_FORMAT_XPATHS = PatternCache 90 .get("//ldml/dates/calendars/calendar\\[@type=\"[^\"]*\"]/(" 91 + "timeFormats/timeFormatLength\\[@type=\"[^\"]*\"]/timeFormat\\[@type=\"standard\"]/pattern\\[@type=\"standard\"].*|" 92 + "dateTimeFormats/availableFormats/dateFormatItem\\[@id=\"[A-GL-Ma-gl-m]*[hK][A-Za-z]*\"].*|" 93 + "dateTimeFormats/intervalFormats/intervalFormatItem\\[@id=\"[A-GL-Ma-gl-m]*[hK][A-Za-z]*\"].*)"); 94 95 private static final Pattern AMPM_SPACE_BEFORE = PatternCache.get("([Khms])([ \\u00A0]+)(a+)"); // time, space, a+ 96 private static final Pattern AMPM_SPACE_AFTER = PatternCache.get("(a+)([ \\u00A0]+)([Kh])"); // a+, space, hour 97 98 // Pattern to match against paths that might have date formats with y 99 private static final Pattern YEAR_FORMAT_XPATHS = PatternCache 100 .get("//ldml/dates/calendars/calendar\\[@type=\"[^\"]*\"]/(" 101 + "dateFormats/dateFormatLength\\[@type=\"[^\"]*\"]/dateFormat\\[@type=\"standard\"]/pattern\\[@type=\"standard\"].*|" 102 + "dateTimeFormats/availableFormats/dateFormatItem\\[@id=\"[A-XZa-xz]*y[A-Za-z]*\"].*|" 103 + "dateTimeFormats/intervalFormats/intervalFormatItem\\[@id=\"[A-XZa-xz]*y[A-Za-z]*\"].*)"); 104 105 // Cyrillic year markers are or begin with (in various languages) \u0430 \u0433 \u0435 \u0436 \u043E \u0440 \u0441 106 private static final Pattern YEAR_SPACE_YEARMARKER = PatternCache.get("y[ \\u00A0]+('?[агежорс])"); // y, space, Cyrillic year marker start 107 108 public static final Pattern UNIT_NARROW_XPATHS = PatternCache 109 .get("//ldml/units/unitLength\\[@type=\"narrow\"]unit\\[@type=\"[^\"]*\"]/unitPattern.*"); 110 111 public static final Pattern UNIT_SHORT_XPATHS = PatternCache 112 .get("//ldml/units/unitLength\\[@type=\"short\"]unit\\[@type=\"[^\"]*\"]/unitPattern.*"); 113 114 private static final Pattern PLACEHOLDER_SPACE_AFTER = PatternCache.get("\\}[ \\u00A0\\u202F]+"); 115 private static final Pattern PLACEHOLDER_SPACE_BEFORE = PatternCache.get("[ \\u00A0\\u202F]+\\{"); 116 private static final Pattern INTERVAL_FALLBACK_RANGE = PatternCache.get("\\} [\\u2013-] \\{"); 117 118 /** 119 * string of whitespace not including NBSP, i.e. [\t\n\r]+ 120 */ 121 private static final Pattern WHITESPACE_NO_NBSP_TO_NORMALIZE = PatternCache.get("\\s+"); // 122 123 /** 124 * string of whitespace including NBSP, i.e. [\u00A0\t\n\r]+ 125 */ 126 private static final Pattern WHITESPACE_AND_NBSP_TO_NORMALIZE = PatternCache.get("[\\s\\u00A0]+"); 127 128 /** 129 * one or more NBSP followed by one or more regular spaces 130 */ 131 private static final Pattern NBSP_PLUS_SPACE_TO_NORMALIZE = PatternCache.get("\\u00A0+\\u0020+"); 132 133 /** 134 * one or more regular spaces followed by one or more NBSP 135 */ 136 private static final Pattern SPACE_PLUS_NBSP_TO_NORMALIZE = PatternCache.get("\\u0020+\\u00A0+"); 137 138 private static final Pattern INITIAL_NBSP = PatternCache.get("^[\\u00A0\\u202F]+"); 139 private static final Pattern FINAL_NBSP = PatternCache.get("[\\u00A0\\u202F]+$"); 140 private static final Pattern MULTIPLE_NBSP = PatternCache.get("\\u00A0\\u00A0+"); 141 142 // The following includes (among others) \u0009, \u0020, \u00A0, \u2007, \u2009, \u202F, \u3000 143 private static final UnicodeSet UNICODE_WHITESPACE = new UnicodeSet("[:whitespace:]").freeze(); 144 145 private static final CLDRLocale MALAYALAM = CLDRLocale.getInstance("ml"); 146 private static final CLDRLocale ROMANIAN = CLDRLocale.getInstance("ro"); 147 private static final CLDRLocale CATALAN = CLDRLocale.getInstance("ca"); 148 private static final CLDRLocale NGOMBA = CLDRLocale.getInstance("jgo"); 149 private static final CLDRLocale KWASIO = CLDRLocale.getInstance("nmg"); 150 private static final CLDRLocale HEBREW = CLDRLocale.getInstance("he"); 151 private static final CLDRLocale MYANMAR = CLDRLocale.getInstance("my"); 152 private static final CLDRLocale KYRGYZ = CLDRLocale.getInstance("ky"); 153 private static final CLDRLocale URDU = CLDRLocale.getInstance("ur"); 154 private static final CLDRLocale PASHTO = CLDRLocale.getInstance("ps"); 155 private static final CLDRLocale FARSI = CLDRLocale.getInstance("fa"); 156 private static final CLDRLocale GERMAN_SWITZERLAND = CLDRLocale.getInstance("de_CH"); 157 private static final CLDRLocale SWISS_GERMAN = CLDRLocale.getInstance("gsw"); 158 private static final CLDRLocale FF_ADLAM = CLDRLocale.getInstance("ff_Adlm"); 159 public static final Set<String> LANGUAGES_USING_MODIFIER_APOSTROPHE = new HashSet<>( 160 Arrays.asList("br", "bss", "cad", "cic", "cch", "gn", "ha", "ha_Latn", "lkt", "mgo", "moh", "mus", "nnh", "qu", "quc", "uk", "uz", "uz_Latn")); 161 162 // Ş ş Ţ ţ => Ș ș Ț ț 163 private static final char[][] ROMANIAN_CONVERSIONS = { 164 { '\u015E', '\u0218' }, { '\u015F', '\u0219' }, { '\u0162', '\u021A' }, 165 { '\u0163', '\u021B' } }; 166 167 private static final char[][] CATALAN_CONVERSIONS = { 168 { '\u013F', '\u004C', '\u00B7' }, // Ŀ -> L· 169 { '\u0140', '\u006C', '\u00B7' } }; // ŀ -> l· 170 171 private static final char[][] NGOMBA_CONVERSIONS = { 172 { '\u0251', '\u0061' }, { '\u0261', '\u0067' }, // ɑ -> a , ɡ -> g , See ticket #5691 173 { '\u2019', '\uA78C' }, { '\u02BC', '\uA78C' } }; // Saltillo, see ticket #6805 174 175 private static final char[][] KWASIO_CONVERSIONS = { 176 { '\u0306', '\u030C' }, // See ticket #6571, use caron instead of breve 177 { '\u0103', '\u01CE' }, { '\u0102', '\u01CD' }, // a-breve -> a-caron 178 { '\u0115', '\u011B' }, { '\u011A', '\u01CD' }, // e-breve -> e-caron 179 { '\u012D', '\u01D0' }, { '\u012C', '\u01CF' }, // i-breve -> i-caron 180 { '\u014F', '\u01D2' }, { '\u014E', '\u01D1' }, // o-breve -> o-caron 181 { '\u016D', '\u01D4' }, { '\u016C', '\u01D3' } // u-breve -> u-caron 182 }; 183 184 private static final char[][] HEBREW_CONVERSIONS = { 185 { '\'', '\u05F3' }, { '"', '\u05F4' } }; // ' -> geresh " -> gershayim 186 187 private static final char[][] KYRGYZ_CONVERSIONS = { 188 { 'ӊ', 'ң' }, { 'Ӊ', 'Ң' } }; // right modifier 189 190 private static final char[][] URDU_PLUS_CONVERSIONS = { 191 { '\u0643', '\u06A9' }}; // wrong char 192 193 private static final ZawgyiDetector detector = new ZawgyiDetector(); 194 private static final Transliterator zawgyiUnicodeTransliterator = 195 Transliterator.getInstance("Zawgyi-my"); 196 197 private Collator col; 198 199 private Collator spaceCol; 200 201 private UnicodeSetPrettyPrinter pp = null; 202 203 final private CLDRLocale locale; 204 private String scriptCode; // actual or default script code (not null after init) 205 private boolean isPosix; 206 207 /** 208 * Constructor, taking cldrFile. 209 * 210 * @param cldrFileToCheck 211 */ DisplayAndInputProcessor(CLDRFile cldrFileToCheck, boolean needsCollator)212 public DisplayAndInputProcessor(CLDRFile cldrFileToCheck, boolean needsCollator) { 213 init(this.locale = CLDRLocale.getInstance(cldrFileToCheck.getLocaleID()), needsCollator); 214 } 215 DisplayAndInputProcessor(CLDRFile cldrFileToCheck)216 public DisplayAndInputProcessor(CLDRFile cldrFileToCheck) { 217 init(this.locale = CLDRLocale.getInstance(cldrFileToCheck.getLocaleID()), true); 218 } 219 init(CLDRLocale locale, boolean needsCollator)220 void init(CLDRLocale locale, boolean needsCollator) { 221 isPosix = locale.toString().indexOf("POSIX") >= 0; 222 if (needsCollator) { 223 ICUServiceBuilder isb = null; 224 try { 225 isb = ICUServiceBuilder.forLocale(locale); 226 } catch (Exception e) { 227 } 228 229 if (isb != null) { 230 try { 231 col = isb.getRuleBasedCollator(); 232 } catch (Exception e) { 233 col = Collator.getInstance(ULocale.ROOT); 234 } 235 } else { 236 col = Collator.getInstance(ULocale.ROOT); 237 } 238 239 spaceCol = Collator.getInstance(locale.toULocale()); 240 if (spaceCol instanceof RuleBasedCollator) { 241 ((RuleBasedCollator) spaceCol).setAlternateHandlingShifted(false); 242 } 243 pp = new UnicodeSetPrettyPrinter().setOrdering(Collator.getInstance(ULocale.ROOT)) 244 .setSpaceComparator(Collator.getInstance(ULocale.ROOT).setStrength2(Collator.PRIMARY)) 245 .setCompressRanges(true) 246 .setToQuote(new UnicodeSet(TO_QUOTE)) 247 .setOrdering(col) 248 .setSpaceComparator(spaceCol); 249 } 250 String script = locale.getScript(); 251 if (script == null || script.length() < 4) { 252 SupplementalDataInfo sdi = CLDRConfig.getInstance().getSupplementalDataInfo(); 253 script = sdi.getDefaultScript(locale.getBaseName()); 254 if (script == null || script.length() < 4 || script.equals("Zzzz")) { 255 script = sdi.getDefaultScript(locale.getLanguage()); 256 } 257 if (script == null || script.length() < 4) { 258 script = "Zzzz"; 259 } 260 } 261 scriptCode = script; 262 } 263 getPrettyPrinter()264 public UnicodeSetPrettyPrinter getPrettyPrinter() { 265 return pp; 266 } 267 268 /** 269 * Constructor, taking ULocale and boolean. 270 * 271 * @param locale the ULocale 272 * @param needsCollator true or false 273 * 274 * Called by getProcessor, with locale = SurveyMain.TRANS_HINT_LOCALE 275 */ DisplayAndInputProcessor(ULocale locale, boolean needsCollator)276 public DisplayAndInputProcessor(ULocale locale, boolean needsCollator) { 277 init(this.locale = CLDRLocale.getInstance(locale), needsCollator); 278 } 279 280 /** 281 * Constructor, taking ULocale. 282 * 283 * @param locale the ULocale 284 */ DisplayAndInputProcessor(ULocale locale)285 public DisplayAndInputProcessor(ULocale locale) { 286 init(this.locale = CLDRLocale.getInstance(locale), true /* needsCollator */); 287 } 288 289 /** 290 * Constructor, taking CLDRLocale and boolean. 291 * 292 * @param locale the CLDRLocale 293 * @param needsCollator true or false 294 */ DisplayAndInputProcessor(CLDRLocale locale, boolean needsCollator)295 public DisplayAndInputProcessor(CLDRLocale locale, boolean needsCollator) { 296 init(this.locale = locale, needsCollator); 297 } 298 299 /** 300 * Constructor, taking locale. 301 * 302 * @param locale 303 */ DisplayAndInputProcessor(CLDRLocale locale)304 public DisplayAndInputProcessor(CLDRLocale locale) { 305 init(this.locale = locale, true); 306 } 307 308 /** 309 * Process the value for display. The result is a string for display in the 310 * Survey tool or similar program. 311 * 312 * @param path 313 * @param value 314 * @return 315 */ processForDisplay(String path, String value)316 public synchronized String processForDisplay(String path, String value) { 317 value = Normalizer.compose(value, false); // Always normalize all text to NFC. 318 if (hasUnicodeSetValue(path)) { 319 value = displayUnicodeSet(value); 320 } else if (path.contains("stopword")) { 321 return value.trim().isEmpty() ? "NONE" : value; 322 } else { 323 NumericType numericType = NumericType.getNumericType(path); 324 if (numericType != NumericType.NOT_NUMERIC) { 325 // Canonicalize existing values that aren't canonicalized yet. 326 // New values will be canonicalized on input using processInput(). 327 try { 328 value = getCanonicalPattern(value, numericType, isPosix); 329 } catch (IllegalArgumentException e) { 330 if (DEBUG_DAIP) System.err.println("Illegal pattern: " + value); 331 } 332 if (numericType != NumericType.CURRENCY && numericType != NumericType.CURRENCY_ABBREVIATED) { 333 value = value.replace("'", ""); 334 } 335 } 336 } 337 // Fix up any apostrophes in number symbols 338 if (NUMBER_SEPARATOR_PATTERN.matcher(path).matches()) { 339 value = value.replace('\'', '\u2019'); 340 } 341 // Fix up any apostrophes as appropriate (Don't do so for things like date patterns... 342 if (!APOSTROPHE_SKIP_PATHS.matcher(path).matches()) { 343 value = normalizeApostrophes(value); 344 } 345 // Fix up hyphens, replacing with N-dash as appropriate 346 if (INTERVAL_FORMAT_PATHS.matcher(path).matches()) { 347 value = normalizeIntervalHyphensAndSpaces(value); // This may also adjust spaces around en dash 348 } else { 349 value = normalizeHyphens(value); 350 } 351 return value; 352 } 353 hasUnicodeSetValue(String path)354 private boolean hasUnicodeSetValue(String path) { 355 return path.startsWith("//ldml/characters/exemplarCharacters") || path.startsWith("//ldml/characters/parseLenients"); 356 } 357 358 static final UnicodeSet WHITESPACE = new UnicodeSet("[:whitespace:]").freeze(); 359 static final DateTimeCanonicalizer dtc = new DateTimeCanonicalizer(FIX_YEARS); 360 361 private static final String BAR_VL = "\\|"; // U+007C VERTICAL LINE (pipe, bar) literal 362 private static final String BAR_EL = "\\s+l\\s+"; // U+006C LATIN SMALL LETTER L with space 363 private static final String BAR_DANDA = "।"; // U+0964 DEVANAGARI DANDA 364 private static final String BAR_REGEX = "(" + BAR_VL + "|" + BAR_EL + "|" + BAR_DANDA + ")"; 365 public static final Splitter SPLIT_BAR = Splitter.on(Pattern.compile(BAR_REGEX)).trimResults().omitEmptyStrings(); 366 static final Splitter SPLIT_SPACE = Splitter.on(' ').trimResults().omitEmptyStrings(); 367 static final Joiner JOIN_BAR = Joiner.on(" | "); 368 static final Joiner JOIN_SPACE = Joiner.on(' '); 369 370 /** 371 * Process the value for input. The result is a cleaned-up value. For example, 372 * an exemplar set is modified to be in the normal format, and any missing [ ] 373 * are added (a common omission on entry). If there are any failures then the 374 * original value is returned, so that the proper error message can be given. 375 * 376 * @param path 377 * @param value 378 * @param internalException 379 * @return 380 */ processInput(String path, String value, Exception[] internalException)381 public synchronized String processInput(String path, String value, Exception[] internalException) { 382 String original = value; 383 value = stripProblematicControlCharacters(value); 384 value = Normalizer.compose(value, false); // Always normalize all input to NFC. 385 value = value.replace('\u00B5', '\u03BC'); // use the right Greek mu character 386 387 if (internalException != null) { 388 internalException[0] = null; 389 } 390 // skip processing for inheritance marker 391 if (CldrUtility.INHERITANCE_MARKER.equals(value)) { 392 return value; // Reference: https://unicode.org/cldr/trac/ticket/11261 393 } 394 // for root annotations 395 if (CLDRLocale.ROOT.equals(locale) && path.contains("/annotations")) { 396 return value; // Reference: https://unicode.org/cldr/trac/ticket/11261 397 } 398 399 try { 400 // Normalise Malayalam characters. 401 boolean isUnicodeSet = hasUnicodeSetValue(path); 402 if (locale.childOf(MALAYALAM)) { 403 String newvalue = normalizeMalayalam(value); 404 if (DEBUG_DAIP) System.out.println("DAIP: Normalized Malayalam '" + value + "' to '" + newvalue + "'"); 405 value = newvalue; 406 } else if (locale.childOf(ROMANIAN) && !isUnicodeSet) { 407 value = standardizeRomanian(value); 408 } else if (locale.childOf(CATALAN) && !isUnicodeSet) { 409 value = standardizeCatalan(value); 410 } else if (locale.childOf(NGOMBA) && !isUnicodeSet) { 411 value = standardizeNgomba(value); 412 } else if (locale.childOf(KWASIO) && !isUnicodeSet) { 413 value = standardizeKwasio(value); 414 } else if (locale.childOf(HEBREW) && !APOSTROPHE_SKIP_PATHS.matcher(path).matches()) { 415 value = replaceChars(path, value, HEBREW_CONVERSIONS, false); 416 } else if ((locale.childOf(SWISS_GERMAN) || locale.childOf(GERMAN_SWITZERLAND)) && !isUnicodeSet) { 417 value = standardizeSwissGerman(value); 418 } else if (locale.childOf(MYANMAR) && !isUnicodeSet) { 419 value = standardizeMyanmar(value); 420 } else if (locale.childOf(KYRGYZ)) { 421 value = replaceChars(path, value, KYRGYZ_CONVERSIONS, false); 422 } else if (locale.childOf(URDU) || locale.childOf(PASHTO) || locale.childOf(FARSI)) { 423 value = replaceChars(path, value, URDU_PLUS_CONVERSIONS, true); 424 } else if (locale.childOf(FF_ADLAM) && !isUnicodeSet) { 425 value = fixAdlamNasalization(value); 426 } 427 428 if (UNICODE_WHITESPACE.containsSome(value)) { 429 value = normalizeWhitespace(path, value); 430 } 431 432 // all of our values should not have leading or trailing spaces, except insertBetween, 433 // foreignSpaceReplacement, and anything with built-in attribute xml:space="preserve" 434 if (!path.contains("/insertBetween") && !path.contains("/foreignSpaceReplacement") && 435 !path.contains("[@xml:space=\"preserve\"]") && !isUnicodeSet) { 436 value = value.trim(); 437 } 438 439 // fix grouping separator if space 440 if (path.startsWith("//ldml/numbers/symbols") && !path.contains("/alias")) { 441 if (value.isEmpty()) { 442 value = "\u00A0"; 443 } 444 value = value.replace(' ', '\u00A0'); 445 } 446 447 // fix date patterns 448 DateTimePatternType datetimePatternType = DateTimePatternType.fromPath(path); 449 if (DateTimePatternType.STOCK_AVAILABLE_INTERVAL_PATTERNS.contains(datetimePatternType)) { 450 try { 451 value = dtc.getCanonicalDatePattern(path, value, datetimePatternType); 452 } catch (IllegalArgumentException ex) { 453 return value; 454 } 455 } 456 457 if (path.startsWith("//ldml/numbers/currencies/currency") && path.contains("displayName")) { 458 value = normalizeCurrencyDisplayName(value); 459 } 460 NumericType numericType = NumericType.getNumericType(path); 461 if (numericType != NumericType.NOT_NUMERIC) { 462 if (numericType == NumericType.CURRENCY) { 463 value = value.replaceAll(" ", "\u00A0"); 464 if (numericType == NumericType.CURRENCY_ABBREVIATED) { 465 value = value.replaceAll("0\\.0+", "0"); 466 } 467 } else { 468 value = value.replaceAll("([%\u00A4]) ", "$1\u00A0") 469 .replaceAll(" ([%\u00A4])", "\u00A0$1"); 470 value = replace(NON_DECIMAL_PERIOD, value, "'.'"); 471 if (numericType == NumericType.DECIMAL_ABBREVIATED) { 472 value = value.replaceAll("0\\.0+", "0"); 473 } 474 } 475 value = getCanonicalPattern(value, numericType, isPosix); 476 } 477 478 // fix [,] 479 if (path.startsWith("//ldml/localeDisplayNames/languages/language") 480 || path.startsWith("//ldml/localeDisplayNames/scripts/script") 481 || path.startsWith("//ldml/localeDisplayNames/territories/territory") 482 || path.startsWith("//ldml/localeDisplayNames/variants/variant") 483 || path.startsWith("//ldml/localeDisplayNames/keys/key") 484 || path.startsWith("//ldml/localeDisplayNames/types/type")) { 485 value = value.replace('[', '(').replace(']', ')').replace('[', '(').replace(']', ')'); 486 } 487 488 // Normalize two single quotes for the inches symbol. 489 if (path.contains("/units")) { 490 value = value.replace("''", "″"); 491 } 492 493 // check specific cases 494 if (isUnicodeSet) { 495 value = inputUnicodeSet(path, value); 496 } else if (path.contains("stopword")) { 497 if (value.equals("NONE")) { 498 value = ""; 499 } 500 } 501 502 // Normalize ellipsis data. 503 if (path.startsWith("//ldml/characters/ellipsis")) { 504 value = value.replace("...", "…"); 505 } 506 507 if (path.startsWith("//ldml/personNames/nameOrderLocales")) { 508 value = normalizeNameOrderLocales(value); 509 } 510 511 // Replace Arabic presentation forms with their nominal counterparts 512 value = replaceArabicPresentationForms(value); 513 514 // Fix up any apostrophes as appropriate (Don't do so for things like date patterns... 515 if (!APOSTROPHE_SKIP_PATHS.matcher(path).matches()) { 516 value = normalizeApostrophes(value); 517 } 518 // Fix up any apostrophes in number symbols 519 if (NUMBER_SEPARATOR_PATTERN.matcher(path).matches()) { 520 value = value.replace('\'', '\u2019'); 521 } 522 // Fix up hyphens, replacing with N-dash as appropriate 523 if (INTERVAL_FORMAT_PATHS.matcher(path).matches()) { 524 value = normalizeIntervalHyphensAndSpaces(value); // This may also adjust spaces around en dash 525 } else if (!isUnicodeSet) { 526 value = normalizeHyphens(value); 527 } 528 529 if (AnnotationUtil.pathIsAnnotation(path)) { 530 if (path.contains(Emoji.TYPE_TTS)) { 531 // The row has something like " -name" in the first column. Cf. namePath, getNamePaths. 532 // Normally the value is like "zebra" or "unicorn face", without "|". 533 // If the user enters a value with "|", discard anything after "|"; e.g., change "a | b | c" to "a". 534 value = SPLIT_BAR.split(value).iterator().next(); 535 } else { 536 // The row has something like " –keywords" in the first column. Cf. keywordPath, getKeywordPaths. 537 // Normally the value is like "stripe | zebra", with "|". 538 value = annotationsForDisplay(value); 539 } 540 } 541 value = normalizeZeroWidthSpace(value); 542 return value; 543 } catch (RuntimeException e) { 544 if (internalException != null) { 545 internalException[0] = e; 546 } 547 return original; 548 } 549 } 550 normalizeNameOrderLocales(String value)551 private String normalizeNameOrderLocales(String value) { 552 TreeSet<String> result = new TreeSet<>(SPLIT_SPACE.splitToList(value)); 553 result.remove("zxx"); 554 if (result.remove("und")) { // put und at the front 555 if (result.isEmpty()) { 556 return "und"; 557 } else { 558 return "und " + JOIN_SPACE.join(result); 559 } 560 } 561 return JOIN_SPACE.join(result); 562 } 563 564 /** 565 * Strip out all code points less than U+0020 except for U+0009 tab, 566 * U+000A line feed, and U+000D carriage return. 567 * 568 * @param s the string 569 * @return the resulting string 570 */ stripProblematicControlCharacters(String s)571 private String stripProblematicControlCharacters(String s) { 572 if (s == null || s.isEmpty()) { 573 return s; 574 } 575 return s.codePoints() 576 .filter(c -> (c >= 0x20 || c == 9 || c == 0xA || c == 0xD)) 577 .collect(StringBuilder::new, StringBuilder::appendCodePoint, StringBuilder::append) 578 .toString(); 579 } 580 581 private static final boolean REMOVE_COVERED_KEYWORDS = true; 582 583 /** 584 * Produce a modification of the given annotation by sorting its components and filtering covered keywords. 585 * 586 * Examples: Given "b | a", return "a | b". Given "bear | panda | panda bear", return "bear | panda". 587 * 588 * @param value the string 589 * @return the possibly modified string 590 */ annotationsForDisplay(String value)591 private static String annotationsForDisplay(String value) { 592 TreeSet<String> sorted = new TreeSet<>(Collator.getInstance(ULocale.ROOT)); 593 sorted.addAll(SPLIT_BAR.splitToList(value)); 594 if (REMOVE_COVERED_KEYWORDS) { 595 filterCoveredKeywords(sorted); 596 } 597 value = JOIN_BAR.join(sorted); 598 return value; 599 } 600 601 /** 602 * Filter from the given set some keywords that include spaces, if they duplicate, 603 * or are "covered by", other keywords in the set. 604 * 605 * For example, if the set is {"bear", "panda", "panda bear"} (annotation was "bear | panda | panda bear"), 606 * then remove "panda bear", treating it as "covered" since the set already includes "panda" and "bear". 607 * 608 * @param sorted the set from which items may be removed 609 */ filterCoveredKeywords(TreeSet<String> sorted)610 public static void filterCoveredKeywords(TreeSet<String> sorted) { 611 // for now, just do single items 612 HashSet<String> toRemove = new HashSet<>(); 613 614 for (String item : sorted) { 615 List<String> list = SPLIT_SPACE.splitToList(item); 616 if (list.size() < 2) { 617 continue; 618 } 619 if (sorted.containsAll(list)) { 620 toRemove.add(item); 621 } 622 } 623 sorted.removeAll(toRemove); 624 } 625 displayUnicodeSet(String value)626 private String displayUnicodeSet(String value) { 627 if (value.startsWith("[") && value.endsWith("]")) { 628 value = value.substring(1, value.length() - 1); 629 } 630 631 value = replace(NEEDS_QUOTE1, value, "$1\\\\$2$3"); 632 value = replace(NEEDS_QUOTE2, value, "$1\\\\$2$3"); 633 634 // if (RTL.containsSome(value) && value.startsWith("[") && value.endsWith("]")) { 635 // return "\u200E[\u200E" + value.substring(1,value.length()-2) + "\u200E]\u200E"; 636 // } 637 return value; 638 } 639 inputUnicodeSet(String path, String value)640 private String inputUnicodeSet(String path, String value) { 641 // clean up the user's input. 642 // first, fix up the '[' 643 value = value.trim(); 644 645 // remove brackets and trim again before regex 646 if (value.startsWith("[")) { 647 value = value.substring(1); 648 } 649 if (value.endsWith("]") && (!value.endsWith("\\]") || value.endsWith("\\\\]"))) { 650 value = value.substring(0, value.length() - 1); 651 } 652 value = value.trim(); 653 654 value = replace(NEEDS_QUOTE1, value, "$1\\\\$2$3"); 655 value = replace(NEEDS_QUOTE2, value, "$1\\\\$2$3"); 656 657 // re-add brackets. 658 value = "[" + value + "]"; 659 660 UnicodeSet exemplar = new UnicodeSet(value); 661 XPathParts parts = XPathParts.getFrozenInstance(path); 662 if (parts.getElement(2).equals("parseLenients")) { 663 return exemplar.toPattern(false); 664 } 665 final String type = parts.getAttributeValue(-1, "type"); 666 ExemplarType exemplarType = type == null ? ExemplarType.main : ExemplarType.valueOf(type); 667 value = getCleanedUnicodeSet(exemplar, pp, exemplarType); 668 return value; 669 } 670 normalizeCurrencyDisplayName(String value)671 private String normalizeCurrencyDisplayName(String value) { 672 StringBuilder result = new StringBuilder(); 673 boolean inParentheses = false; 674 for (int i = 0; i < value.length(); i++) { 675 char c = value.charAt(i); 676 if (c == '(') { 677 inParentheses = true; 678 } else if (c == ')') { 679 inParentheses = false; 680 } 681 if (inParentheses && c == '-' && Character.isDigit(value.charAt(i - 1))) { 682 c = 0x2013; /* Replace hyphen-minus with dash for date ranges */ 683 } 684 result.append(c); 685 } 686 return result.toString(); 687 } 688 normalizeApostrophes(String value)689 private String normalizeApostrophes(String value) { 690 // If our DAIP always had a CLDRFile to work with, then we could just check the exemplar set in it to see. 691 // But since we don't, we just maintain the list internally and use it. 692 if (LANGUAGES_USING_MODIFIER_APOSTROPHE.contains(locale.getLanguage())) { 693 return value.replace('\'', '\u02bc'); 694 } else { 695 char prev = 0; 696 StringBuilder builder = new StringBuilder(); 697 for (char c : value.toCharArray()) { 698 if (c == '\'') { 699 if (Character.isLetter(prev)) { 700 builder.append('\u2019'); 701 } else { 702 builder.append('\u2018'); 703 } 704 } else { 705 builder.append(c); 706 } 707 prev = c; 708 } 709 return builder.toString(); 710 } 711 } 712 normalizeIntervalHyphensAndSpaces(String value)713 private String normalizeIntervalHyphensAndSpaces(String value) { 714 if (value.indexOf("{0}") >= 0) { 715 // intervalFormatFallback pattern, not handled by DateTimePatternGenerator.FormatParser 716 if (scriptCode.equals("Latn")) { 717 value = INTERVAL_FALLBACK_RANGE.matcher(value).replaceAll("}\u2009\u2013\u2009{"); 718 } 719 return value; 720 } 721 DateTimePatternGenerator.FormatParser fp = new DateTimePatternGenerator.FormatParser(); 722 fp.set(DateIntervalInfo.genPatternInfo(value, false).getFirstPart()); // first format & separator including spaces 723 List<Object> items = fp.getItems(); 724 Object last = items.get(items.size() - 1); 725 if (last instanceof String) { 726 String separator = last.toString(); // separator including spaces 727 String replacement = separator; 728 if (scriptCode.equals("Latn") && (separator.equals(" - ") || separator.equals(" \u2013 "))) { 729 replacement = "\u2009\u2013\u2009"; // Per CLDR-14032 730 } else if (separator.contains("-")) { 731 replacement = separator.replace("-", "\u2013"); 732 } 733 if (!replacement.equals(separator)) { 734 StringBuilder sb = new StringBuilder(); 735 sb.append(DateIntervalInfo.genPatternInfo(value, false).getFirstPart()); 736 if (sb.lastIndexOf(separator) >= 0) { 737 sb.delete(sb.lastIndexOf(separator), sb.length()); 738 sb.append(replacement); 739 sb.append(DateIntervalInfo.genPatternInfo(value, false).getSecondPart()); // second format only 740 return sb.toString(); 741 } 742 } 743 } 744 return value; 745 } 746 normalizeHyphens(String value)747 private String normalizeHyphens(String value) { 748 int hyphenLocation = value.indexOf("-"); 749 if (hyphenLocation > 0 && 750 Character.isDigit(value.charAt(hyphenLocation - 1)) && 751 hyphenLocation < value.length() - 1 && 752 Character.isDigit(value.charAt(hyphenLocation + 1))) { 753 StringBuilder sb = new StringBuilder(); 754 sb.append(value.substring(0, hyphenLocation)); 755 sb.append("\u2013"); 756 sb.append(value.substring(hyphenLocation + 1)); 757 return sb.toString(); 758 } 759 return value; 760 } 761 standardizeRomanian(String value)762 private String standardizeRomanian(String value) { 763 StringBuilder builder = new StringBuilder(); 764 for (char c : value.toCharArray()) { 765 for (char[] pair : ROMANIAN_CONVERSIONS) { 766 if (c == pair[0]) { 767 c = pair[1]; 768 break; 769 } 770 } 771 builder.append(c); 772 } 773 return builder.toString(); 774 } 775 standardizeKwasio(String value)776 private String standardizeKwasio(String value) { 777 StringBuilder builder = new StringBuilder(); 778 for (char c : value.toCharArray()) { 779 for (char[] pair : KWASIO_CONVERSIONS) { 780 if (c == pair[0]) { 781 c = pair[1]; 782 break; 783 } 784 } 785 builder.append(c); 786 } 787 return builder.toString(); 788 } 789 790 // Use the myanmar-tools detector. standardizeMyanmar(String value)791 private String standardizeMyanmar(String value) { 792 if (detector.getZawgyiProbability(value) > 0.90) { 793 return zawgyiUnicodeTransliterator.transform(value); 794 } 795 return value; 796 } 797 standardizeNgomba(String value)798 private String standardizeNgomba(String value) { 799 StringBuilder builder = new StringBuilder(); 800 char[] charArray = value.toCharArray(); 801 for (int i = 0; i < charArray.length; i++) { 802 char c = charArray[i]; 803 boolean convertedSaltillo = false; 804 for (char[] pair : NGOMBA_CONVERSIONS) { 805 if (c == pair[0]) { 806 c = pair[1]; 807 if (c == '\uA78C') { 808 convertedSaltillo = true; 809 } 810 break; 811 } 812 } 813 if (convertedSaltillo && 814 ((i > 0 && i < charArray.length - 1 && Character.isUpperCase(charArray[i - 1]) && Character.isUpperCase(charArray[i + 1])) || 815 (i > 1 && Character.isUpperCase(charArray[i - 1]) && Character.isUpperCase(charArray[i - 2])))) { 816 c = '\uA78B'; // UPPER CASE SALTILLO 817 } 818 builder.append(c); 819 } 820 return builder.toString(); 821 } 822 replaceChars(String path, String value, char[][] charsToReplace, boolean skipAuxExemplars)823 private String replaceChars(String path, String value, char[][] charsToReplace, boolean skipAuxExemplars) { 824 if (skipAuxExemplars && path.contains("/exemplarCharacters[@type=\"auxiliary\"]")) { 825 return value; 826 } 827 StringBuilder builder = new StringBuilder(); 828 for (char c : value.toCharArray()) { 829 for (char[] pair : charsToReplace) { 830 if (c == pair[0]) { 831 c = pair[1]; 832 break; 833 } 834 } 835 builder.append(c); 836 } 837 return builder.toString(); 838 } 839 standardizeSwissGerman(String value)840 private String standardizeSwissGerman(String value) { 841 return value.replaceAll("\u00DF", "ss"); 842 } 843 standardizeCatalan(String value)844 private String standardizeCatalan(String value) { 845 StringBuilder builder = new StringBuilder(); 846 for (char c : value.toCharArray()) { 847 boolean didSubstitute = false; 848 for (char[] triple : CATALAN_CONVERSIONS) { 849 if (c == triple[0]) { 850 builder.append(triple[1]); 851 builder.append(triple[2]); 852 didSubstitute = true; 853 break; 854 } 855 } 856 if (!didSubstitute) { 857 builder.append(c); 858 } 859 } 860 return builder.toString(); 861 } 862 replace(Pattern pattern, String value, String replacement)863 private String replace(Pattern pattern, String value, String replacement) { 864 String value2 = pattern.matcher(value).replaceAll(replacement); 865 if (DEBUG_DAIP && !value.equals(value2)) { 866 System.out.println("\n" + value + " => " + value2); 867 } 868 return value2; 869 } 870 871 private static Pattern UNNORMALIZED_MALAYALAM = PatternCache.get( 872 "(\u0D23|\u0D28|\u0D30|\u0D32|\u0D33|\u0D15)\u0D4D\u200D"); 873 874 private static Map<Character, Character> NORMALIZING_MAP = Builder.with(new HashMap<Character, Character>()) 875 .put('\u0D23', '\u0D7A').put('\u0D28', '\u0D7B') 876 .put('\u0D30', '\u0D7C').put('\u0D32', '\u0D7D') 877 .put('\u0D33', '\u0D7E').put('\u0D15', '\u0D7F').get(); 878 879 /** 880 * Normalizes the Malayalam characters in the specified input. 881 * 882 * @param value 883 * the input to be normalized 884 * @return 885 */ normalizeMalayalam(String value)886 private String normalizeMalayalam(String value) { 887 // Normalize Malayalam characters. 888 Matcher matcher = UNNORMALIZED_MALAYALAM.matcher(value); 889 if (matcher.find()) { 890 StringBuffer buffer = new StringBuffer(); 891 int start = 0; 892 do { 893 buffer.append(value.substring(start, matcher.start(0))); 894 char codePoint = matcher.group(1).charAt(0); 895 buffer.append(NORMALIZING_MAP.get(codePoint)); 896 start = matcher.end(0); 897 } while (matcher.find()); 898 buffer.append(value.substring(start)); 899 value = buffer.toString(); 900 } 901 return value; 902 } 903 904 static final Transform<String, String> fixArabicPresentation = Transliterator.getInstance( 905 "[[:block=Arabic_Presentation_Forms_A:][:block=Arabic_Presentation_Forms_B:]] nfkc"); 906 907 /** 908 * Normalizes the Arabic presentation forms characters in the specified input. 909 * 910 * @param value 911 * the input to be normalized 912 * @return 913 */ replaceArabicPresentationForms(String value)914 private String replaceArabicPresentationForms(String value) { 915 value = fixArabicPresentation.transform(value); 916 return value; 917 } 918 919 static Pattern ADLAM_MISNASALIZED = PatternCache.get("([])['’‘]([])"); 920 public static String ADLAM_NASALIZATION = ""; // U+1E94B (Unicode 12.0) 921 fixAdlamNasalization(String fromString)922 public static String fixAdlamNasalization(String fromString) { 923 return ADLAM_MISNASALIZED.matcher(fromString) 924 .replaceAll("$1"+ADLAM_NASALIZATION+"$2"); // replace quote with 925 } 926 927 static Pattern REMOVE_QUOTE1 = PatternCache.get("(\\s)(\\\\[-\\}\\]\\&])()"); 928 static Pattern REMOVE_QUOTE2 = PatternCache.get("(\\\\[\\-\\{\\[\\&])(\\s)"); // ([^\\])([\\-\\{\\[])(\\s) 929 930 static Pattern NEEDS_QUOTE1 = PatternCache.get("(\\s|$)([-\\}\\]\\&])()"); 931 static Pattern NEEDS_QUOTE2 = PatternCache.get("([^\\\\])([\\-\\{\\[\\&])(\\s)"); // ([^\\])([\\-\\{\\[])(\\s) 932 getCleanedUnicodeSet(UnicodeSet exemplar, UnicodeSetPrettyPrinter prettyPrinter, ExemplarType exemplarType)933 public static String getCleanedUnicodeSet(UnicodeSet exemplar, UnicodeSetPrettyPrinter prettyPrinter, 934 ExemplarType exemplarType) { 935 if (prettyPrinter == null) { 936 return exemplar.toPattern(false); 937 } 938 String value; 939 prettyPrinter.setCompressRanges(exemplar.size() > 300); 940 value = exemplar.toPattern(false); 941 UnicodeSet toAdd = new UnicodeSet(); 942 943 for (UnicodeSetIterator usi = new UnicodeSetIterator(exemplar); usi.next();) { 944 String string = usi.getString(); 945 if (string.equals("ß") || string.equals("İ")) { 946 toAdd.add(string); 947 continue; 948 } 949 switch (string) { 950 case "\u2011": toAdd.add("-"); break; // nobreak hyphen 951 case "-": toAdd.add("\u2011"); break; // nobreak hyphen 952 953 case " ": toAdd.add("\u00a0"); break; // nobreak space 954 case "\u00a0": toAdd.add(" "); break; // nobreak space 955 956 case "\u202F": toAdd.add("\u2009"); break; // nobreak narrow space 957 case "\u2009": toAdd.add("\u202F"); break; // nobreak narrow space 958 } 959 if (exemplarType.convertUppercase) { 960 string = UCharacter.toLowerCase(ULocale.ENGLISH, string); 961 } 962 toAdd.add(string); 963 String composed = Normalizer.compose(string, false); 964 if (!string.equals(composed)) { 965 toAdd.add(composed); 966 } 967 } 968 969 toAdd.removeAll(exemplarType.toRemove); 970 971 if (DEBUG_DAIP && !toAdd.equals(exemplar)) { 972 UnicodeSet oldOnly = new UnicodeSet(exemplar).removeAll(toAdd); 973 UnicodeSet newOnly = new UnicodeSet(toAdd).removeAll(exemplar); 974 System.out.println("Exemplar:\t" + exemplarType + ",\tremoved\t" + oldOnly + ",\tadded\t" + newOnly); 975 } 976 977 String fixedExemplar = prettyPrinter.format(toAdd); 978 UnicodeSet doubleCheck = new UnicodeSet(fixedExemplar); 979 if (!toAdd.equals(doubleCheck)) { 980 // something went wrong, leave as is 981 } else if (!value.equals(fixedExemplar)) { // put in this condition just for debugging 982 if (DEBUG_DAIP) { 983 System.out.println(TestMetadata.showDifference( 984 With.codePoints(value), 985 With.codePoints(fixedExemplar), 986 "\n")); 987 } 988 value = fixedExemplar; 989 } 990 return value; 991 } 992 993 /** 994 * @return a canonical numeric pattern, based on the type, and the isPOSIX flag. The latter is set for en_US_POSIX. 995 */ 996 static final Splitter SEMI_SPLITTER = Splitter.on(';').trimResults(); 997 getCanonicalPattern(String inpattern, NumericType type, boolean isPOSIX)998 public static String getCanonicalPattern(String inpattern, NumericType type, boolean isPOSIX) { 999 // TODO fix later to properly handle quoted ; 1000 1001 DecimalFormat df = new DecimalFormat(inpattern); 1002 if (type == NumericType.DECIMAL_ABBREVIATED || type == NumericType.CURRENCY_ABBREVIATED 1003 || CldrUtility.INHERITANCE_MARKER.equals(inpattern)) { 1004 return inpattern; // TODO fix when ICU bug is fixed 1005 // df.setMaximumFractionDigits(df.getMinimumFractionDigits()); 1006 // df.setMaximumIntegerDigits(Math.max(1, df.getMinimumIntegerDigits())); 1007 } else { 1008 // int decimals = type == CURRENCY_TYPE ? 2 : 1; 1009 int[] digits = isPOSIX ? type.posixDigitCount : type.digitCount; 1010 df.setMinimumIntegerDigits(digits[0]); 1011 df.setMinimumFractionDigits(digits[1]); 1012 df.setMaximumFractionDigits(digits[2]); 1013 } 1014 String pattern = df.toPattern(); 1015 List<String> parts = SEMI_SPLITTER.splitToList(pattern); 1016 String pattern2 = parts.get(0); 1017 if (parts.size() > 1) { 1018 pattern2 += ";" + parts.get(1); 1019 } 1020 if (!pattern2.equals(pattern)) { 1021 pattern = pattern2; 1022 } 1023 // int pos = pattern.indexOf(';'); 1024 // if (pos < 0) return pattern + ";-" + pattern; 1025 return pattern; 1026 } 1027 1028 /* 1029 * This tests what type a numeric pattern is. 1030 */ 1031 public enum NumericType { 1032 CURRENCY(new int[] { 1, 2, 2 }, new int[] { 1, 2, 2 }), CURRENCY_ABBREVIATED(), DECIMAL(new int[] { 1, 0, 3 }, 1033 new int[] { 1, 0, 6 }), DECIMAL_ABBREVIATED(), PERCENT(new int[] { 1, 0, 0 }, 1034 new int[] { 1, 0, 0 }), SCIENTIFIC(new int[] { 0, 0, 0 }, new int[] { 1, 6, 6 }), NOT_NUMERIC; 1035 1036 private static final Pattern NUMBER_PATH = Pattern 1037 .compile("//ldml/numbers/((currency|decimal|percent|scientific)Formats|currencies/currency).*"); 1038 private int[] digitCount; 1039 private int[] posixDigitCount; 1040 NumericType()1041 private NumericType() { 1042 } 1043 NumericType(int[] digitCount, int[] posixDigitCount)1044 private NumericType(int[] digitCount, int[] posixDigitCount) { 1045 this.digitCount = digitCount; 1046 this.posixDigitCount = posixDigitCount; 1047 } 1048 1049 /** 1050 * @return the numeric type of the xpath 1051 */ getNumericType(String xpath)1052 public static NumericType getNumericType(String xpath) { 1053 Matcher matcher = NUMBER_PATH.matcher(xpath); 1054 if (xpath.indexOf("/pattern") < 0) { 1055 return NOT_NUMERIC; 1056 } else if (matcher.matches()) { 1057 if (matcher.group(1).equals("currencies/currency")) { 1058 return CURRENCY; 1059 } else { 1060 NumericType type = NumericType.valueOf(matcher.group(2).toUpperCase()); 1061 if (xpath.contains("=\"1000")) { 1062 if (type == DECIMAL) { 1063 type = DECIMAL_ABBREVIATED; 1064 } else if (type == CURRENCY) { 1065 type = CURRENCY_ABBREVIATED; 1066 } else { 1067 throw new IllegalArgumentException("Internal Error"); 1068 } 1069 } 1070 return type; 1071 } 1072 } else { 1073 return NOT_NUMERIC; 1074 } 1075 } 1076 getDigitCount()1077 public int[] getDigitCount() { 1078 return digitCount; 1079 } 1080 getPosixDigitCount()1081 public int[] getPosixDigitCount() { 1082 return posixDigitCount; 1083 } 1084 } 1085 1086 /** 1087 * Turn all whitespace sequences (including tab and newline, and NBSP for certain paths) 1088 * into a single space or a single NBSP depending on path. 1089 * Also trim initial/final NBSP, unless the value is only the one character, "\u00A0" 1090 * 1091 * @param path 1092 * @param value 1093 * @return the normalized value 1094 */ normalizeWhitespace(String path, String value)1095 private String normalizeWhitespace(String path, String value) { 1096 PathSpaceType pst = PathSpaceType.get(path); 1097 if (pst == PathSpaceType.allowSp) { 1098 value = WHITESPACE_AND_NBSP_TO_NORMALIZE.matcher(value).replaceAll(" "); // replace with regular space 1099 } else if (pst == PathSpaceType.allowNbsp) { 1100 value = WHITESPACE_AND_NBSP_TO_NORMALIZE.matcher(value).replaceAll("\u00A0"); // replace with NBSP 1101 value = trimNBSP(value); 1102 } else if (pst == PathSpaceType.allowNNbsp) { 1103 value = WHITESPACE_AND_NBSP_TO_NORMALIZE.matcher(value).replaceAll("\u202F"); // replace with NNBSP 1104 value = trimNBSP(value); 1105 } else if (pst == PathSpaceType.allowSpOrNbsp) { 1106 /* 1107 * in this case don't normalize away NBSP 1108 */ 1109 value = WHITESPACE_NO_NBSP_TO_NORMALIZE.matcher(value).replaceAll(" "); // replace with regular space 1110 /* 1111 * if any NBSP and regular space are adjacent, replace with NBSP 1112 */ 1113 value = NBSP_PLUS_SPACE_TO_NORMALIZE.matcher(value).replaceAll("\u00A0"); 1114 value = SPACE_PLUS_NBSP_TO_NORMALIZE.matcher(value).replaceAll("\u00A0"); 1115 value = MULTIPLE_NBSP.matcher(value).replaceAll("\u00A0"); 1116 value = trimNBSP(value); 1117 } else { 1118 throw new IllegalArgumentException("Unknown PathSpaceType " + pst); 1119 } 1120 1121 // Further whitespace adjustments per CLDR-14032 1122 if ((scriptCode.equals("Latn") || scriptCode.equals("Cyrl") || scriptCode.equals("Grek")) && 1123 HOUR_FORMAT_XPATHS.matcher(path).matches()) { 1124 String test = AMPM_SPACE_BEFORE.matcher(value).replaceAll("$1$2"); // value without a+ 1125 if (value.length() - test.length() != 4) { // exclude patterns with aaaa 1126 value = AMPM_SPACE_BEFORE.matcher(value).replaceAll("$1\u202F$3"); 1127 } 1128 test = AMPM_SPACE_AFTER.matcher(value).replaceAll("$2$3"); // value without a+ 1129 if (value.length() - test.length() != 4) { // exclude patterns with aaaa 1130 value = AMPM_SPACE_AFTER.matcher(value).replaceAll("$1\u202F$3"); 1131 } 1132 } 1133 if (scriptCode.equals("Cyrl") && YEAR_FORMAT_XPATHS.matcher(path).matches()) { 1134 value = YEAR_SPACE_YEARMARKER.matcher(value).replaceAll("y\u202F$1"); 1135 } 1136 if (UNIT_NARROW_XPATHS.matcher(path).matches()) { 1137 value = PLACEHOLDER_SPACE_AFTER.matcher(value).replaceAll("}\u202F"); // Narrow NBSP 1138 value = PLACEHOLDER_SPACE_BEFORE.matcher(value).replaceAll("\u202F{"); 1139 } 1140 if (UNIT_SHORT_XPATHS.matcher(path).matches()) { 1141 value = PLACEHOLDER_SPACE_AFTER.matcher(value).replaceAll("}\u00A0"); // Regular NBSP 1142 value = PLACEHOLDER_SPACE_BEFORE.matcher(value).replaceAll("\u00A0{"); 1143 } 1144 1145 return value; 1146 } 1147 1148 /** 1149 * Delete any initial or final NBSP, unless the value is just NBSP 1150 * 1151 * @param value 1152 * @return the trimmed value 1153 */ trimNBSP(String value)1154 private String trimNBSP(String value) { 1155 if (!value.equals("\u00A0") && !value.equals("\u202F")) { 1156 value = INITIAL_NBSP.matcher(value).replaceAll(""); 1157 value = FINAL_NBSP.matcher(value).replaceAll(""); 1158 } 1159 return value; 1160 } 1161 1162 /** 1163 * Categorize xpaths according to whether they allow space, NBSP, or both 1164 */ 1165 public enum PathSpaceType { 1166 allowSp, allowNbsp, allowNNbsp, allowSpOrNbsp; 1167 get(String path)1168 public static PathSpaceType get(String path) { 1169 if (wantsRegularSpace(path)) { 1170 return allowSp; 1171 } else if (wantsNBSP(path)) { 1172 return allowNbsp; 1173 } else if (wantsNNBSP(path)) { 1174 return allowNNbsp; 1175 } else { 1176 return allowSpOrNbsp; 1177 } 1178 } 1179 wantsRegularSpace(String path)1180 private static boolean wantsRegularSpace(String path) { 1181 if ((path.contains("/dateFormatLength") && path.contains("/pattern")) || 1182 path.contains("/availableFormats/dateFormatItem") || 1183 (path.startsWith("//ldml/dates/timeZoneNames/metazone") && path.contains("/long")) || 1184 path.startsWith("//ldml/dates/timeZoneNames/regionFormat") || 1185 path.startsWith("//ldml/localeDisplayNames/codePatterns/codePattern") || 1186 path.startsWith("//ldml/localeDisplayNames/languages/language") || 1187 path.startsWith("//ldml/localeDisplayNames/territories/territory") || 1188 path.startsWith("//ldml/localeDisplayNames/types/type") || 1189 (path.startsWith("//ldml/numbers/currencies/currency") && path.contains("/displayName")) || 1190 (path.contains("/decimalFormatLength[@type=\"long\"]") && path.contains("/pattern")) || 1191 path.startsWith("//ldml/posix/messages") || 1192 (path.startsWith("//ldml/units/uni") && path.contains("/unitPattern "))) { 1193 return true; 1194 } 1195 return false; 1196 } 1197 wantsNBSP(String path)1198 private static boolean wantsNBSP(String path) { 1199 if ((path.contains("/currencies/currency") && (path.contains("/group") || path.contains("/pattern"))) || 1200 (path.contains("/currencyFormatLength") && path.contains("/pattern")) || 1201 (path.contains("/currencySpacing") && path.contains("/insertBetween")) || 1202 (path.contains("/decimalFormatLength") && path.contains("/pattern")) || // i.e. the non-long ones 1203 (path.contains("/percentFormatLength") && path.contains("/pattern")) || 1204 (path.startsWith("//ldml/numbers/symbols") && (path.contains("/group") || path.contains("/nan")))) { 1205 return true; 1206 } 1207 return false; 1208 } 1209 wantsNNBSP(String path)1210 private static boolean wantsNNBSP(String path) { 1211 if ((path.contains("/dayPeriodWidth[@type=\"abbreviated\"]") || path.contains("/dayPeriodWidth[@type=\"narrow\"]")) && 1212 (path.contains("/dayPeriod[@type=\"am\"]") || path.contains("/dayPeriod[@type=\"pm\"]")) ) { 1213 return true; 1214 } 1215 return false; 1216 } 1217 } 1218 1219 private static final Pattern ZERO_WIDTH_SPACES = PatternCache.get("\\u200B+"); 1220 private static final Set<String> LOCALES_NOT_ALLOWING_ZWS = new HashSet<>(Arrays.asList("da", "fr")); 1221 1222 /** 1223 * Remove occurrences of U+200B ZERO_WIDTH_SPACE under certain conditions 1224 * 1225 * @param value the value to be normalized 1226 * @return the normalized value 1227 * 1228 * TODO: extend this method to address more concerns, after clarifying the conditions 1229 * - enlarge the set LOCALES_NOT_ALLOWING_ZWS? 1230 * - strip initial and final ZWS in all locales? 1231 * - reduce two or more adjacent ZWS to one ZWS? 1232 * - allow or prohibit ZWS by itself as currency symbol, as currently in locales kea, pt_CV, pt_PT 1233 * - allow or prohibit ZWS preceding URL as in "as per [U+200B]http://unicode.org/repos/cldr/trunk/specs/ldml/tr35-general.html#Annotations" 1234 * Reference: https://unicode-org.atlassian.net/browse/CLDR-15976 1235 */ normalizeZeroWidthSpace(String value)1236 private String normalizeZeroWidthSpace(String value) { 1237 if (ZERO_WIDTH_SPACES.matcher(value).find()) { 1238 final String localeId = locale.getBaseName(); 1239 if (LOCALES_NOT_ALLOWING_ZWS.contains(localeId)) { 1240 value = ZERO_WIDTH_SPACES.matcher(value).replaceAll(""); 1241 } 1242 } 1243 return value; 1244 } 1245 } 1246