• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* Copyright (C) 2007-2013 Google and others.  All Rights Reserved. */
2 /* Copyright (C) 2007-2013 IBM Corp. and others. All Rights Reserved. */
3 
4 package org.unicode.cldr.test;
5 
6 import java.util.Arrays;
7 import java.util.HashMap;
8 import java.util.HashSet;
9 import java.util.List;
10 import java.util.Map;
11 import java.util.Set;
12 import java.util.TreeSet;
13 import java.util.regex.Matcher;
14 import java.util.regex.Pattern;
15 
16 import org.unicode.cldr.test.CheckExemplars.ExemplarType;
17 import org.unicode.cldr.util.AnnotationUtil;
18 import org.unicode.cldr.util.Builder;
19 import org.unicode.cldr.util.CLDRConfig;
20 import org.unicode.cldr.util.CLDRFile;
21 import org.unicode.cldr.util.CLDRLocale;
22 import org.unicode.cldr.util.CldrUtility;
23 import org.unicode.cldr.util.DateTimeCanonicalizer;
24 import org.unicode.cldr.util.DateTimeCanonicalizer.DateTimePatternType;
25 import org.unicode.cldr.util.Emoji;
26 import org.unicode.cldr.util.ICUServiceBuilder;
27 import org.unicode.cldr.util.PatternCache;
28 import org.unicode.cldr.util.SupplementalDataInfo;
29 import org.unicode.cldr.util.UnicodeSetPrettyPrinter;
30 import org.unicode.cldr.util.With;
31 import org.unicode.cldr.util.XPathParts;
32 
33 import com.google.common.base.Joiner;
34 import com.google.common.base.Splitter;
35 import com.google.myanmartools.ZawgyiDetector;
36 import com.ibm.icu.lang.UCharacter;
37 import com.ibm.icu.text.Collator;
38 import com.ibm.icu.text.DateIntervalInfo;
39 import com.ibm.icu.text.DateTimePatternGenerator;
40 import com.ibm.icu.text.DecimalFormat;
41 import com.ibm.icu.text.Normalizer;
42 import com.ibm.icu.text.RuleBasedCollator;
43 import com.ibm.icu.text.Transform;
44 import com.ibm.icu.text.Transliterator;
45 import com.ibm.icu.text.UnicodeSet;
46 import com.ibm.icu.text.UnicodeSetIterator;
47 import com.ibm.icu.util.ULocale;
48 
49 /**
50  * Class for processing the input and output of CLDR data for use in the
51  * Survey Tool and other tools.
52  */
53 public class DisplayAndInputProcessor {
54 
55     private static final boolean FIX_YEARS = true;
56 
57     public static final boolean DEBUG_DAIP = CldrUtility.getProperty("DEBUG_DAIP", false);
58 
59     public static final UnicodeSet RTL = new UnicodeSet("[[:Bidi_Class=Arabic_Letter:][:Bidi_Class=Right_To_Left:]]")
60         .freeze();
61 
62     public static final UnicodeSet TO_QUOTE = new UnicodeSet(
63         "[[:Cn:]" +
64             "[:Default_Ignorable_Code_Point:]" +
65             "[:patternwhitespace:]" +
66             "[:Me:][:Mn:]]" // add non-spacing marks
67     ).freeze();
68 
69     public static final Pattern NUMBER_FORMAT_XPATH = Pattern
70         .compile("//ldml/numbers/.*Format\\[@type=\"standard\"]/pattern.*");
71 
72     public static final Pattern NUMBER_SEPARATOR_PATTERN = Pattern
73         .compile("//ldml/numbers/symbols.*/(decimal|group)");
74 
75     private static final Pattern APOSTROPHE_SKIP_PATHS = PatternCache.get("//ldml/("
76         + "localeDisplayNames/languages/language\\[@type=\"mic\"].*|"
77         + "characters/.*|"
78         + "delimiters/.*|"
79         + "dates/.+/(pattern|intervalFormatItem|dateFormatItem).*|"
80         + "units/.+/unitPattern.*|"
81         + "units/.+/durationUnitPattern.*|"
82         + "numbers/symbols.*|"
83         + "numbers/miscPatterns.*|"
84         + "numbers/(decimal|currency|percent|scientific)Formats.+/(decimal|currency|percent|scientific)Format.*)");
85     private static final Pattern INTERVAL_FORMAT_PATHS = PatternCache.get("//ldml/dates/.+/intervalFormat(Item.*|Fallback)");
86     private static final Pattern NON_DECIMAL_PERIOD = PatternCache.get("(?<![0#'])\\.(?![0#'])");
87 
88     // Pattern to match against paths that might have time formats with h or K (12-hour cycles)
89     private static final Pattern HOUR_FORMAT_XPATHS = PatternCache
90         .get("//ldml/dates/calendars/calendar\\[@type=\"[^\"]*\"]/("
91             + "timeFormats/timeFormatLength\\[@type=\"[^\"]*\"]/timeFormat\\[@type=\"standard\"]/pattern\\[@type=\"standard\"].*|"
92             + "dateTimeFormats/availableFormats/dateFormatItem\\[@id=\"[A-GL-Ma-gl-m]*[hK][A-Za-z]*\"].*|"
93             + "dateTimeFormats/intervalFormats/intervalFormatItem\\[@id=\"[A-GL-Ma-gl-m]*[hK][A-Za-z]*\"].*)");
94 
95     private static final Pattern AMPM_SPACE_BEFORE = PatternCache.get("([Khms])([ \\u00A0]+)(a+)"); // time, space, a+
96     private static final Pattern AMPM_SPACE_AFTER = PatternCache.get("(a+)([ \\u00A0]+)([Kh])"); // a+, space, hour
97 
98     // Pattern to match against paths that might have date formats with y
99     private static final Pattern YEAR_FORMAT_XPATHS = PatternCache
100         .get("//ldml/dates/calendars/calendar\\[@type=\"[^\"]*\"]/("
101             + "dateFormats/dateFormatLength\\[@type=\"[^\"]*\"]/dateFormat\\[@type=\"standard\"]/pattern\\[@type=\"standard\"].*|"
102             + "dateTimeFormats/availableFormats/dateFormatItem\\[@id=\"[A-XZa-xz]*y[A-Za-z]*\"].*|"
103             + "dateTimeFormats/intervalFormats/intervalFormatItem\\[@id=\"[A-XZa-xz]*y[A-Za-z]*\"].*)");
104 
105     // Cyrillic year markers are or begin with (in various languages) \u0430 \u0433 \u0435 \u0436 \u043E \u0440 \u0441
106     private static final Pattern YEAR_SPACE_YEARMARKER = PatternCache.get("y[ \\u00A0]+('?[агежорс])"); // y, space, Cyrillic year marker start
107 
108     public static final Pattern UNIT_NARROW_XPATHS = PatternCache
109         .get("//ldml/units/unitLength\\[@type=\"narrow\"]unit\\[@type=\"[^\"]*\"]/unitPattern.*");
110 
111     public static final Pattern UNIT_SHORT_XPATHS = PatternCache
112         .get("//ldml/units/unitLength\\[@type=\"short\"]unit\\[@type=\"[^\"]*\"]/unitPattern.*");
113 
114     private static final Pattern PLACEHOLDER_SPACE_AFTER = PatternCache.get("\\}[ \\u00A0\\u202F]+");
115     private static final Pattern PLACEHOLDER_SPACE_BEFORE = PatternCache.get("[ \\u00A0\\u202F]+\\{");
116     private static final Pattern INTERVAL_FALLBACK_RANGE = PatternCache.get("\\} [\\u2013-] \\{");
117 
/**
 * A run of whitespace as matched by the regex class \s, i.e. [ \t\n\x0B\f\r]+
 * (space, tab, line feed, vertical tab, form feed, carriage return); NBSP is not included.
 */
121     private static final Pattern WHITESPACE_NO_NBSP_TO_NORMALIZE = PatternCache.get("\\s+"); //
122 
/**
 * A run of whitespace as matched by the regex class \s, plus NBSP (U+00A0),
 * i.e. [ \t\n\x0B\f\r and U+00A0]+
 */
126     private static final Pattern WHITESPACE_AND_NBSP_TO_NORMALIZE = PatternCache.get("[\\s\\u00A0]+");
127 
128     /**
129      * one or more NBSP followed by one or more regular spaces
130      */
131     private static final Pattern NBSP_PLUS_SPACE_TO_NORMALIZE = PatternCache.get("\\u00A0+\\u0020+");
132 
133     /**
134      * one or more regular spaces followed by one or more NBSP
135      */
136     private static final Pattern SPACE_PLUS_NBSP_TO_NORMALIZE = PatternCache.get("\\u0020+\\u00A0+");
137 
138     private static final Pattern INITIAL_NBSP = PatternCache.get("^[\\u00A0\\u202F]+");
139     private static final Pattern FINAL_NBSP = PatternCache.get("[\\u00A0\\u202F]+$");
140     private static final Pattern MULTIPLE_NBSP = PatternCache.get("\\u00A0\\u00A0+");
141 
142     // The following includes (among others) \u0009, \u0020, \u00A0, \u2007, \u2009, \u202F, \u3000
143     private static final UnicodeSet UNICODE_WHITESPACE = new UnicodeSet("[:whitespace:]").freeze();
144 
145     private static final CLDRLocale MALAYALAM = CLDRLocale.getInstance("ml");
146     private static final CLDRLocale ROMANIAN = CLDRLocale.getInstance("ro");
147     private static final CLDRLocale CATALAN = CLDRLocale.getInstance("ca");
148     private static final CLDRLocale NGOMBA = CLDRLocale.getInstance("jgo");
149     private static final CLDRLocale KWASIO = CLDRLocale.getInstance("nmg");
150     private static final CLDRLocale HEBREW = CLDRLocale.getInstance("he");
151     private static final CLDRLocale MYANMAR = CLDRLocale.getInstance("my");
152     private static final CLDRLocale KYRGYZ = CLDRLocale.getInstance("ky");
153     private static final CLDRLocale URDU = CLDRLocale.getInstance("ur");
154     private static final CLDRLocale PASHTO = CLDRLocale.getInstance("ps");
155     private static final CLDRLocale FARSI = CLDRLocale.getInstance("fa");
156     private static final CLDRLocale GERMAN_SWITZERLAND = CLDRLocale.getInstance("de_CH");
157     private static final CLDRLocale SWISS_GERMAN = CLDRLocale.getInstance("gsw");
158     private static final CLDRLocale FF_ADLAM = CLDRLocale.getInstance("ff_Adlm");
159     public static final Set<String> LANGUAGES_USING_MODIFIER_APOSTROPHE = new HashSet<>(
160         Arrays.asList("br", "bss", "cad", "cic", "cch", "gn", "ha", "ha_Latn", "lkt", "mgo", "moh", "mus", "nnh", "qu", "quc", "uk", "uz", "uz_Latn"));
161 
162     // Ş ş Ţ ţ  =>  Ș ș Ț ț
163     private static final char[][] ROMANIAN_CONVERSIONS = {
164         { '\u015E', '\u0218' }, { '\u015F', '\u0219' }, { '\u0162', '\u021A' },
165         { '\u0163', '\u021B' } };
166 
167     private static final char[][] CATALAN_CONVERSIONS = {
168         { '\u013F', '\u004C', '\u00B7' }, // Ŀ -> L·
169         { '\u0140', '\u006C', '\u00B7' } }; // ŀ -> l·
170 
171     private static final char[][] NGOMBA_CONVERSIONS = {
172         { '\u0251', '\u0061' }, { '\u0261', '\u0067' }, //  ɑ -> a , ɡ -> g , See ticket #5691
173         { '\u2019', '\uA78C' }, { '\u02BC', '\uA78C' } }; //  Saltillo, see ticket #6805
174 
175     private static final char[][] KWASIO_CONVERSIONS = {
176         { '\u0306', '\u030C' }, // See ticket #6571, use caron instead of breve
177         { '\u0103', '\u01CE' }, { '\u0102', '\u01CD' }, // a-breve -> a-caron
178         { '\u0115', '\u011B' }, { '\u011A', '\u01CD' }, // e-breve -> e-caron
179         { '\u012D', '\u01D0' }, { '\u012C', '\u01CF' }, // i-breve -> i-caron
180         { '\u014F', '\u01D2' }, { '\u014E', '\u01D1' }, // o-breve -> o-caron
181         { '\u016D', '\u01D4' }, { '\u016C', '\u01D3' } // u-breve -> u-caron
182     };
183 
184     private static final char[][] HEBREW_CONVERSIONS = {
185         { '\'', '\u05F3' }, { '"', '\u05F4' } }; //  ' -> geresh  " -> gershayim
186 
187     private static final char[][] KYRGYZ_CONVERSIONS = {
188         { 'ӊ', 'ң' }, { 'Ӊ', 'Ң' } }; //  right modifier
189 
190     private static final char[][] URDU_PLUS_CONVERSIONS = {
191         { '\u0643', '\u06A9' }}; //  wrong char
192 
193     private static final ZawgyiDetector detector = new ZawgyiDetector();
194     private static final Transliterator zawgyiUnicodeTransliterator =
195         Transliterator.getInstance("Zawgyi-my");
196 
197     private Collator col;
198 
199     private Collator spaceCol;
200 
201     private UnicodeSetPrettyPrinter pp = null;
202 
203     final private CLDRLocale locale;
204     private String scriptCode; // actual or default script code (not null after init)
205     private boolean isPosix;
206 
/**
 * Constructor, taking a CLDRFile and a flag.
 *
 * @param cldrFileToCheck the file whose locale ID determines the locale of this processor
 * @param needsCollator passed through to init; true to set up the collators and pretty printer
 */
public DisplayAndInputProcessor(CLDRFile cldrFileToCheck, boolean needsCollator) {
    this.locale = CLDRLocale.getInstance(cldrFileToCheck.getLocaleID());
    init(this.locale, needsCollator);
}
215 
/**
 * Constructor, taking a CLDRFile; equivalent to passing needsCollator = true.
 *
 * @param cldrFileToCheck the file whose locale ID determines the locale of this processor
 */
public DisplayAndInputProcessor(CLDRFile cldrFileToCheck) {
    this(cldrFileToCheck, true /* needsCollator */);
}
219 
init(CLDRLocale locale, boolean needsCollator)220     void init(CLDRLocale locale, boolean needsCollator) {
221         isPosix = locale.toString().indexOf("POSIX") >= 0;
222         if (needsCollator) {
223             ICUServiceBuilder isb = null;
224             try {
225                 isb = ICUServiceBuilder.forLocale(locale);
226             } catch (Exception e) {
227             }
228 
229             if (isb != null) {
230                 try {
231                     col = isb.getRuleBasedCollator();
232                 } catch (Exception e) {
233                     col = Collator.getInstance(ULocale.ROOT);
234                 }
235             } else {
236                 col = Collator.getInstance(ULocale.ROOT);
237             }
238 
239             spaceCol = Collator.getInstance(locale.toULocale());
240             if (spaceCol instanceof RuleBasedCollator) {
241                 ((RuleBasedCollator) spaceCol).setAlternateHandlingShifted(false);
242             }
243             pp = new UnicodeSetPrettyPrinter().setOrdering(Collator.getInstance(ULocale.ROOT))
244                 .setSpaceComparator(Collator.getInstance(ULocale.ROOT).setStrength2(Collator.PRIMARY))
245                 .setCompressRanges(true)
246                 .setToQuote(new UnicodeSet(TO_QUOTE))
247                 .setOrdering(col)
248                 .setSpaceComparator(spaceCol);
249         }
250         String script = locale.getScript();
251         if (script == null || script.length() < 4) {
252             SupplementalDataInfo sdi = CLDRConfig.getInstance().getSupplementalDataInfo();
253             script = sdi.getDefaultScript(locale.getBaseName());
254             if (script == null || script.length() < 4 || script.equals("Zzzz")) {
255                 script = sdi.getDefaultScript(locale.getLanguage());
256             }
257             if (script == null || script.length() < 4) {
258                 script = "Zzzz";
259             }
260         }
261         scriptCode = script;
262     }
263 
getPrettyPrinter()264     public UnicodeSetPrettyPrinter getPrettyPrinter() {
265         return pp;
266     }
267 
/**
 * Constructor, taking ULocale and boolean.
 *
 * Called by getProcessor, with locale = SurveyMain.TRANS_HINT_LOCALE
 *
 * @param locale the ULocale, converted to a CLDRLocale
 * @param needsCollator true or false; passed through to init
 */
public DisplayAndInputProcessor(ULocale locale, boolean needsCollator) {
    this.locale = CLDRLocale.getInstance(locale);
    init(this.locale, needsCollator);
}
279 
/**
 * Constructor, taking ULocale; equivalent to passing needsCollator = true.
 *
 * @param locale the ULocale, converted to a CLDRLocale
 */
public DisplayAndInputProcessor(ULocale locale) {
    this(locale, true /* needsCollator */);
}
288 
/**
 * Constructor, taking CLDRLocale and boolean.
 *
 * @param locale the CLDRLocale, stored as the locale of this processor
 * @param needsCollator true or false; passed through to init
 */
public DisplayAndInputProcessor(CLDRLocale locale, boolean needsCollator) {
    this.locale = locale;
    init(this.locale, needsCollator);
}
298 
/**
 * Constructor, taking CLDRLocale; equivalent to passing needsCollator = true.
 *
 * @param locale the CLDRLocale, stored as the locale of this processor
 */
public DisplayAndInputProcessor(CLDRLocale locale) {
    this(locale, true /* needsCollator */);
}
307 
308     /**
309      * Process the value for display. The result is a string for display in the
310      * Survey tool or similar program.
311      *
312      * @param path
313      * @param value
314      * @return
315      */
processForDisplay(String path, String value)316     public synchronized String processForDisplay(String path, String value) {
317         value = Normalizer.compose(value, false); // Always normalize all text to NFC.
318         if (hasUnicodeSetValue(path)) {
319             value = displayUnicodeSet(value);
320         } else if (path.contains("stopword")) {
321             return value.trim().isEmpty() ? "NONE" : value;
322         } else {
323             NumericType numericType = NumericType.getNumericType(path);
324             if (numericType != NumericType.NOT_NUMERIC) {
325                 // Canonicalize existing values that aren't canonicalized yet.
326                 // New values will be canonicalized on input using processInput().
327                 try {
328                     value = getCanonicalPattern(value, numericType, isPosix);
329                 } catch (IllegalArgumentException e) {
330                     if (DEBUG_DAIP) System.err.println("Illegal pattern: " + value);
331                 }
332                 if (numericType != NumericType.CURRENCY && numericType != NumericType.CURRENCY_ABBREVIATED) {
333                     value = value.replace("'", "");
334                 }
335             }
336         }
337         // Fix up any apostrophes in number symbols
338         if (NUMBER_SEPARATOR_PATTERN.matcher(path).matches()) {
339             value = value.replace('\'', '\u2019');
340         }
341         // Fix up any apostrophes as appropriate (Don't do so for things like date patterns...
342         if (!APOSTROPHE_SKIP_PATHS.matcher(path).matches()) {
343             value = normalizeApostrophes(value);
344         }
345         // Fix up hyphens, replacing with N-dash as appropriate
346         if (INTERVAL_FORMAT_PATHS.matcher(path).matches()) {
347             value = normalizeIntervalHyphensAndSpaces(value); // This may also adjust spaces around en dash
348         } else {
349             value = normalizeHyphens(value);
350         }
351         return value;
352     }
353 
hasUnicodeSetValue(String path)354     private boolean hasUnicodeSetValue(String path) {
355         return path.startsWith("//ldml/characters/exemplarCharacters") || path.startsWith("//ldml/characters/parseLenients");
356     }
357 
358     static final UnicodeSet WHITESPACE = new UnicodeSet("[:whitespace:]").freeze();
359     static final DateTimeCanonicalizer dtc = new DateTimeCanonicalizer(FIX_YEARS);
360 
361     private static final String BAR_VL = "\\|"; // U+007C VERTICAL LINE (pipe, bar) literal
362     private static final String BAR_EL = "\\s+l\\s+"; // U+006C LATIN SMALL LETTER L with space
363     private static final String BAR_DANDA = "।"; // U+0964 DEVANAGARI DANDA
364     private static final String BAR_REGEX = "(" + BAR_VL + "|" + BAR_EL + "|" + BAR_DANDA + ")";
365     public static final Splitter SPLIT_BAR = Splitter.on(Pattern.compile(BAR_REGEX)).trimResults().omitEmptyStrings();
366     static final Splitter SPLIT_SPACE = Splitter.on(' ').trimResults().omitEmptyStrings();
367     static final Joiner JOIN_BAR = Joiner.on(" | ");
368     static final Joiner JOIN_SPACE = Joiner.on(' ');
369 
370     /**
371      * Process the value for input. The result is a cleaned-up value. For example,
372      * an exemplar set is modified to be in the normal format, and any missing [ ]
373      * are added (a common omission on entry). If there are any failures then the
374      * original value is returned, so that the proper error message can be given.
375      *
376      * @param path
377      * @param value
378      * @param internalException
379      * @return
380      */
processInput(String path, String value, Exception[] internalException)381     public synchronized String processInput(String path, String value, Exception[] internalException) {
382         String original = value;
383         value = stripProblematicControlCharacters(value);
384         value = Normalizer.compose(value, false); // Always normalize all input to NFC.
385         value = value.replace('\u00B5', '\u03BC'); // use the right Greek mu character
386 
387         if (internalException != null) {
388             internalException[0] = null;
389         }
390         // skip processing for inheritance marker
391         if (CldrUtility.INHERITANCE_MARKER.equals(value)) {
392             return value; // Reference: https://unicode.org/cldr/trac/ticket/11261
393         }
394         // for root annotations
395         if (CLDRLocale.ROOT.equals(locale) && path.contains("/annotations")) {
396             return value; // Reference: https://unicode.org/cldr/trac/ticket/11261
397         }
398 
399         try {
400             // Normalise Malayalam characters.
401             boolean isUnicodeSet = hasUnicodeSetValue(path);
402             if (locale.childOf(MALAYALAM)) {
403                 String newvalue = normalizeMalayalam(value);
404                 if (DEBUG_DAIP) System.out.println("DAIP: Normalized Malayalam '" + value + "' to '" + newvalue + "'");
405                 value = newvalue;
406             } else if (locale.childOf(ROMANIAN) && !isUnicodeSet) {
407                 value = standardizeRomanian(value);
408             } else if (locale.childOf(CATALAN) && !isUnicodeSet) {
409                 value = standardizeCatalan(value);
410             } else if (locale.childOf(NGOMBA) && !isUnicodeSet) {
411                 value = standardizeNgomba(value);
412             } else if (locale.childOf(KWASIO) && !isUnicodeSet) {
413                 value = standardizeKwasio(value);
414             } else if (locale.childOf(HEBREW) && !APOSTROPHE_SKIP_PATHS.matcher(path).matches()) {
415                 value = replaceChars(path, value, HEBREW_CONVERSIONS, false);
416             } else if ((locale.childOf(SWISS_GERMAN) || locale.childOf(GERMAN_SWITZERLAND)) && !isUnicodeSet) {
417                 value = standardizeSwissGerman(value);
418             } else if (locale.childOf(MYANMAR) && !isUnicodeSet) {
419                 value = standardizeMyanmar(value);
420             } else if (locale.childOf(KYRGYZ)) {
421                 value = replaceChars(path, value, KYRGYZ_CONVERSIONS, false);
422             } else if (locale.childOf(URDU) || locale.childOf(PASHTO) || locale.childOf(FARSI)) {
423                 value = replaceChars(path, value, URDU_PLUS_CONVERSIONS, true);
424             } else if (locale.childOf(FF_ADLAM) && !isUnicodeSet) {
425                 value = fixAdlamNasalization(value);
426             }
427 
428             if (UNICODE_WHITESPACE.containsSome(value)) {
429                 value = normalizeWhitespace(path, value);
430             }
431 
432             // all of our values should not have leading or trailing spaces, except insertBetween,
433             // foreignSpaceReplacement, and anything with built-in attribute xml:space="preserve"
434             if (!path.contains("/insertBetween") && !path.contains("/foreignSpaceReplacement") &&
435                 !path.contains("[@xml:space=\"preserve\"]") && !isUnicodeSet) {
436                 value = value.trim();
437             }
438 
439             // fix grouping separator if space
440             if (path.startsWith("//ldml/numbers/symbols") && !path.contains("/alias")) {
441                 if (value.isEmpty()) {
442                     value = "\u00A0";
443                 }
444                 value = value.replace(' ', '\u00A0');
445             }
446 
447             // fix date patterns
448             DateTimePatternType datetimePatternType = DateTimePatternType.fromPath(path);
449             if (DateTimePatternType.STOCK_AVAILABLE_INTERVAL_PATTERNS.contains(datetimePatternType)) {
450                 try {
451                     value = dtc.getCanonicalDatePattern(path, value, datetimePatternType);
452                 } catch (IllegalArgumentException ex) {
453                     return value;
454                 }
455             }
456 
457             if (path.startsWith("//ldml/numbers/currencies/currency") && path.contains("displayName")) {
458                 value = normalizeCurrencyDisplayName(value);
459             }
460             NumericType numericType = NumericType.getNumericType(path);
461             if (numericType != NumericType.NOT_NUMERIC) {
462                 if (numericType == NumericType.CURRENCY) {
463                     value = value.replaceAll(" ", "\u00A0");
464                     if (numericType == NumericType.CURRENCY_ABBREVIATED) {
465                         value = value.replaceAll("0\\.0+", "0");
466                     }
467                 } else {
468                     value = value.replaceAll("([%\u00A4]) ", "$1\u00A0")
469                         .replaceAll(" ([%\u00A4])", "\u00A0$1");
470                     value = replace(NON_DECIMAL_PERIOD, value, "'.'");
471                     if (numericType == NumericType.DECIMAL_ABBREVIATED) {
472                         value = value.replaceAll("0\\.0+", "0");
473                     }
474                 }
475                 value = getCanonicalPattern(value, numericType, isPosix);
476             }
477 
478             // fix [,]
479             if (path.startsWith("//ldml/localeDisplayNames/languages/language")
480                 || path.startsWith("//ldml/localeDisplayNames/scripts/script")
481                 || path.startsWith("//ldml/localeDisplayNames/territories/territory")
482                 || path.startsWith("//ldml/localeDisplayNames/variants/variant")
483                 || path.startsWith("//ldml/localeDisplayNames/keys/key")
484                 || path.startsWith("//ldml/localeDisplayNames/types/type")) {
485                 value = value.replace('[', '(').replace(']', ')').replace('[', '(').replace(']', ')');
486             }
487 
488             // Normalize two single quotes for the inches symbol.
489             if (path.contains("/units")) {
490                 value = value.replace("''", "″");
491             }
492 
493             // check specific cases
494             if (isUnicodeSet) {
495                 value = inputUnicodeSet(path, value);
496             } else if (path.contains("stopword")) {
497                 if (value.equals("NONE")) {
498                     value = "";
499                 }
500             }
501 
502             // Normalize ellipsis data.
503             if (path.startsWith("//ldml/characters/ellipsis")) {
504                 value = value.replace("...", "…");
505             }
506 
507             if (path.startsWith("//ldml/personNames/nameOrderLocales")) {
508                 value = normalizeNameOrderLocales(value);
509             }
510 
511             // Replace Arabic presentation forms with their nominal counterparts
512             value = replaceArabicPresentationForms(value);
513 
514             // Fix up any apostrophes as appropriate (Don't do so for things like date patterns...
515             if (!APOSTROPHE_SKIP_PATHS.matcher(path).matches()) {
516                 value = normalizeApostrophes(value);
517             }
518             // Fix up any apostrophes in number symbols
519             if (NUMBER_SEPARATOR_PATTERN.matcher(path).matches()) {
520                 value = value.replace('\'', '\u2019');
521             }
522             // Fix up hyphens, replacing with N-dash as appropriate
523             if (INTERVAL_FORMAT_PATHS.matcher(path).matches()) {
524                 value = normalizeIntervalHyphensAndSpaces(value); // This may also adjust spaces around en dash
525             } else if (!isUnicodeSet) {
526                 value = normalizeHyphens(value);
527             }
528 
529             if (AnnotationUtil.pathIsAnnotation(path)) {
530                 if (path.contains(Emoji.TYPE_TTS)) {
531                     // The row has something like "�� -name" in the first column. Cf. namePath, getNamePaths.
532                     // Normally the value is like "zebra" or "unicorn face", without "|".
533                     // If the user enters a value with "|",  discard anything after "|"; e.g., change "a | b | c" to "a".
534                     value = SPLIT_BAR.split(value).iterator().next();
535                 } else {
536                     // The row has something like "�� –keywords" in the first column. Cf. keywordPath, getKeywordPaths.
537                     // Normally the value is like "stripe | zebra", with "|".
538                     value = annotationsForDisplay(value);
539                 }
540             }
541             value = normalizeZeroWidthSpace(value);
542             return value;
543         } catch (RuntimeException e) {
544             if (internalException != null) {
545                 internalException[0] = e;
546             }
547             return original;
548         }
549     }
550 
normalizeNameOrderLocales(String value)551     private String normalizeNameOrderLocales(String value) {
552         TreeSet<String> result = new TreeSet<>(SPLIT_SPACE.splitToList(value));
553         result.remove("zxx");
554         if (result.remove("und")) { // put und at the front
555             if (result.isEmpty()) {
556                 return "und";
557             } else {
558                 return "und " + JOIN_SPACE.join(result);
559             }
560         }
561         return JOIN_SPACE.join(result);
562     }
563 
564     /**
565      * Strip out all code points less than U+0020 except for U+0009 tab,
566      * U+000A line feed, and U+000D carriage return.
567      *
568      * @param s the string
569      * @return the resulting string
570      */
stripProblematicControlCharacters(String s)571     private String stripProblematicControlCharacters(String s) {
572         if (s == null || s.isEmpty()) {
573             return s;
574         }
575         return s.codePoints()
576             .filter(c -> (c >= 0x20 || c == 9 || c == 0xA || c == 0xD))
577             .collect(StringBuilder::new, StringBuilder::appendCodePoint, StringBuilder::append)
578             .toString();
579     }
580 
581     private static final boolean REMOVE_COVERED_KEYWORDS = true;
582 
583     /**
584      * Produce a modification of the given annotation by sorting its components and filtering covered keywords.
585      *
586      * Examples: Given "b | a", return "a | b". Given "bear | panda | panda bear", return "bear | panda".
587      *
588      * @param value the string
589      * @return the possibly modified string
590      */
annotationsForDisplay(String value)591     private static String annotationsForDisplay(String value) {
592         TreeSet<String> sorted = new TreeSet<>(Collator.getInstance(ULocale.ROOT));
593         sorted.addAll(SPLIT_BAR.splitToList(value));
594         if (REMOVE_COVERED_KEYWORDS) {
595             filterCoveredKeywords(sorted);
596         }
597         value = JOIN_BAR.join(sorted);
598         return value;
599     }
600 
601     /**
602      * Filter from the given set some keywords that include spaces, if they duplicate,
603      * or are "covered by", other keywords in the set.
604      *
605      * For example, if the set is {"bear", "panda", "panda bear"} (annotation was "bear | panda | panda bear"),
606      * then remove "panda bear", treating it as "covered" since the set already includes "panda" and "bear".
607      *
608      * @param sorted the set from which items may be removed
609      */
filterCoveredKeywords(TreeSet<String> sorted)610     public static void filterCoveredKeywords(TreeSet<String> sorted) {
611         // for now, just do single items
612         HashSet<String> toRemove = new HashSet<>();
613 
614         for (String item : sorted) {
615             List<String> list = SPLIT_SPACE.splitToList(item);
616             if (list.size() < 2) {
617                 continue;
618             }
619             if (sorted.containsAll(list)) {
620                 toRemove.add(item);
621             }
622         }
623         sorted.removeAll(toRemove);
624     }
625 
displayUnicodeSet(String value)626     private String displayUnicodeSet(String value) {
627         if (value.startsWith("[") && value.endsWith("]")) {
628             value = value.substring(1, value.length() - 1);
629         }
630 
631         value = replace(NEEDS_QUOTE1, value, "$1\\\\$2$3");
632         value = replace(NEEDS_QUOTE2, value, "$1\\\\$2$3");
633 
634         // if (RTL.containsSome(value) && value.startsWith("[") && value.endsWith("]")) {
635         // return "\u200E[\u200E" + value.substring(1,value.length()-2) + "\u200E]\u200E";
636         // }
637         return value;
638     }
639 
inputUnicodeSet(String path, String value)640     private String inputUnicodeSet(String path, String value) {
641         // clean up the user's input.
642         // first, fix up the '['
643         value = value.trim();
644 
645         // remove brackets and trim again before regex
646         if (value.startsWith("[")) {
647             value = value.substring(1);
648         }
649         if (value.endsWith("]") && (!value.endsWith("\\]") || value.endsWith("\\\\]"))) {
650             value = value.substring(0, value.length() - 1);
651         }
652         value = value.trim();
653 
654         value = replace(NEEDS_QUOTE1, value, "$1\\\\$2$3");
655         value = replace(NEEDS_QUOTE2, value, "$1\\\\$2$3");
656 
657         // re-add brackets.
658         value = "[" + value + "]";
659 
660         UnicodeSet exemplar = new UnicodeSet(value);
661         XPathParts parts = XPathParts.getFrozenInstance(path);
662         if (parts.getElement(2).equals("parseLenients")) {
663             return exemplar.toPattern(false);
664         }
665         final String type = parts.getAttributeValue(-1, "type");
666         ExemplarType exemplarType = type == null ? ExemplarType.main : ExemplarType.valueOf(type);
667         value = getCleanedUnicodeSet(exemplar, pp, exemplarType);
668         return value;
669     }
670 
normalizeCurrencyDisplayName(String value)671     private String normalizeCurrencyDisplayName(String value) {
672         StringBuilder result = new StringBuilder();
673         boolean inParentheses = false;
674         for (int i = 0; i < value.length(); i++) {
675             char c = value.charAt(i);
676             if (c == '(') {
677                 inParentheses = true;
678             } else if (c == ')') {
679                 inParentheses = false;
680             }
681             if (inParentheses && c == '-' && Character.isDigit(value.charAt(i - 1))) {
682                 c = 0x2013; /* Replace hyphen-minus with dash for date ranges */
683             }
684             result.append(c);
685         }
686         return result.toString();
687     }
688 
normalizeApostrophes(String value)689     private String normalizeApostrophes(String value) {
690         // If our DAIP always had a CLDRFile to work with, then we could just check the exemplar set in it to see.
691         // But since we don't, we just maintain the list internally and use it.
692         if (LANGUAGES_USING_MODIFIER_APOSTROPHE.contains(locale.getLanguage())) {
693             return value.replace('\'', '\u02bc');
694         } else {
695             char prev = 0;
696             StringBuilder builder = new StringBuilder();
697             for (char c : value.toCharArray()) {
698                 if (c == '\'') {
699                     if (Character.isLetter(prev)) {
700                         builder.append('\u2019');
701                     } else {
702                         builder.append('\u2018');
703                     }
704                 } else {
705                     builder.append(c);
706                 }
707                 prev = c;
708             }
709             return builder.toString();
710         }
711     }
712 
normalizeIntervalHyphensAndSpaces(String value)713     private String normalizeIntervalHyphensAndSpaces(String value) {
714         if (value.indexOf("{0}") >= 0) {
715             // intervalFormatFallback pattern, not handled by DateTimePatternGenerator.FormatParser
716             if (scriptCode.equals("Latn")) {
717                 value = INTERVAL_FALLBACK_RANGE.matcher(value).replaceAll("}\u2009\u2013\u2009{");
718             }
719             return value;
720         }
721         DateTimePatternGenerator.FormatParser fp = new DateTimePatternGenerator.FormatParser();
722         fp.set(DateIntervalInfo.genPatternInfo(value, false).getFirstPart()); // first format & separator including spaces
723         List<Object> items = fp.getItems();
724         Object last = items.get(items.size() - 1);
725         if (last instanceof String) {
726             String separator = last.toString(); // separator including spaces
727             String replacement = separator;
728             if (scriptCode.equals("Latn") && (separator.equals(" - ") || separator.equals(" \u2013 "))) {
729                 replacement = "\u2009\u2013\u2009"; // Per CLDR-14032
730             } else if (separator.contains("-")) {
731                 replacement = separator.replace("-", "\u2013");
732             }
733             if (!replacement.equals(separator)) {
734                 StringBuilder sb = new StringBuilder();
735                 sb.append(DateIntervalInfo.genPatternInfo(value, false).getFirstPart());
736                 if (sb.lastIndexOf(separator) >= 0) {
737                     sb.delete(sb.lastIndexOf(separator), sb.length());
738                     sb.append(replacement);
739                     sb.append(DateIntervalInfo.genPatternInfo(value, false).getSecondPart()); // second format only
740                     return sb.toString();
741                 }
742             }
743         }
744         return value;
745     }
746 
normalizeHyphens(String value)747     private String normalizeHyphens(String value) {
748         int hyphenLocation = value.indexOf("-");
749         if (hyphenLocation > 0 &&
750             Character.isDigit(value.charAt(hyphenLocation - 1)) &&
751             hyphenLocation < value.length() - 1 &&
752             Character.isDigit(value.charAt(hyphenLocation + 1))) {
753             StringBuilder sb = new StringBuilder();
754             sb.append(value.substring(0, hyphenLocation));
755             sb.append("\u2013");
756             sb.append(value.substring(hyphenLocation + 1));
757             return sb.toString();
758         }
759         return value;
760     }
761 
standardizeRomanian(String value)762     private String standardizeRomanian(String value) {
763         StringBuilder builder = new StringBuilder();
764         for (char c : value.toCharArray()) {
765             for (char[] pair : ROMANIAN_CONVERSIONS) {
766                 if (c == pair[0]) {
767                     c = pair[1];
768                     break;
769                 }
770             }
771             builder.append(c);
772         }
773         return builder.toString();
774     }
775 
standardizeKwasio(String value)776     private String standardizeKwasio(String value) {
777         StringBuilder builder = new StringBuilder();
778         for (char c : value.toCharArray()) {
779             for (char[] pair : KWASIO_CONVERSIONS) {
780                 if (c == pair[0]) {
781                     c = pair[1];
782                     break;
783                 }
784             }
785             builder.append(c);
786         }
787         return builder.toString();
788     }
789 
790     // Use the myanmar-tools detector.
standardizeMyanmar(String value)791     private String standardizeMyanmar(String value) {
792         if (detector.getZawgyiProbability(value) > 0.90) {
793             return zawgyiUnicodeTransliterator.transform(value);
794         }
795         return value;
796     }
797 
standardizeNgomba(String value)798     private String standardizeNgomba(String value) {
799         StringBuilder builder = new StringBuilder();
800         char[] charArray = value.toCharArray();
801         for (int i = 0; i < charArray.length; i++) {
802             char c = charArray[i];
803             boolean convertedSaltillo = false;
804             for (char[] pair : NGOMBA_CONVERSIONS) {
805                 if (c == pair[0]) {
806                     c = pair[1];
807                     if (c == '\uA78C') {
808                         convertedSaltillo = true;
809                     }
810                     break;
811                 }
812             }
813             if (convertedSaltillo &&
814                 ((i > 0 && i < charArray.length - 1 && Character.isUpperCase(charArray[i - 1]) && Character.isUpperCase(charArray[i + 1])) ||
815                     (i > 1 && Character.isUpperCase(charArray[i - 1]) && Character.isUpperCase(charArray[i - 2])))) {
816                 c = '\uA78B'; // UPPER CASE SALTILLO
817             }
818             builder.append(c);
819         }
820         return builder.toString();
821     }
822 
replaceChars(String path, String value, char[][] charsToReplace, boolean skipAuxExemplars)823     private String replaceChars(String path, String value, char[][] charsToReplace, boolean skipAuxExemplars) {
824         if (skipAuxExemplars && path.contains("/exemplarCharacters[@type=\"auxiliary\"]")) {
825             return value;
826         }
827         StringBuilder builder = new StringBuilder();
828         for (char c : value.toCharArray()) {
829             for (char[] pair : charsToReplace) {
830                 if (c == pair[0]) {
831                     c = pair[1];
832                     break;
833                 }
834             }
835             builder.append(c);
836         }
837         return builder.toString();
838     }
839 
standardizeSwissGerman(String value)840     private String standardizeSwissGerman(String value) {
841         return value.replaceAll("\u00DF", "ss");
842     }
843 
standardizeCatalan(String value)844     private String standardizeCatalan(String value) {
845         StringBuilder builder = new StringBuilder();
846         for (char c : value.toCharArray()) {
847             boolean didSubstitute = false;
848             for (char[] triple : CATALAN_CONVERSIONS) {
849                 if (c == triple[0]) {
850                     builder.append(triple[1]);
851                     builder.append(triple[2]);
852                     didSubstitute = true;
853                     break;
854                 }
855             }
856             if (!didSubstitute) {
857                 builder.append(c);
858             }
859         }
860         return builder.toString();
861     }
862 
replace(Pattern pattern, String value, String replacement)863     private String replace(Pattern pattern, String value, String replacement) {
864         String value2 = pattern.matcher(value).replaceAll(replacement);
865         if (DEBUG_DAIP && !value.equals(value2)) {
866             System.out.println("\n" + value + " => " + value2);
867         }
868         return value2;
869     }
870 
871     private static Pattern UNNORMALIZED_MALAYALAM = PatternCache.get(
872         "(\u0D23|\u0D28|\u0D30|\u0D32|\u0D33|\u0D15)\u0D4D\u200D");
873 
874     private static Map<Character, Character> NORMALIZING_MAP = Builder.with(new HashMap<Character, Character>())
875         .put('\u0D23', '\u0D7A').put('\u0D28', '\u0D7B')
876         .put('\u0D30', '\u0D7C').put('\u0D32', '\u0D7D')
877         .put('\u0D33', '\u0D7E').put('\u0D15', '\u0D7F').get();
878 
879     /**
880      * Normalizes the Malayalam characters in the specified input.
881      *
882      * @param value
883      *            the input to be normalized
884      * @return
885      */
normalizeMalayalam(String value)886     private String normalizeMalayalam(String value) {
887         // Normalize Malayalam characters.
888         Matcher matcher = UNNORMALIZED_MALAYALAM.matcher(value);
889         if (matcher.find()) {
890             StringBuffer buffer = new StringBuffer();
891             int start = 0;
892             do {
893                 buffer.append(value.substring(start, matcher.start(0)));
894                 char codePoint = matcher.group(1).charAt(0);
895                 buffer.append(NORMALIZING_MAP.get(codePoint));
896                 start = matcher.end(0);
897             } while (matcher.find());
898             buffer.append(value.substring(start));
899             value = buffer.toString();
900         }
901         return value;
902     }
903 
904     static final Transform<String, String> fixArabicPresentation = Transliterator.getInstance(
905         "[[:block=Arabic_Presentation_Forms_A:][:block=Arabic_Presentation_Forms_B:]] nfkc");
906 
907     /**
908      * Normalizes the Arabic presentation forms characters in the specified input.
909      *
910      * @param value
911      *            the input to be normalized
912      * @return
913      */
replaceArabicPresentationForms(String value)914     private String replaceArabicPresentationForms(String value) {
915         value = fixArabicPresentation.transform(value);
916         return value;
917     }
918 
919     static Pattern ADLAM_MISNASALIZED = PatternCache.get("([����])['’‘]([����������������])");
920     public static String ADLAM_NASALIZATION = "��"; // U+1E94B (Unicode 12.0)
921 
fixAdlamNasalization(String fromString)922     public static String fixAdlamNasalization(String fromString) {
923         return ADLAM_MISNASALIZED.matcher(fromString)
924         .replaceAll("$1"+ADLAM_NASALIZATION+"$2");  // replace quote with ��
925     }
926 
927     static Pattern REMOVE_QUOTE1 = PatternCache.get("(\\s)(\\\\[-\\}\\]\\&])()");
928     static Pattern REMOVE_QUOTE2 = PatternCache.get("(\\\\[\\-\\{\\[\\&])(\\s)"); // ([^\\])([\\-\\{\\[])(\\s)
929 
930     static Pattern NEEDS_QUOTE1 = PatternCache.get("(\\s|$)([-\\}\\]\\&])()");
931     static Pattern NEEDS_QUOTE2 = PatternCache.get("([^\\\\])([\\-\\{\\[\\&])(\\s)"); // ([^\\])([\\-\\{\\[])(\\s)
932 
getCleanedUnicodeSet(UnicodeSet exemplar, UnicodeSetPrettyPrinter prettyPrinter, ExemplarType exemplarType)933     public static String getCleanedUnicodeSet(UnicodeSet exemplar, UnicodeSetPrettyPrinter prettyPrinter,
934         ExemplarType exemplarType) {
935         if (prettyPrinter == null) {
936             return exemplar.toPattern(false);
937         }
938         String value;
939         prettyPrinter.setCompressRanges(exemplar.size() > 300);
940         value = exemplar.toPattern(false);
941         UnicodeSet toAdd = new UnicodeSet();
942 
943         for (UnicodeSetIterator usi = new UnicodeSetIterator(exemplar); usi.next();) {
944             String string = usi.getString();
945             if (string.equals("ß") || string.equals("İ")) {
946                 toAdd.add(string);
947                 continue;
948             }
949             switch (string) {
950             case "\u2011": toAdd.add("-"); break; // nobreak hyphen
951             case "-": toAdd.add("\u2011"); break; // nobreak hyphen
952 
953             case " ": toAdd.add("\u00a0"); break; // nobreak space
954             case "\u00a0": toAdd.add(" "); break; // nobreak space
955 
956             case "\u202F": toAdd.add("\u2009"); break; // nobreak narrow space
957             case "\u2009": toAdd.add("\u202F"); break; // nobreak narrow space
958             }
959             if (exemplarType.convertUppercase) {
960                 string = UCharacter.toLowerCase(ULocale.ENGLISH, string);
961             }
962             toAdd.add(string);
963             String composed = Normalizer.compose(string, false);
964             if (!string.equals(composed)) {
965                 toAdd.add(composed);
966             }
967         }
968 
969         toAdd.removeAll(exemplarType.toRemove);
970 
971         if (DEBUG_DAIP && !toAdd.equals(exemplar)) {
972             UnicodeSet oldOnly = new UnicodeSet(exemplar).removeAll(toAdd);
973             UnicodeSet newOnly = new UnicodeSet(toAdd).removeAll(exemplar);
974             System.out.println("Exemplar:\t" + exemplarType + ",\tremoved\t" + oldOnly + ",\tadded\t" + newOnly);
975         }
976 
977         String fixedExemplar = prettyPrinter.format(toAdd);
978         UnicodeSet doubleCheck = new UnicodeSet(fixedExemplar);
979         if (!toAdd.equals(doubleCheck)) {
980             // something went wrong, leave as is
981         } else if (!value.equals(fixedExemplar)) { // put in this condition just for debugging
982             if (DEBUG_DAIP) {
983                 System.out.println(TestMetadata.showDifference(
984                     With.codePoints(value),
985                     With.codePoints(fixedExemplar),
986                     "\n"));
987             }
988             value = fixedExemplar;
989         }
990         return value;
991     }
992 
993     /**
994      * @return a canonical numeric pattern, based on the type, and the isPOSIX flag. The latter is set for en_US_POSIX.
995      */
996     static final Splitter SEMI_SPLITTER = Splitter.on(';').trimResults();
997 
getCanonicalPattern(String inpattern, NumericType type, boolean isPOSIX)998     public static String getCanonicalPattern(String inpattern, NumericType type, boolean isPOSIX) {
999         // TODO fix later to properly handle quoted ;
1000 
1001         DecimalFormat df = new DecimalFormat(inpattern);
1002         if (type == NumericType.DECIMAL_ABBREVIATED || type == NumericType.CURRENCY_ABBREVIATED
1003             || CldrUtility.INHERITANCE_MARKER.equals(inpattern)) {
1004             return inpattern; // TODO fix when ICU bug is fixed
1005             // df.setMaximumFractionDigits(df.getMinimumFractionDigits());
1006             // df.setMaximumIntegerDigits(Math.max(1, df.getMinimumIntegerDigits()));
1007         } else {
1008             // int decimals = type == CURRENCY_TYPE ? 2 : 1;
1009             int[] digits = isPOSIX ? type.posixDigitCount : type.digitCount;
1010             df.setMinimumIntegerDigits(digits[0]);
1011             df.setMinimumFractionDigits(digits[1]);
1012             df.setMaximumFractionDigits(digits[2]);
1013         }
1014         String pattern = df.toPattern();
1015         List<String> parts = SEMI_SPLITTER.splitToList(pattern);
1016         String pattern2 = parts.get(0);
1017         if (parts.size() > 1) {
1018             pattern2 += ";" + parts.get(1);
1019         }
1020         if (!pattern2.equals(pattern)) {
1021             pattern = pattern2;
1022         }
1023         // int pos = pattern.indexOf(';');
1024         // if (pos < 0) return pattern + ";-" + pattern;
1025         return pattern;
1026     }
1027 
/**
 * Classifies numeric patterns by the xpath they live at.
 *
 * Each non-abbreviated numeric type carries two digit-count triples, one regular and one for
 * POSIX. getCanonicalPattern reads each triple as {minimum integer digits, minimum fraction
 * digits, maximum fraction digits}. The abbreviated types and NOT_NUMERIC carry no digit
 * counts (the arrays are null).
 */
public enum NumericType {
    CURRENCY(new int[] { 1, 2, 2 }, new int[] { 1, 2, 2 }),
    CURRENCY_ABBREVIATED(),
    DECIMAL(new int[] { 1, 0, 3 }, new int[] { 1, 0, 6 }),
    DECIMAL_ABBREVIATED(),
    PERCENT(new int[] { 1, 0, 0 }, new int[] { 1, 0, 0 }),
    SCIENTIFIC(new int[] { 0, 0, 0 }, new int[] { 1, 6, 6 }),
    NOT_NUMERIC;

    private static final Pattern NUMBER_PATH = Pattern
        .compile("//ldml/numbers/((currency|decimal|percent|scientific)Formats|currencies/currency).*");
    private final int[] digitCount;
    private final int[] posixDigitCount;

    private NumericType() {
        this(null, null);
    }

    private NumericType(int[] digitCount, int[] posixDigitCount) {
        this.digitCount = digitCount;
        this.posixDigitCount = posixDigitCount;
    }

    /**
     * Determines the numeric type of an xpath.
     *
     * Paths without "/pattern", or not under one of the recognized number elements, are
     * NOT_NUMERIC. Paths under currencies/currency are CURRENCY. Otherwise the type comes
     * from the element name (currency, decimal, percent or scientific); if the path also
     * contains {@code ="1000} the decimal and currency types become their abbreviated
     * variants.
     *
     * @param xpath the path to classify
     * @return the numeric type of the xpath
     * @throws IllegalArgumentException if a percent or scientific path contains {@code ="1000}
     */
    public static NumericType getNumericType(String xpath) {
        if (xpath.indexOf("/pattern") < 0) {
            return NOT_NUMERIC;
        }
        final Matcher matcher = NUMBER_PATH.matcher(xpath);
        if (!matcher.matches()) {
            return NOT_NUMERIC;
        }
        if (matcher.group(1).equals("currencies/currency")) {
            return CURRENCY;
        }
        // Locale.ROOT: the default-locale upper-casing of "decimal" under a Turkish locale
        // yields a dotted capital I, which is not the name of any constant.
        NumericType type = NumericType.valueOf(matcher.group(2).toUpperCase(java.util.Locale.ROOT));
        if (xpath.contains("=\"1000")) {
            if (type == DECIMAL) {
                type = DECIMAL_ABBREVIATED;
            } else if (type == CURRENCY) {
                type = CURRENCY_ABBREVIATED;
            } else {
                throw new IllegalArgumentException("Internal Error");
            }
        }
        return type;
    }

    /**
     * @return the regular digit counts, or null for types that have none; the array is the
     *         enum's own instance, not a copy
     */
    public int[] getDigitCount() {
        return digitCount;
    }

    /**
     * @return the POSIX digit counts, or null for types that have none; the array is the
     *         enum's own instance, not a copy
     */
    public int[] getPosixDigitCount() {
        return posixDigitCount;
    }
}
1085 
1086     /**
1087      * Turn all whitespace sequences (including tab and newline, and NBSP for certain paths)
1088      * into a single space or a single NBSP depending on path.
1089      * Also trim initial/final NBSP, unless the value is only the one character, "\u00A0"
1090      *
1091      * @param path
1092      * @param value
1093      * @return the normalized value
1094      */
normalizeWhitespace(String path, String value)1095     private String normalizeWhitespace(String path, String value) {
1096         PathSpaceType pst = PathSpaceType.get(path);
1097         if (pst == PathSpaceType.allowSp) {
1098             value = WHITESPACE_AND_NBSP_TO_NORMALIZE.matcher(value).replaceAll(" "); // replace with regular space
1099         } else if (pst == PathSpaceType.allowNbsp) {
1100             value = WHITESPACE_AND_NBSP_TO_NORMALIZE.matcher(value).replaceAll("\u00A0"); // replace with NBSP
1101             value = trimNBSP(value);
1102         } else if (pst == PathSpaceType.allowNNbsp) {
1103             value = WHITESPACE_AND_NBSP_TO_NORMALIZE.matcher(value).replaceAll("\u202F"); // replace with NNBSP
1104             value = trimNBSP(value);
1105         } else if (pst == PathSpaceType.allowSpOrNbsp) {
1106             /*
1107              * in this case don't normalize away NBSP
1108              */
1109             value = WHITESPACE_NO_NBSP_TO_NORMALIZE.matcher(value).replaceAll(" "); // replace with regular space
1110             /*
1111              * if any NBSP and regular space are adjacent, replace with NBSP
1112              */
1113             value = NBSP_PLUS_SPACE_TO_NORMALIZE.matcher(value).replaceAll("\u00A0");
1114             value = SPACE_PLUS_NBSP_TO_NORMALIZE.matcher(value).replaceAll("\u00A0");
1115             value = MULTIPLE_NBSP.matcher(value).replaceAll("\u00A0");
1116             value = trimNBSP(value);
1117         } else {
1118             throw new IllegalArgumentException("Unknown PathSpaceType " + pst);
1119         }
1120 
1121         // Further whitespace adjustments per CLDR-14032
1122         if ((scriptCode.equals("Latn") || scriptCode.equals("Cyrl") || scriptCode.equals("Grek")) &&
1123                 HOUR_FORMAT_XPATHS.matcher(path).matches()) {
1124             String test = AMPM_SPACE_BEFORE.matcher(value).replaceAll("$1$2"); // value without a+
1125             if (value.length() - test.length() != 4) { // exclude patterns with aaaa
1126                 value = AMPM_SPACE_BEFORE.matcher(value).replaceAll("$1\u202F$3");
1127             }
1128             test = AMPM_SPACE_AFTER.matcher(value).replaceAll("$2$3"); // value without a+
1129             if (value.length() - test.length() != 4) { // exclude patterns with aaaa
1130                 value = AMPM_SPACE_AFTER.matcher(value).replaceAll("$1\u202F$3");
1131             }
1132         }
1133         if (scriptCode.equals("Cyrl") && YEAR_FORMAT_XPATHS.matcher(path).matches()) {
1134             value = YEAR_SPACE_YEARMARKER.matcher(value).replaceAll("y\u202F$1");
1135         }
1136         if (UNIT_NARROW_XPATHS.matcher(path).matches()) {
1137             value = PLACEHOLDER_SPACE_AFTER.matcher(value).replaceAll("}\u202F"); // Narrow NBSP
1138             value = PLACEHOLDER_SPACE_BEFORE.matcher(value).replaceAll("\u202F{");
1139         }
1140         if (UNIT_SHORT_XPATHS.matcher(path).matches()) {
1141             value = PLACEHOLDER_SPACE_AFTER.matcher(value).replaceAll("}\u00A0"); // Regular NBSP
1142             value = PLACEHOLDER_SPACE_BEFORE.matcher(value).replaceAll("\u00A0{");
1143         }
1144 
1145         return value;
1146     }
1147 
1148     /**
1149      * Delete any initial or final NBSP, unless the value is just NBSP
1150      *
1151      * @param value
1152      * @return the trimmed value
1153      */
trimNBSP(String value)1154     private String trimNBSP(String value) {
1155         if (!value.equals("\u00A0") && !value.equals("\u202F")) {
1156             value = INITIAL_NBSP.matcher(value).replaceAll("");
1157             value = FINAL_NBSP.matcher(value).replaceAll("");
1158         }
1159         return value;
1160     }
1161 
/**
 * Categorize xpaths according to whether they allow space, NBSP, narrow NBSP, or both space
 * and NBSP.
 */
public enum PathSpaceType {
    allowSp, allowNbsp, allowNNbsp, allowSpOrNbsp;

    /**
     * Classifies a path. The checks are ordered: regular space first, then NBSP, then narrow
     * NBSP; paths matching none of them allow both space and NBSP.
     *
     * @param path the xpath to classify
     * @return the kind of space the path allows
     */
    public static PathSpaceType get(String path) {
        if (wantsRegularSpace(path)) {
            return allowSp;
        }
        if (wantsNBSP(path)) {
            return allowNbsp;
        }
        if (wantsNNBSP(path)) {
            return allowNNbsp;
        }
        return allowSpOrNbsp;
    }

    /**
     * @return true for paths whose values should only contain regular spaces
     */
    private static boolean wantsRegularSpace(String path) {
        final boolean hasPattern = path.contains("/pattern");
        // NOTE(review): the unitPattern test below includes a trailing space, so paths of the
        // form ".../unitPattern[...]" do not match it -- confirm that this is intended.
        return (path.contains("/dateFormatLength") && hasPattern)
            || path.contains("/availableFormats/dateFormatItem")
            || (path.startsWith("//ldml/dates/timeZoneNames/metazone") && path.contains("/long"))
            || path.startsWith("//ldml/dates/timeZoneNames/regionFormat")
            || path.startsWith("//ldml/localeDisplayNames/codePatterns/codePattern")
            || path.startsWith("//ldml/localeDisplayNames/languages/language")
            || path.startsWith("//ldml/localeDisplayNames/territories/territory")
            || path.startsWith("//ldml/localeDisplayNames/types/type")
            || (path.startsWith("//ldml/numbers/currencies/currency") && path.contains("/displayName"))
            || (path.contains("/decimalFormatLength[@type=\"long\"]") && hasPattern)
            || path.startsWith("//ldml/posix/messages")
            || (path.startsWith("//ldml/units/uni") && path.contains("/unitPattern "));
    }

    /**
     * @return true for paths whose values should only contain NBSP
     */
    private static boolean wantsNBSP(String path) {
        final boolean hasPattern = path.contains("/pattern");
        final boolean hasGroup = path.contains("/group");
        return (path.contains("/currencies/currency") && (hasGroup || hasPattern))
            || (path.contains("/currencyFormatLength") && hasPattern)
            || (path.contains("/currencySpacing") && path.contains("/insertBetween"))
            || (path.contains("/decimalFormatLength") && hasPattern) // i.e. the non-long ones
            || (path.contains("/percentFormatLength") && hasPattern)
            || (path.startsWith("//ldml/numbers/symbols") && (hasGroup || path.contains("/nan")));
    }

    /**
     * @return true for paths whose values should only contain narrow NBSP: abbreviated or
     *         narrow am/pm day periods
     */
    private static boolean wantsNNBSP(String path) {
        final boolean shortWidth = path.contains("/dayPeriodWidth[@type=\"abbreviated\"]")
            || path.contains("/dayPeriodWidth[@type=\"narrow\"]");
        final boolean amOrPm = path.contains("/dayPeriod[@type=\"am\"]")
            || path.contains("/dayPeriod[@type=\"pm\"]");
        return shortWidth && amOrPm;
    }
}
1218 
1219     private static final Pattern ZERO_WIDTH_SPACES = PatternCache.get("\\u200B+");
1220     private static final Set<String> LOCALES_NOT_ALLOWING_ZWS = new HashSet<>(Arrays.asList("da", "fr"));
1221 
1222     /**
1223      * Remove occurrences of U+200B ZERO_WIDTH_SPACE under certain conditions
1224      *
1225      * @param value the value to be normalized
1226      * @return the normalized value
1227      *
1228      * TODO: extend this method to address more concerns, after clarifying the conditions
1229      *   - enlarge the set LOCALES_NOT_ALLOWING_ZWS?
1230      *   - strip initial and final ZWS in all locales?
1231      *   - reduce two or more adjacent ZWS to one ZWS?
1232      *   - allow or prohibit ZWS by itself as currency symbol, as currently in locales kea, pt_CV, pt_PT
1233      *   - allow or prohibit ZWS preceding URL as in "as per [U+200B]http://unicode.org/repos/cldr/trunk/specs/ldml/tr35-general.html#Annotations"
1234      * Reference: https://unicode-org.atlassian.net/browse/CLDR-15976
1235      */
normalizeZeroWidthSpace(String value)1236     private String normalizeZeroWidthSpace(String value) {
1237         if (ZERO_WIDTH_SPACES.matcher(value).find()) {
1238             final String localeId = locale.getBaseName();
1239             if (LOCALES_NOT_ALLOWING_ZWS.contains(localeId)) {
1240                 value = ZERO_WIDTH_SPACES.matcher(value).replaceAll("");
1241             }
1242         }
1243         return value;
1244     }
1245 }
1246