• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* Copyright (C) 2007-2013 Google and others.  All Rights Reserved. */
2 /* Copyright (C) 2007-2013 IBM Corp. and others. All Rights Reserved. */
3 
4 package org.unicode.cldr.test;
5 
6 import java.util.Arrays;
7 import java.util.HashMap;
8 import java.util.HashSet;
9 import java.util.List;
10 import java.util.Map;
11 import java.util.Set;
12 import java.util.TreeSet;
13 import java.util.regex.Matcher;
14 import java.util.regex.Pattern;
15 
16 import org.unicode.cldr.test.CheckExemplars.ExemplarType;
17 import org.unicode.cldr.util.Builder;
18 import org.unicode.cldr.util.CLDRFile;
19 import org.unicode.cldr.util.CLDRLocale;
20 import org.unicode.cldr.util.CldrUtility;
21 import org.unicode.cldr.util.DateTimeCanonicalizer;
22 import org.unicode.cldr.util.DateTimeCanonicalizer.DateTimePatternType;
23 import org.unicode.cldr.util.Emoji;
24 import org.unicode.cldr.util.ICUServiceBuilder;
25 import org.unicode.cldr.util.PatternCache;
26 import org.unicode.cldr.util.UnicodeSetPrettyPrinter;
27 import org.unicode.cldr.util.With;
28 import org.unicode.cldr.util.XPathParts;
29 
30 import com.google.common.base.Joiner;
31 import com.google.common.base.Splitter;
32 import com.google.myanmartools.ZawgyiDetector;
33 import com.ibm.icu.lang.UCharacter;
34 import com.ibm.icu.text.Collator;
35 import com.ibm.icu.text.DateIntervalInfo;
36 import com.ibm.icu.text.DateTimePatternGenerator;
37 import com.ibm.icu.text.DecimalFormat;
38 import com.ibm.icu.text.Normalizer;
39 import com.ibm.icu.text.RuleBasedCollator;
40 import com.ibm.icu.text.Transform;
41 import com.ibm.icu.text.Transliterator;
42 import com.ibm.icu.text.UnicodeSet;
43 import com.ibm.icu.text.UnicodeSetIterator;
44 import com.ibm.icu.util.ULocale;
45 
46 /**
47  * Class for processing the input and output of CLDR data for use in the
48  * Survey Tool and other tools.
49  */
50 public class DisplayAndInputProcessor {
51 
52     private static final boolean FIX_YEARS = true;
53 
54     public static final boolean DEBUG_DAIP = CldrUtility.getProperty("DEBUG_DAIP", false);
55 
56     public static final UnicodeSet RTL = new UnicodeSet("[[:Bidi_Class=Arabic_Letter:][:Bidi_Class=Right_To_Left:]]")
57         .freeze();
58 
59     public static final UnicodeSet TO_QUOTE = (UnicodeSet) new UnicodeSet(
60         "[[:Cn:]" +
61             "[:Default_Ignorable_Code_Point:]" +
62             "[:patternwhitespace:]" +
63             "[:Me:][:Mn:]]" // add non-spacing marks
64     ).freeze();
65 
66     public static final Pattern NUMBER_FORMAT_XPATH = Pattern
67         .compile("//ldml/numbers/.*Format\\[@type=\"standard\"]/pattern.*");
68 
69     public static final Pattern NUMBER_SEPARATOR_PATTERN = Pattern
70         .compile("//ldml/numbers/symbols.*/(decimal|group)");
71 
72     private static final Pattern APOSTROPHE_SKIP_PATHS = PatternCache.get("//ldml/("
73         + "localeDisplayNames/languages/language\\[@type=\"mic\"].*|"
74         + "characters/.*|"
75         + "delimiters/.*|"
76         + "dates/.+/(pattern|intervalFormatItem|dateFormatItem).*|"
77         + "units/.+/unitPattern.*|"
78         + "units/.+/durationUnitPattern.*|"
79         + "numbers/symbols.*|"
80         + "numbers/miscPatterns.*|"
81         + "numbers/(decimal|currency|percent|scientific)Formats.+/(decimal|currency|percent|scientific)Format.*)");
82     private static final Pattern INTERVAL_FORMAT_PATHS = PatternCache.get("//ldml/dates/.+/intervalFormatItem.*");
83     private static final Pattern NON_DECIMAL_PERIOD = PatternCache.get("(?<![0#'])\\.(?![0#'])");
84     private static final Pattern WHITESPACE_NO_NBSP_TO_NORMALIZE = PatternCache.get("\\s+"); // string of whitespace not
85     // including NBSP, i.e. [
86     // \t\n\r]+
87     private static final Pattern WHITESPACE_AND_NBSP_TO_NORMALIZE = PatternCache.get("[\\s\\u00A0]+"); // string of
88     // whitespace
89     // including NBSP,
90     // i.e. [
91     // \u00A0\t\n\r]+
92     private static final UnicodeSet UNICODE_WHITESPACE = new UnicodeSet("[:whitespace:]").freeze();
93 
94     private static final CLDRLocale MALAYALAM = CLDRLocale.getInstance("ml");
95     private static final CLDRLocale ROMANIAN = CLDRLocale.getInstance("ro");
96     private static final CLDRLocale CATALAN = CLDRLocale.getInstance("ca");
97     private static final CLDRLocale NGOMBA = CLDRLocale.getInstance("jgo");
98     private static final CLDRLocale KWASIO = CLDRLocale.getInstance("nmg");
99     private static final CLDRLocale HEBREW = CLDRLocale.getInstance("he");
100     private static final CLDRLocale MYANMAR = CLDRLocale.getInstance("my");
101     private static final CLDRLocale KYRGYZ = CLDRLocale.getInstance("ky");
102     private static final CLDRLocale URDU = CLDRLocale.getInstance("ur");
103     private static final CLDRLocale PASHTO = CLDRLocale.getInstance("ps");
104     private static final CLDRLocale FARSI = CLDRLocale.getInstance("fa");
105     private static final CLDRLocale GERMAN_SWITZERLAND = CLDRLocale.getInstance("de_CH");
106     private static final CLDRLocale SWISS_GERMAN = CLDRLocale.getInstance("gsw");
107     public static final Set<String> LANGUAGES_USING_MODIFIER_APOSTROPHE = new HashSet<String>(
108         Arrays.asList("br", "bss", "cch", "gn", "ha", "ha_Latn", "lkt", "mgo", "moh", "nnh", "qu", "quc", "uk", "uz", "uz_Latn"));
109 
110     // Ş ş Ţ ţ  =>  Ș ș Ț ț
111     private static final char[][] ROMANIAN_CONVERSIONS = {
112         { '\u015E', '\u0218' }, { '\u015F', '\u0219' }, { '\u0162', '\u021A' },
113         { '\u0163', '\u021B' } };
114 
115     private static final char[][] CATALAN_CONVERSIONS = {
116         { '\u013F', '\u004C', '\u00B7' }, // Ŀ -> L·
117         { '\u0140', '\u006C', '\u00B7' } }; // ŀ -> l·
118 
119     private static final char[][] NGOMBA_CONVERSIONS = {
120         { '\u0251', '\u0061' }, { '\u0261', '\u0067' }, //  ɑ -> a , ɡ -> g , See ticket #5691
121         { '\u2019', '\uA78C' }, { '\u02BC', '\uA78C' } }; //  Saltillo, see ticket #6805
122 
123     private static final char[][] KWASIO_CONVERSIONS = {
124         { '\u0306', '\u030C' }, // See ticket #6571, use caron instead of breve
125         { '\u0103', '\u01CE' }, { '\u0102', '\u01CD' }, // a-breve -> a-caron
126         { '\u0115', '\u011B' }, { '\u011A', '\u01CD' }, // e-breve -> e-caron
127         { '\u012D', '\u01D0' }, { '\u012C', '\u01CF' }, // i-breve -> i-caron
128         { '\u014F', '\u01D2' }, { '\u014E', '\u01D1' }, // o-breve -> o-caron
129         { '\u016D', '\u01D4' }, { '\u016C', '\u01D3' } // u-breve -> u-caron
130     };
131 
132     private static final char[][] HEBREW_CONVERSIONS = {
133         { '\'', '\u05F3' }, { '"', '\u05F4' } }; //  ' -> geresh  " -> gershayim
134 
135     private static final char[][] KYRGYZ_CONVERSIONS = {
136         { 'ӊ', 'ң' }, { 'Ӊ', 'Ң' } }; //  right modifier
137 
138     private static final char[][] URDU_PLUS_CONVERSIONS = {
139         { '\u0643', '\u06A9' }}; //  wrong char
140 
141     private static final ZawgyiDetector detector = new ZawgyiDetector();
142     private static final Transliterator zawgyiUnicodeTransliterator =
143         Transliterator.getInstance("Zawgyi-my");
144 
145     private Collator col;
146 
147     private Collator spaceCol;
148 
149     private UnicodeSetPrettyPrinter pp = null;
150 
151     final private CLDRLocale locale;
152     private boolean isPosix;
153 
154     /**
155      * Constructor, taking cldrFile.
156      *
157      * @param cldrFileToCheck
158      */
DisplayAndInputProcessor(CLDRFile cldrFileToCheck, boolean needsCollator)159     public DisplayAndInputProcessor(CLDRFile cldrFileToCheck, boolean needsCollator) {
160         init(this.locale = CLDRLocale.getInstance(cldrFileToCheck.getLocaleID()), needsCollator);
161     }
162 
DisplayAndInputProcessor(CLDRFile cldrFileToCheck)163     public DisplayAndInputProcessor(CLDRFile cldrFileToCheck) {
164         init(this.locale = CLDRLocale.getInstance(cldrFileToCheck.getLocaleID()), true);
165     }
166 
init(CLDRLocale locale, boolean needsCollator)167     void init(CLDRLocale locale, boolean needsCollator) {
168         isPosix = locale.toString().indexOf("POSIX") >= 0;
169         if (needsCollator) {
170             ICUServiceBuilder isb = null;
171             try {
172                 isb = ICUServiceBuilder.forLocale(locale);
173             } catch (Exception e) {
174             }
175 
176             if (isb != null) {
177                 try {
178                     col = isb.getRuleBasedCollator();
179                 } catch (Exception e) {
180                     col = Collator.getInstance(ULocale.ROOT);
181                 }
182             } else {
183                 col = Collator.getInstance(ULocale.ROOT);
184             }
185 
186             spaceCol = Collator.getInstance(locale.toULocale());
187             if (spaceCol instanceof RuleBasedCollator) {
188                 ((RuleBasedCollator) spaceCol).setAlternateHandlingShifted(false);
189             }
190             pp = new UnicodeSetPrettyPrinter().setOrdering(Collator.getInstance(ULocale.ROOT))
191                 .setSpaceComparator(Collator.getInstance(ULocale.ROOT).setStrength2(Collator.PRIMARY))
192                 .setCompressRanges(true)
193                 .setToQuote(new UnicodeSet(TO_QUOTE))
194                 .setOrdering(col)
195                 .setSpaceComparator(spaceCol);
196         }
197     }
198 
getPrettyPrinter()199     public UnicodeSetPrettyPrinter getPrettyPrinter() {
200         return pp;
201     }
202 
203     /**
204      * Constructor, taking locale.
205      *
206      * @param locale
207      */
DisplayAndInputProcessor(ULocale locale, boolean needsCollator)208     public DisplayAndInputProcessor(ULocale locale, boolean needsCollator) {
209         init(this.locale = CLDRLocale.getInstance(locale), needsCollator);
210     }
211 
212     /**
213      * Constructor, taking locale.
214      *
215      * @param locale
216      */
DisplayAndInputProcessor(ULocale locale)217     public DisplayAndInputProcessor(ULocale locale) {
218         init(this.locale = CLDRLocale.getInstance(locale), true);
219     }
220 
221     /**
222      * Constructor, taking locale.
223      *
224      * @param locale
225      */
DisplayAndInputProcessor(CLDRLocale locale, boolean needsCollator)226     public DisplayAndInputProcessor(CLDRLocale locale, boolean needsCollator) {
227         init(this.locale = locale, needsCollator);
228     }
229 
230     /**
231      * Constructor, taking locale.
232      *
233      * @param locale
234      */
DisplayAndInputProcessor(CLDRLocale locale)235     public DisplayAndInputProcessor(CLDRLocale locale) {
236         init(this.locale = locale, true);
237     }
238 
239     /**
240      * Process the value for display. The result is a string for display in the
241      * Survey tool or similar program.
242      *
243      * @param path
244      * @param value
245      * @param fullPath
246      * @return
247      */
processForDisplay(String path, String value)248     public synchronized String processForDisplay(String path, String value) {
249         value = Normalizer.compose(value, false); // Always normalize all text to NFC.
250         if (hasUnicodeSetValue(path)) {
251             value = displayUnicodeSet(value);
252         } else if (path.contains("stopword")) {
253             return value.trim().isEmpty() ? "NONE" : value;
254         } else {
255             NumericType numericType = NumericType.getNumericType(path);
256             if (numericType != NumericType.NOT_NUMERIC) {
257                 // Canonicalize existing values that aren't canonicalized yet.
258                 // New values will be canonicalized on input using processInput().
259                 try {
260                     value = getCanonicalPattern(value, numericType, isPosix);
261                 } catch (IllegalArgumentException e) {
262                     if (DEBUG_DAIP) System.err.println("Illegal pattern: " + value);
263                 }
264                 if (numericType != NumericType.CURRENCY && numericType != NumericType.CURRENCY_ABBREVIATED) {
265                     value = value.replace("'", "");
266                 }
267             }
268         }
269         // Fix up any apostrophes in number symbols
270         if (NUMBER_SEPARATOR_PATTERN.matcher(path).matches()) {
271             value = value.replace('\'', '\u2019');
272         }
273         // Fix up any apostrophes as appropriate (Don't do so for things like date patterns...
274         if (!APOSTROPHE_SKIP_PATHS.matcher(path).matches()) {
275             value = normalizeApostrophes(value);
276         }
277         // Fix up hyphens, replacing with N-dash as appropriate
278         if (INTERVAL_FORMAT_PATHS.matcher(path).matches()) {
279             value = normalizeIntervalHyphens(value);
280         } else {
281             value = normalizeHyphens(value);
282         }
283         return value;
284     }
285 
hasUnicodeSetValue(String path)286     private boolean hasUnicodeSetValue(String path) {
287         return path.startsWith("//ldml/characters/exemplarCharacters") || path.startsWith("//ldml/characters/parseLenients");
288     }
289 
290     static final UnicodeSet WHITESPACE = new UnicodeSet("[:whitespace:]").freeze();
291     static final DateTimeCanonicalizer dtc = new DateTimeCanonicalizer(FIX_YEARS);
292 
293     public static final Splitter SPLIT_BAR = Splitter.on('|').trimResults().omitEmptyStrings();
294     static final Splitter SPLIT_SPACE = Splitter.on(' ').trimResults().omitEmptyStrings();
295     static final Joiner JOIN_BAR = Joiner.on(" | ");
296 
297     /**
298      * Process the value for input. The result is a cleaned-up value. For example,
299      * an exemplar set is modified to be in the normal format, and any missing [ ]
300      * are added (a common omission on entry). If there are any failures then the
301      * original value is returned, so that the proper error message can be given.
302      *
303      * @param path
304      * @param value
305      * @param internalException
306      *            TODO
307      * @param fullPath
308      * @return
309      */
processInput(String path, String value, Exception[] internalException)310     public synchronized String processInput(String path, String value, Exception[] internalException) {
311         String original = value;
312         value = Normalizer.compose(value, false); // Always normalize all input to NFC.
313         if (internalException != null) {
314             internalException[0] = null;
315         }
316         try {
317             // Normalise Malayalam characters.
318             boolean isUnicodeSet = hasUnicodeSetValue(path);
319             if (locale.childOf(MALAYALAM)) {
320                 String newvalue = normalizeMalayalam(value);
321                 if (DEBUG_DAIP) System.out.println("DAIP: Normalized Malayalam '" + value + "' to '" + newvalue + "'");
322                 value = newvalue;
323             } else if (locale.childOf(ROMANIAN) && !isUnicodeSet) {
324                 value = standardizeRomanian(value);
325             } else if (locale.childOf(CATALAN) && !isUnicodeSet) {
326                 value = standardizeCatalan(value);
327             } else if (locale.childOf(NGOMBA) && !isUnicodeSet) {
328                 value = standardizeNgomba(value);
329             } else if (locale.childOf(KWASIO) && !isUnicodeSet) {
330                 value = standardizeKwasio(value);
331             } else if (locale.childOf(HEBREW) && !APOSTROPHE_SKIP_PATHS.matcher(path).matches()) {
332                 value = replaceChars(path, value, HEBREW_CONVERSIONS, false);
333             } else if ((locale.childOf(SWISS_GERMAN) || locale.childOf(GERMAN_SWITZERLAND)) && !isUnicodeSet) {
334                 value = standardizeSwissGerman(value);
335             } else if (locale.childOf(MYANMAR) && !isUnicodeSet) {
336                 value = standardizeMyanmar(value);
337             } else if (locale.childOf(KYRGYZ)) {
338                 value = replaceChars(path, value, KYRGYZ_CONVERSIONS, false);
339             } else if (locale.childOf(URDU) || locale.childOf(PASHTO) || locale.childOf(FARSI)) {
340                 value = replaceChars(path, value, URDU_PLUS_CONVERSIONS, true);
341             }
342 
343             if (UNICODE_WHITESPACE.containsSome(value)) {
344                 value = normalizeWhitespace(path, value);
345             }
346 
347             // all of our values should not have leading or trailing spaces, except insertBetween
348             if (!path.contains("/insertBetween") && !isUnicodeSet) {
349                 value = value.trim();
350             }
351 
352             // fix grouping separator if space
353             if (path.startsWith("//ldml/numbers/symbols") && !path.contains("/alias")) {
354                 if (value.isEmpty()) {
355                     value = "\u00A0";
356                 }
357                 value = value.replace(' ', '\u00A0');
358             }
359 
360             // fix date patterns
361             DateTimePatternType datetimePatternType = DateTimePatternType.fromPath(path);
362             if (DateTimePatternType.STOCK_AVAILABLE_INTERVAL_PATTERNS.contains(datetimePatternType)) {
363                 try {
364                     value = dtc.getCanonicalDatePattern(path, value, datetimePatternType);
365                 } catch (IllegalArgumentException ex) {
366                     return value;
367                 }
368             }
369 
370             if (path.startsWith("//ldml/numbers/currencies/currency") && path.contains("displayName")) {
371                 value = normalizeCurrencyDisplayName(value);
372             }
373             NumericType numericType = NumericType.getNumericType(path);
374             if (numericType != NumericType.NOT_NUMERIC) {
375                 if (numericType == NumericType.CURRENCY) {
376                     value = value.replaceAll(" ", "\u00A0");
377                     if (numericType == NumericType.CURRENCY_ABBREVIATED) {
378                         value = value.replaceAll("0\\.0+", "0");
379                     }
380                 } else {
381                     value = value.replaceAll("([%\u00A4]) ", "$1\u00A0")
382                         .replaceAll(" ([%\u00A4])", "\u00A0$1");
383                     value = replace(NON_DECIMAL_PERIOD, value, "'.'");
384                     if (numericType == NumericType.DECIMAL_ABBREVIATED) {
385                         value = value.replaceAll("0\\.0+", "0");
386                     }
387                 }
388                 value = getCanonicalPattern(value, numericType, isPosix);
389             }
390 
391             // fix [,]
392             if (path.startsWith("//ldml/localeDisplayNames/languages/language")
393                 || path.startsWith("//ldml/localeDisplayNames/scripts/script")
394                 || path.startsWith("//ldml/localeDisplayNames/territories/territory")
395                 || path.startsWith("//ldml/localeDisplayNames/variants/variant")
396                 || path.startsWith("//ldml/localeDisplayNames/keys/key")
397                 || path.startsWith("//ldml/localeDisplayNames/types/type")) {
398                 value = value.replace('[', '(').replace(']', ')').replace('[', '(').replace(']', ')');
399             }
400 
401             // Normalize two single quotes for the inches symbol.
402             if (path.contains("/units")) {
403                 value = value.replace("''", "″");
404             }
405 
406             // check specific cases
407             if (isUnicodeSet) {
408                 value = inputUnicodeSet(path, value);
409             } else if (path.contains("stopword")) {
410                 if (value.equals("NONE")) {
411                     value = "";
412                 }
413             }
414 
415             // Normalize ellipsis data.
416             if (path.startsWith("//ldml/characters/ellipsis")) {
417                 value = value.replace("...", "…");
418             }
419 
420             // Replace Arabic presentation forms with their nominal counterparts
421             value = replaceArabicPresentationForms(value);
422 
423             // Fix up any apostrophes as appropriate (Don't do so for things like date patterns...
424             if (!APOSTROPHE_SKIP_PATHS.matcher(path).matches()) {
425                 value = normalizeApostrophes(value);
426             }
427             // Fix up any apostrophes in number symbols
428             if (NUMBER_SEPARATOR_PATTERN.matcher(path).matches()) {
429                 value = value.replace('\'', '\u2019');
430             }
431             // Fix up hyphens, replacing with N-dash as appropriate
432             if (INTERVAL_FORMAT_PATHS.matcher(path).matches()) {
433                 value = normalizeIntervalHyphens(value);
434             } else if (!isUnicodeSet) {
435                 value = normalizeHyphens(value);
436             }
437 
438             if (path.startsWith("//ldml/annotations/annotation")) {
439                 if (path.contains(Emoji.TYPE_TTS)) {
440                     // The row has something like "�� -name" in the first column. Cf. namePath, getNamePaths.
441                     // Normally the value is like "zebra" or "unicorn face", without "|".
442                     // If the user enters a value with "|",  discard anything after "|"; e.g., change "a | b | c" to "a".
443                     value = SPLIT_BAR.split(value).iterator().next();
444                 } else {
445                     // The row has something like "�� –keywords" in the first column. Cf. keywordPath, getKeywordPaths.
446                     // Normally the value is like "stripe | zebra", with "|".
447                     value = annotationsForDisplay(value);
448                 }
449             }
450 
451             return value;
452         } catch (RuntimeException e) {
453             if (internalException != null) {
454                 internalException[0] = e;
455             }
456             return original;
457         }
458     }
459 
460     private static final boolean REMOVE_COVERED_KEYWORDS = true;
461 
462     /**
463      * Produce a modification of the given annotation by sorting its components and filtering covered keywords.
464      *
465      * Examples: Given "b | a", return "a | b". Given "bear | panda | panda bear", return "bear | panda".
466      *
467      * @param value the string
468      * @return the possibly modified string
469      */
annotationsForDisplay(String value)470     private static String annotationsForDisplay(String value) {
471         TreeSet<String> sorted = new TreeSet<>(Collator.getInstance(ULocale.ROOT));
472         sorted.addAll(SPLIT_BAR.splitToList(value));
473         if (REMOVE_COVERED_KEYWORDS) {
474             filterCoveredKeywords(sorted);
475         }
476         value = JOIN_BAR.join(sorted);
477         return value;
478     }
479 
480     /**
481      * Filter from the given set some keywords that include spaces, if they duplicate,
482      * or are "covered by", other keywords in the set.
483      *
484      * For example, if the set is {"bear", "panda", "panda bear"} (annotation was "bear | panda | panda bear"),
485      * then remove "panda bear", treating it as "covered" since the set already includes "panda" and "bear".
486      *
487      * @param sorted the set from which items may be removed
488      */
filterCoveredKeywords(TreeSet<String> sorted)489     public static void filterCoveredKeywords(TreeSet<String> sorted) {
490         // for now, just do single items
491         HashSet<String> toRemove = new HashSet<>();
492 
493         for (String item : sorted) {
494             List<String> list = SPLIT_SPACE.splitToList(item);
495             if (list.size() < 2) {
496                 continue;
497             }
498             if (sorted.containsAll(list)) {
499                 toRemove.add(item);
500             }
501         }
502         sorted.removeAll(toRemove);
503     }
504 
displayUnicodeSet(String value)505     private String displayUnicodeSet(String value) {
506         if (value.startsWith("[") && value.endsWith("]")) {
507             value = value.substring(1, value.length() - 1);
508         }
509 
510         value = replace(NEEDS_QUOTE1, value, "$1\\\\$2$3");
511         value = replace(NEEDS_QUOTE2, value, "$1\\\\$2$3");
512 
513         // if (RTL.containsSome(value) && value.startsWith("[") && value.endsWith("]")) {
514         // return "\u200E[\u200E" + value.substring(1,value.length()-2) + "\u200E]\u200E";
515         // }
516         return value;
517     }
518 
inputUnicodeSet(String path, String value)519     private String inputUnicodeSet(String path, String value) {
520         // clean up the user's input.
521         // first, fix up the '['
522         value = value.trim();
523 
524         // remove brackets and trim again before regex
525         if (value.startsWith("[")) {
526             value = value.substring(1);
527         }
528         if (value.endsWith("]") && (!value.endsWith("\\]") || value.endsWith("\\\\]"))) {
529             value = value.substring(0, value.length() - 1);
530         }
531         value = value.trim();
532 
533         value = replace(NEEDS_QUOTE1, value, "$1\\\\$2$3");
534         value = replace(NEEDS_QUOTE2, value, "$1\\\\$2$3");
535 
536         // re-add brackets.
537         value = "[" + value + "]";
538 
539         UnicodeSet exemplar = new UnicodeSet(value);
540         XPathParts parts = XPathParts.getFrozenInstance(path); // new XPathParts().set(path);
541         if (parts.getElement(2).equals("parseLenients")) {
542             return exemplar.toPattern(false);
543         }
544         final String type = parts.getAttributeValue(-1, "type");
545         ExemplarType exemplarType = type == null ? ExemplarType.main : ExemplarType.valueOf(type);
546         value = getCleanedUnicodeSet(exemplar, pp, exemplarType);
547         return value;
548     }
549 
normalizeWhitespace(String path, String value)550     private String normalizeWhitespace(String path, String value) {
551         // turn all whitespace sequences (including tab and newline, and NBSP for certain paths)
552         // into a single space or a single NBSP depending on path.
553         if ((path.contains("/dateFormatLength") && path.contains("/pattern")) ||
554             path.contains("/availableFormats/dateFormatItem") ||
555             (path.startsWith("//ldml/dates/timeZoneNames/metazone") && path.contains("/long")) ||
556             path.startsWith("//ldml/dates/timeZoneNames/regionFormat") ||
557             path.startsWith("//ldml/localeDisplayNames/codePatterns/codePattern") ||
558             path.startsWith("//ldml/localeDisplayNames/languages/language") ||
559             path.startsWith("//ldml/localeDisplayNames/territories/territory") ||
560             path.startsWith("//ldml/localeDisplayNames/types/type") ||
561             (path.startsWith("//ldml/numbers/currencies/currency") && path.contains("/displayName")) ||
562             (path.contains("/decimalFormatLength[@type=\"long\"]") && path.contains("/pattern")) ||
563             path.startsWith("//ldml/posix/messages") ||
564             (path.startsWith("//ldml/units/uni") && path.contains("/unitPattern "))) {
565             value = WHITESPACE_AND_NBSP_TO_NORMALIZE.matcher(value).replaceAll(" "); // replace with regular space
566         } else if ((path.contains("/currencies/currency") && (path.contains("/group") || path.contains("/pattern")))
567             ||
568             (path.contains("/currencyFormatLength") && path.contains("/pattern")) ||
569             (path.contains("/currencySpacing") && path.contains("/insertBetween")) ||
570             (path.contains("/decimalFormatLength") && path.contains("/pattern")) || // i.e. the non-long ones
571             (path.contains("/percentFormatLength") && path.contains("/pattern")) ||
572             (path.startsWith("//ldml/numbers/symbols") && (path.contains("/group") || path.contains("/nan")))) {
573             value = WHITESPACE_AND_NBSP_TO_NORMALIZE.matcher(value).replaceAll("\u00A0"); // replace with NBSP
574         } else {
575             // in this case don't normalize away NBSP
576             value = WHITESPACE_NO_NBSP_TO_NORMALIZE.matcher(value).replaceAll(" "); // replace with regular space
577         }
578         return value;
579     }
580 
normalizeCurrencyDisplayName(String value)581     private String normalizeCurrencyDisplayName(String value) {
582         StringBuilder result = new StringBuilder();
583         boolean inParentheses = false;
584         for (int i = 0; i < value.length(); i++) {
585             char c = value.charAt(i);
586             if (c == '(') {
587                 inParentheses = true;
588             } else if (c == ')') {
589                 inParentheses = false;
590             }
591             if (inParentheses && c == '-' && Character.isDigit(value.charAt(i - 1))) {
592                 c = 0x2013; /* Replace hyphen-minus with dash for date ranges */
593             }
594             result.append(c);
595         }
596         return result.toString();
597     }
598 
normalizeApostrophes(String value)599     private String normalizeApostrophes(String value) {
600         // If our DAIP always had a CLDRFile to work with, then we could just check the exemplar set in it to see.
601         // But since we don't, we just maintain the list internally and use it.
602         if (LANGUAGES_USING_MODIFIER_APOSTROPHE.contains(locale.getLanguage())) {
603             return value.replace('\'', '\u02bc');
604         } else {
605             char prev = 0;
606             StringBuilder builder = new StringBuilder();
607             for (char c : value.toCharArray()) {
608                 if (c == '\'') {
609                     if (Character.isLetter(prev)) {
610                         builder.append('\u2019');
611                     } else {
612                         builder.append('\u2018');
613                     }
614                 } else {
615                     builder.append(c);
616                 }
617                 prev = c;
618             }
619             return builder.toString();
620         }
621     }
622 
normalizeIntervalHyphens(String value)623     private String normalizeIntervalHyphens(String value) {
624         DateTimePatternGenerator.FormatParser fp = new DateTimePatternGenerator.FormatParser();
625         fp.set(DateIntervalInfo.genPatternInfo(value, false).getFirstPart());
626         List<Object> items = fp.getItems();
627         Object last = items.get(items.size() - 1);
628         if (last instanceof String) {
629             String separator = last.toString();
630             if (separator.contains("-")) {
631                 StringBuilder sb = new StringBuilder();
632                 sb.append(DateIntervalInfo.genPatternInfo(value, false).getFirstPart());
633                 if (sb.lastIndexOf(separator) >= 0) {
634                     sb.delete(sb.lastIndexOf(separator), sb.length());
635                     sb.append(separator.replace("-", "\u2013"));
636                     sb.append(DateIntervalInfo.genPatternInfo(value, false).getSecondPart());
637                     return sb.toString();
638                 }
639             }
640         }
641         return value;
642     }
643 
normalizeHyphens(String value)644     private String normalizeHyphens(String value) {
645         int hyphenLocation = value.indexOf("-");
646         if (hyphenLocation > 0 &&
647             Character.isDigit(value.charAt(hyphenLocation - 1)) &&
648             hyphenLocation < value.length() - 1 &&
649             Character.isDigit(value.charAt(hyphenLocation + 1))) {
650             StringBuilder sb = new StringBuilder();
651             sb.append(value.substring(0, hyphenLocation));
652             sb.append("\u2013");
653             sb.append(value.substring(hyphenLocation + 1));
654             return sb.toString();
655         }
656         return value;
657     }
658 
standardizeRomanian(String value)659     private String standardizeRomanian(String value) {
660         StringBuilder builder = new StringBuilder();
661         for (char c : value.toCharArray()) {
662             for (char[] pair : ROMANIAN_CONVERSIONS) {
663                 if (c == pair[0]) {
664                     c = pair[1];
665                     break;
666                 }
667             }
668             builder.append(c);
669         }
670         return builder.toString();
671     }
672 
standardizeKwasio(String value)673     private String standardizeKwasio(String value) {
674         StringBuilder builder = new StringBuilder();
675         for (char c : value.toCharArray()) {
676             for (char[] pair : KWASIO_CONVERSIONS) {
677                 if (c == pair[0]) {
678                     c = pair[1];
679                     break;
680                 }
681             }
682             builder.append(c);
683         }
684         return builder.toString();
685     }
686 
687     // Use the myanmar-tools detector.
standardizeMyanmar(String value)688     private String standardizeMyanmar(String value) {
689         if (detector.getZawgyiProbability(value) > 0.90) {
690             return zawgyiUnicodeTransliterator.transform(value);
691         }
692         return value;
693     }
694 
standardizeNgomba(String value)695     private String standardizeNgomba(String value) {
696         StringBuilder builder = new StringBuilder();
697         char[] charArray = value.toCharArray();
698         for (int i = 0; i < charArray.length; i++) {
699             char c = charArray[i];
700             boolean convertedSaltillo = false;
701             for (char[] pair : NGOMBA_CONVERSIONS) {
702                 if (c == pair[0]) {
703                     c = pair[1];
704                     if (c == '\uA78C') {
705                         convertedSaltillo = true;
706                     }
707                     break;
708                 }
709             }
710             if (convertedSaltillo &&
711                 ((i > 0 && i < charArray.length - 1 && Character.isUpperCase(charArray[i - 1]) && Character.isUpperCase(charArray[i + 1])) ||
712                     (i > 1 && Character.isUpperCase(charArray[i - 1]) && Character.isUpperCase(charArray[i - 2])))) {
713                 c = '\uA78B'; // UPPER CASE SALTILLO
714             }
715             builder.append(c);
716         }
717         return builder.toString();
718     }
719 
replaceChars(String path, String value, char[][] charsToReplace, boolean skipAuxExemplars)720     private String replaceChars(String path, String value, char[][] charsToReplace, boolean skipAuxExemplars) {
721         if (skipAuxExemplars && path.contains("/exemplarCharacters[@type=\"auxiliary\"]")) {
722             return value;
723         }
724         StringBuilder builder = new StringBuilder();
725         for (char c : value.toCharArray()) {
726             for (char[] pair : charsToReplace) {
727                 if (c == pair[0]) {
728                     c = pair[1];
729                     break;
730                 }
731             }
732             builder.append(c);
733         }
734         return builder.toString();
735     }
736 
standardizeSwissGerman(String value)737     private String standardizeSwissGerman(String value) {
738         return value.replaceAll("\u00DF", "ss");
739     }
740 
standardizeCatalan(String value)741     private String standardizeCatalan(String value) {
742         StringBuilder builder = new StringBuilder();
743         for (char c : value.toCharArray()) {
744             boolean didSubstitute = false;
745             for (char[] triple : CATALAN_CONVERSIONS) {
746                 if (c == triple[0]) {
747                     builder.append(triple[1]);
748                     builder.append(triple[2]);
749                     didSubstitute = true;
750                     break;
751                 }
752             }
753             if (!didSubstitute) {
754                 builder.append(c);
755             }
756         }
757         return builder.toString();
758     }
759 
replace(Pattern pattern, String value, String replacement)760     private String replace(Pattern pattern, String value, String replacement) {
761         String value2 = pattern.matcher(value).replaceAll(replacement);
762         if (DEBUG_DAIP && !value.equals(value2)) {
763             System.out.println("\n" + value + " => " + value2);
764         }
765         return value2;
766     }
767 
    /**
     * Matches one of six Malayalam letters (U+0D23, U+0D28, U+0D30, U+0D32, U+0D33, U+0D15)
     * immediately followed by U+0D4D and U+200D. Group 1 captures the letter, which is the
     * lookup key for NORMALIZING_MAP.
     */
    private static Pattern UNNORMALIZED_MALAYALAM = PatternCache.get(
        "(\u0D23|\u0D28|\u0D30|\u0D32|\u0D33|\u0D15)\u0D4D\u200D");

    /**
     * Maps each letter that UNNORMALIZED_MALAYALAM can capture to the single code point
     * (U+0D7A through U+0D7F) that normalizeMalayalam substitutes for the whole matched
     * three-character sequence.
     */
    private static Map<Character, Character> NORMALIZING_MAP = Builder.with(new HashMap<Character, Character>())
        .put('\u0D23', '\u0D7A').put('\u0D28', '\u0D7B')
        .put('\u0D30', '\u0D7C').put('\u0D32', '\u0D7D')
        .put('\u0D33', '\u0D7E').put('\u0D15', '\u0D7F').get();
775 
776     /**
777      * Normalizes the Malayalam characters in the specified input.
778      *
779      * @param value
780      *            the input to be normalized
781      * @return
782      */
normalizeMalayalam(String value)783     private String normalizeMalayalam(String value) {
784         // Normalize Malayalam characters.
785         Matcher matcher = UNNORMALIZED_MALAYALAM.matcher(value);
786         if (matcher.find()) {
787             StringBuffer buffer = new StringBuffer();
788             int start = 0;
789             do {
790                 buffer.append(value.substring(start, matcher.start(0)));
791                 char codePoint = matcher.group(1).charAt(0);
792                 buffer.append(NORMALIZING_MAP.get(codePoint));
793                 start = matcher.end(0);
794             } while (matcher.find());
795             buffer.append(value.substring(start));
796             value = buffer.toString();
797         }
798         return value;
799     }
800 
    /**
     * Transform built from an ICU transliterator ID that applies NFKC, with a filter that
     * restricts it to characters in the Arabic Presentation Forms-A and Presentation Forms-B
     * blocks.
     */
    static final Transform<String, String> fixArabicPresentation = Transliterator.getInstance(
        "[[:block=Arabic_Presentation_Forms_A:][:block=Arabic_Presentation_Forms_B:]] nfkc");
803 
804     /**
805      * Normalizes the Arabic presentation forms characters in the specified input.
806      *
807      * @param value
808      *            the input to be normalized
809      * @return
810      */
replaceArabicPresentationForms(String value)811     private String replaceArabicPresentationForms(String value) {
812         value = fixArabicPresentation.transform(value);
813         return value;
814     }
815 
    // Patterns over the textual form of a set pattern. Their callers are outside this part of
    // the file; NOTE(review): presumably used when cleaning exemplar sets -- confirm.

    // Whitespace (group 1) followed by a backslash-escaped '-', '}', ']' or '&' (group 2);
    // group 3 is always empty.
    static Pattern REMOVE_QUOTE1 = PatternCache.get("(\\s)(\\\\[-\\}\\]\\&])()");
    // A backslash-escaped '-', '{', '[' or '&' (group 1) followed by whitespace (group 2).
    static Pattern REMOVE_QUOTE2 = PatternCache.get("(\\\\[\\-\\{\\[\\&])(\\s)"); // ([^\\])([\\-\\{\\[])(\\s)

    // Whitespace or end of input (group 1) followed by an unescaped '-', '}', ']' or '&'
    // (group 2); group 3 is always empty.
    static Pattern NEEDS_QUOTE1 = PatternCache.get("(\\s|$)([-\\}\\]\\&])()");
    // Any character other than a backslash (group 1), then '-', '{', '[' or '&' (group 2),
    // then whitespace (group 3).
    static Pattern NEEDS_QUOTE2 = PatternCache.get("([^\\\\])([\\-\\{\\[\\&])(\\s)"); // ([^\\])([\\-\\{\\[])(\\s)
821 
getCleanedUnicodeSet(UnicodeSet exemplar, UnicodeSetPrettyPrinter prettyPrinter, ExemplarType exemplarType)822     public static String getCleanedUnicodeSet(UnicodeSet exemplar, UnicodeSetPrettyPrinter prettyPrinter,
823         ExemplarType exemplarType) {
824         if (prettyPrinter == null) {
825             return exemplar.toPattern(false);
826         }
827         String value;
828         prettyPrinter.setCompressRanges(exemplar.size() > 300);
829         value = exemplar.toPattern(false);
830         UnicodeSet toAdd = new UnicodeSet();
831 
832         for (UnicodeSetIterator usi = new UnicodeSetIterator(exemplar); usi.next();) {
833             String string = usi.getString();
834             if (string.equals("ß") || string.equals("İ")) {
835                 toAdd.add(string);
836                 continue;
837             }
838             if (exemplarType.convertUppercase) {
839                 string = UCharacter.toLowerCase(ULocale.ENGLISH, string);
840             }
841             toAdd.add(string);
842             String composed = Normalizer.compose(string, false);
843             if (!string.equals(composed)) {
844                 toAdd.add(composed);
845             }
846         }
847 
848         toAdd.removeAll(exemplarType.toRemove);
849 
850         if (DEBUG_DAIP && !toAdd.equals(exemplar)) {
851             UnicodeSet oldOnly = new UnicodeSet(exemplar).removeAll(toAdd);
852             UnicodeSet newOnly = new UnicodeSet(toAdd).removeAll(exemplar);
853             System.out.println("Exemplar:\t" + exemplarType + ",\tremoved\t" + oldOnly + ",\tadded\t" + newOnly);
854         }
855 
856         String fixedExemplar = prettyPrinter.format(toAdd);
857         UnicodeSet doubleCheck = new UnicodeSet(fixedExemplar);
858         if (!toAdd.equals(doubleCheck)) {
859             // something went wrong, leave as is
860         } else if (!value.equals(fixedExemplar)) { // put in this condition just for debugging
861             if (DEBUG_DAIP) {
862                 System.out.println(TestMetadata.showDifference(
863                     With.codePoints(value),
864                     With.codePoints(fixedExemplar),
865                     "\n"));
866             }
867             value = fixedExemplar;
868         }
869         return value;
870     }
871 
872     /**
873      * @return a canonical numeric pattern, based on the type, and the isPOSIX flag. The latter is set for en_US_POSIX.
874      */
875     static final Splitter SEMI_SPLITTER = Splitter.on(';').trimResults();
876 
getCanonicalPattern(String inpattern, NumericType type, boolean isPOSIX)877     public static String getCanonicalPattern(String inpattern, NumericType type, boolean isPOSIX) {
878         // TODO fix later to properly handle quoted ;
879 
880         DecimalFormat df = new DecimalFormat(inpattern);
881         if (type == NumericType.DECIMAL_ABBREVIATED || type == NumericType.CURRENCY_ABBREVIATED
882             || CldrUtility.INHERITANCE_MARKER.equals(inpattern)) {
883             return inpattern; // TODO fix when ICU bug is fixed
884             // df.setMaximumFractionDigits(df.getMinimumFractionDigits());
885             // df.setMaximumIntegerDigits(Math.max(1, df.getMinimumIntegerDigits()));
886         } else {
887             // int decimals = type == CURRENCY_TYPE ? 2 : 1;
888             int[] digits = isPOSIX ? type.posixDigitCount : type.digitCount;
889             df.setMinimumIntegerDigits(digits[0]);
890             df.setMinimumFractionDigits(digits[1]);
891             df.setMaximumFractionDigits(digits[2]);
892         }
893         String pattern = df.toPattern();
894         List<String> parts = SEMI_SPLITTER.splitToList(pattern);
895         String pattern2 = parts.get(0);
896         if (parts.size() > 1) {
897             pattern2 += ";" + parts.get(1);
898         }
899         if (!pattern2.equals(pattern)) {
900             pattern = pattern2;
901         }
902         // int pos = pattern.indexOf(';');
903         // if (pos < 0) return pattern + ";-" + pattern;
904         return pattern;
905     }
906 
907     /*
908      * This tests what type a numeric pattern is.
909      */
910     public enum NumericType {
911         CURRENCY(new int[] { 1, 2, 2 }, new int[] { 1, 2, 2 }), CURRENCY_ABBREVIATED(), DECIMAL(new int[] { 1, 0, 3 },
912             new int[] { 1, 0, 6 }), DECIMAL_ABBREVIATED(), PERCENT(new int[] { 1, 0, 0 },
913                 new int[] { 1, 0, 0 }), SCIENTIFIC(new int[] { 0, 0, 0 }, new int[] { 1, 6, 6 }), NOT_NUMERIC;
914 
915         private static final Pattern NUMBER_PATH = Pattern
916             .compile("//ldml/numbers/((currency|decimal|percent|scientific)Formats|currencies/currency).*");
917         private int[] digitCount;
918         private int[] posixDigitCount;
919 
NumericType()920         private NumericType() {
921         };
922 
NumericType(int[] digitCount, int[] posixDigitCount)923         private NumericType(int[] digitCount, int[] posixDigitCount) {
924             this.digitCount = digitCount;
925             this.posixDigitCount = posixDigitCount;
926         }
927 
928         /**
929          * @return the numeric type of the xpath
930          */
getNumericType(String xpath)931         public static NumericType getNumericType(String xpath) {
932             Matcher matcher = NUMBER_PATH.matcher(xpath);
933             if (xpath.indexOf("/pattern") < 0) {
934                 return NOT_NUMERIC;
935             } else if (matcher.matches()) {
936                 if (matcher.group(1).equals("currencies/currency")) {
937                     return CURRENCY;
938                 } else {
939                     NumericType type = NumericType.valueOf(matcher.group(2).toUpperCase());
940                     if (xpath.contains("=\"1000")) {
941                         if (type == DECIMAL) {
942                             type = DECIMAL_ABBREVIATED;
943                         } else if (type == CURRENCY) {
944                             type = CURRENCY_ABBREVIATED;
945                         } else {
946                             throw new IllegalArgumentException("Internal Error");
947                         }
948                     }
949                     return type;
950                 }
951             } else {
952                 return NOT_NUMERIC;
953             }
954         }
955 
getDigitCount()956         public int[] getDigitCount() {
957             return digitCount;
958         }
959 
getPosixDigitCount()960         public int[] getPosixDigitCount() {
961             return posixDigitCount;
962         }
963     };
964 }
965