• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 package org.unicode.cldr.util;
2 
3 import com.google.common.base.Splitter;
4 import com.ibm.icu.lang.CharSequences;
5 import com.ibm.icu.text.Collator;
6 import com.ibm.icu.text.Normalizer2;
7 import com.ibm.icu.text.UnicodeSet;
8 import com.ibm.icu.util.ULocale;
9 import java.util.ArrayList;
10 import java.util.Collection;
11 import java.util.Comparator;
12 import java.util.TreeSet;
13 import java.util.function.Function;
14 
/**
 * Goal is a very simple format for UnicodeSet, that keeps vetters from having to know about \ for
 * quoting or {...} for strings, or $ for FFFF. We do this by using spaces to always separate
 * different characters, and special syntax for ranges, escaped hex, and named entities. There are 2
 * special characters:
 *
 * <ul>
 *   <li>➖ a range, but only if between two code points
 *   <li>❰ start of hex or named escape, but only if followed by [A-Fa-f0-9]+ ❱
 * </ul>
 *
 * <b>EBNF</b><br>
 * result = item (" " item)*<br>
 * item = string | range | codePoint<br>
 * string = codePoint+<br>
 * range = codePoint "➖" codePoint<br>
 * codePoint = literal // excludes " ", "❰", "❱"<br>
 * codePoint = "❰" (namedEscape | hex) "❱"<br>
 * namedEscape = [A-Fa-f0-9]+ // as per CodePointEscaper<br>
 * hex = [A-Fa-f0-9]{2,6} // must be valid code point 0x0..0x10FFFF<br>
 * ❰ was chosen to avoid special use of \\u or \x<br>
 *
 * @author markdavis
 */
39 public class SimpleUnicodeSetFormatter implements FormatterParser<UnicodeSet> {
40     public static Normalizer2 nfc = Normalizer2.getNFCInstance();
41 
42     public static final Comparator<String> BASIC_COLLATOR =
43             (Comparator) CLDRConfig.getInstance().getCollator();
44 
45     public static final int DEFAULT_RANGES_ABOVE = 1024;
46 
47     private final Comparator<String> comparator;
48     private final UnicodeSet forceHex;
49     private final int useRangesAbove;
50 
getComparator()51     public Comparator<String> getComparator() {
52         return comparator;
53     }
54 
getToEscape()55     public UnicodeSet getToEscape() {
56         return forceHex;
57     }
58 
getUseRangesAbove()59     public int getUseRangesAbove() {
60         return useRangesAbove;
61     }
62 
63     /**
64      * Create a simple formatter, with a comparator for the ordering and a UnicodeSet of characters
65      * that are to use hex. Immutable (if the collator is).
66      *
67      * @param col — collator. The default is BASIC_COLLATOR, which is the root collator.
68      * @param forceHex - UnicodeSet to force to be hex. It will be frozen if not already. Warning:
69      *     may not round-trip unless it includes all of CodePointEscaper.getNamedEscapes()
70      * @param useRangesAbove — under this number, there will be no ranges; at or above there may be
71      *     ranges, and the collator will be disregarded.
72      */
SimpleUnicodeSetFormatter( Comparator<String> col, UnicodeSet forceHex, int useRangesAbove)73     public SimpleUnicodeSetFormatter(
74             Comparator<String> col, UnicodeSet forceHex, int useRangesAbove) {
75         // collate, but preserve non-equivalents
76         this.comparator = col == null ? BASIC_COLLATOR : ComparatorUtilities.wrapForCodePoints(col);
77         this.forceHex = forceHex == null ? CodePointEscaper.FORCE_ESCAPE : forceHex.freeze();
78         this.useRangesAbove = useRangesAbove < 0 ? DEFAULT_RANGES_ABOVE : useRangesAbove;
79     }
80 
81     public static Comparator<String> getComparatorForLocale(String localeId) {
82         Comparator<String> collator = BASIC_COLLATOR;
83         try {
84             if (localeId != null) {
85                 ICUServiceBuilder isb =
86                         ICUServiceBuilder.forLocale(CLDRLocale.getInstance(localeId));
87                 collator = (Comparator) isb.getRuleBasedCollator();
88             }
89         } catch (Exception e) { // for our purposes, better to fall back to the default
90         }
91         return collator;
92     }
93 
94     public SimpleUnicodeSetFormatter(Comparator<String> col, UnicodeSet forceHex) {
95         this(col, forceHex, DEFAULT_RANGES_ABOVE);
96     }
97 
98     public SimpleUnicodeSetFormatter(Comparator<String> col) {
99         this(col, null, DEFAULT_RANGES_ABOVE);
100     }
101 
102     public SimpleUnicodeSetFormatter() {
103         this(
104                 (Comparator) ComparatorUtilities.getIcuCollator(ULocale.ROOT, Collator.IDENTICAL),
105                 null,
106                 DEFAULT_RANGES_ABOVE);
107     }
108 
109     static class Lazy {
110         static SimpleUnicodeSetFormatter SINGLETON = new SimpleUnicodeSetFormatter();
111 
112         static SimpleUnicodeSetFormatter getSingleton() {
113             return SINGLETON;
114         }
115     }
116 
117     public static SimpleUnicodeSetFormatter getDefault() {
118         return Lazy.getSingleton();
119     }
120 
121     /** Parse as UnicodeSet if of the form […], else parse with default SimpleUnicodeSetFormatter */
122     public static UnicodeSet parseLenient(String source) {
123         if (source.startsWith("[") && source.endsWith("]")) {
124             return new UnicodeSet(source);
125         } else {
126             return getDefault().parse(source);
127         }
128     }
129 
130     @Override
131     public String format(UnicodeSet input) {
132         final boolean allowRanges = input.size() > useRangesAbove;
133         StringBuilder result = new StringBuilder();
134         Collection<String> sorted =
135                 input.addAllTo(allowRanges ? new ArrayList<>() : new TreeSet<>(comparator));
136         //                : transformAndAddAllTo(
137         //                        input, null, new TreeSet<>(comparator)); // x -> nfc.normalize(x)
138         int firstOfRange = -2;
139         int lastOfRange = -2;
140         for (String item : sorted) {
141             int cp = CharSequences.getSingleCodePoint(item);
142             if (cp == Integer.MAX_VALUE) { // string
143                 if (lastOfRange >= 0) {
144                     if (firstOfRange != lastOfRange) {
145                         result.append(
146                                 firstOfRange + 1 != lastOfRange
147                                         ? CodePointEscaper.RANGE_SYNTAX
148                                         : ' ');
149                         appendWithHex(result, lastOfRange, forceHex);
150                     }
151                     firstOfRange = lastOfRange = -2;
152                 }
153                 if (result.length() > 0) {
154                     result.append(' ');
155                 }
156                 appendWithHex(result, item, forceHex);
157             } else if (allowRanges && lastOfRange == cp - 1) {
158                 ++lastOfRange;
159             } else {
160                 if (firstOfRange != lastOfRange) {
161                     result.append(
162                             firstOfRange + 1 != lastOfRange ? CodePointEscaper.RANGE_SYNTAX : ' ');
163                     appendWithHex(result, lastOfRange, forceHex);
164                 }
165                 if (result.length() > 0) {
166                     result.append(' ');
167                 }
168                 appendWithHex(result, cp, forceHex);
169                 firstOfRange = lastOfRange = cp;
170             }
171         }
172         if (firstOfRange != lastOfRange) {
173             result.append(firstOfRange + 1 != lastOfRange ? CodePointEscaper.RANGE_SYNTAX : ' ');
174             appendWithHex(result, lastOfRange, forceHex);
175         }
176         return result.toString();
177     }
178 
appendWithHex( StringBuilder ap, CharSequence s, UnicodeSet forceHex)179     public static final StringBuilder appendWithHex(
180             StringBuilder ap, CharSequence s, UnicodeSet forceHex) {
181         for (int cp : With.codePointArray(s)) {
182             appendWithHex(ap, cp, forceHex);
183         }
184         return ap;
185     }
186 
appendWithHex(StringBuilder ap, int cp, UnicodeSet forceHex)187     public static StringBuilder appendWithHex(StringBuilder ap, int cp, UnicodeSet forceHex) {
188         if (!forceHex.contains(cp)) {
189             ap.appendCodePoint(cp);
190         } else {
191             ap.append(CodePointEscaper.codePointToEscaped(cp));
192         }
193         return ap;
194     }
195 
196     static final Splitter SPACE_SPLITTER = Splitter.on(' ').omitEmptyStrings();
197 
198     @Override
parse(String input)199     public UnicodeSet parse(String input) {
200         UnicodeSet result = new UnicodeSet();
201         // Note: could be optimized but probably not worth the effort
202 
203         for (String word : SPACE_SPLITTER.split(input)) {
204             // parts between spaces can be single code points, or strings, or ranges of single code
205             // points
206             // points
207             int rangePos = word.indexOf(CodePointEscaper.RANGE_SYNTAX);
208             if (rangePos < 0) {
209                 result.add(unescape(word));
210             } else {
211                 int range2Pos = word.indexOf(CodePointEscaper.RANGE_SYNTAX, rangePos + 1);
212                 final String before = word.substring(0, rangePos);
213                 final String after = word.substring(rangePos + 1);
214                 if (rangePos == 0) {
215                     throw new IllegalArgumentException(
216                             "Must have exactly one character before '➖': " + before + "❌➖" + after);
217                 } else if (rangePos == word.length() - 1) {
218                     throw new IllegalArgumentException(
219                             "Must have exactly one character after '➖': " + before + "➖❌" + after);
220                 } else if (range2Pos >= 0) {
221                     throw new IllegalArgumentException(
222                             "Must not have two '➖' characters: " + before + "➖❌" + after);
223                 }
224                 // get the code points on either side
225                 int first = CharSequences.getSingleCodePoint(unescape(before));
226                 int second = CharSequences.getSingleCodePoint(unescape(after));
227                 if (first == Integer.MAX_VALUE) {
228                     throw new IllegalArgumentException(
229                             "Must have exactly one character before '➖': " + before + "❌➖" + after);
230                 } else if (second == Integer.MAX_VALUE) {
231                     throw new IllegalArgumentException(
232                             "Must have exactly one character after '➖': " + before + "➖❌" + after);
233                 }
234                 result.add(first, second);
235             }
236         }
237         return result;
238     }
239 
240     /** Unescape a whole string. */
unescape(String word)241     public static CharSequence unescape(String word) {
242         StringBuilder result = new StringBuilder();
243         for (int i = 0; i < word.length(); ) {
244             int escapeStart = word.indexOf(CodePointEscaper.ESCAPE_START, i);
245             if (escapeStart < 0) {
246                 final String toAppend = i == 0 ? word : word.substring(i);
247                 final int endStart = toAppend.indexOf(CodePointEscaper.ESCAPE_END);
248                 if (endStart >= 0) {
249                     throw new IllegalArgumentException(
250                             "Missing start escape "
251                                     + CodePointEscaper.ESCAPE_START
252                                     + ": "
253                                     + word.substring(0, endStart)
254                                     + "❌"
255                                     + word.substring(endStart));
256                 }
257                 // Otherwise we are done, the rest is unescaped.
258                 result.append(toAppend);
259                 break;
260             }
261             // we have an escape start, so we append what is before that.
262             final String toAppend = word.substring(i, escapeStart);
263             // if we don't find an escape end
264             final int endStart = toAppend.indexOf(CodePointEscaper.ESCAPE_END);
265             if (endStart >= 0) {
266                 throw new IllegalArgumentException(
267                         "Missing start escape "
268                                 + CodePointEscaper.ESCAPE_START
269                                 + ": "
270                                 + toAppend.substring(0, endStart)
271                                 + "❌"
272                                 + toAppend.substring(endStart));
273             }
274             result.append(toAppend);
275             int interiorStart = escapeStart + 1;
276             int escapeEnd = word.indexOf(CodePointEscaper.ESCAPE_END, interiorStart);
277             if (escapeEnd < 0) {
278                 throw new IllegalArgumentException(
279                         "Missing end escape " + CodePointEscaper.ESCAPE_END + ": " + word + "❌");
280             }
281             result.appendCodePoint(
282                     CodePointEscaper.rawEscapedToCodePoint(
283                             word.substring(interiorStart, escapeEnd)));
284             i = escapeEnd + 1;
285         }
286         return result;
287     }
288 
transform(UnicodeSet expected, Function<String, String> function)289     public static UnicodeSet transform(UnicodeSet expected, Function<String, String> function) {
290         UnicodeSet result = new UnicodeSet();
291         for (String s : expected) {
292             String t = function.apply(s);
293             result.add(t);
294         }
295         return result;
296     }
297 
transformAndAddAllTo( UnicodeSet expected, Function<String, String> function, T target)298     public static <T extends Collection<String>> T transformAndAddAllTo(
299             UnicodeSet expected, Function<String, String> function, T target) {
300         for (String s : expected) {
301             String t = function == null ? s : function.apply(s);
302             target.add(t);
303         }
304         return target;
305     }
306 }
307