1 package org.unicode.cldr.util; 2 3 import com.google.common.base.Splitter; 4 import com.ibm.icu.lang.CharSequences; 5 import com.ibm.icu.text.Collator; 6 import com.ibm.icu.text.Normalizer2; 7 import com.ibm.icu.text.UnicodeSet; 8 import com.ibm.icu.util.ULocale; 9 import java.util.ArrayList; 10 import java.util.Collection; 11 import java.util.Comparator; 12 import java.util.TreeSet; 13 import java.util.function.Function; 14 15 /** 16 * Goal is a very simple format for UnicodeSet, that keeps vetters from having to know about \ for 17 * quoting or {...} for strings, or $ for FFFF. We do this by using spaces to always separate 18 * different characters, and special syntax for ranges, escaped hex, and named entities. There are 2 19 * special characters: 20 * 21 * <ul> 22 * <li>➖ a range, but if between two code points 23 * <li>❰ start of hex or named escape, but only if followed by [A-Fa-f0-9]+ ❱ 24 * </ul> 25 * 26 * <b>EBNF</b><br> 27 * result = item (" " item)*<br> 28 * item = string | range | codePoint<br> 29 * string = codePoint+<br> 30 * range = codePoint "➖" codePoint<br> 31 * codepoint = literal // excludes " ", "❰", "❱"<br> 32 * codepoint = "❰" (namedEscape | hex) "❱"<br> 33 * namedEscape = [A-Fa-f0-9]+ // as per CodePointEscape<br> 34 * hex = [A-Fa-f0-9]{2,6} // must be valid code point 0x0..0x10FFFF<br> 35 * ❰ was chosen to be avoid special use of \\u or \x<br> 36 * 37 * @author markdavis 38 */ 39 public class SimpleUnicodeSetFormatter implements FormatterParser<UnicodeSet> { 40 public static Normalizer2 nfc = Normalizer2.getNFCInstance(); 41 42 public static final Comparator<String> BASIC_COLLATOR = 43 (Comparator) CLDRConfig.getInstance().getCollator(); 44 45 public static final int DEFAULT_RANGES_ABOVE = 1024; 46 47 private final Comparator<String> comparator; 48 private final UnicodeSet forceHex; 49 private final int useRangesAbove; 50 getComparator()51 public Comparator<String> getComparator() { 52 return comparator; 53 } 54 getToEscape()55 public UnicodeSet getToEscape() { 56 return forceHex; 57 } 58 getUseRangesAbove()59 public int getUseRangesAbove() { 60 return useRangesAbove; 61 } 62 63 /** 64 * Create a simple formatter, with a comparator for the ordering and a UnicodeSet of characters 65 * that are to use hex. Immutable (if the collator is). 66 * 67 * @param col — collator. The default is BASIC_COLLATOR, which is the root collator. 68 * @param forceHex - UnicodeSet to force to be hex. It will be frozen if not already. Warning: 69 * may not round-trip unless it includes all of CodePointEscaper.getNamedEscapes() 70 * @param useRangesAbove — under this number, there will be no ranges; at or above there may be 71 * ranges, and the collator will be disregarded. 72 */ SimpleUnicodeSetFormatter( Comparator<String> col, UnicodeSet forceHex, int useRangesAbove)73 public SimpleUnicodeSetFormatter( 74 Comparator<String> col, UnicodeSet forceHex, int useRangesAbove) { 75 // collate, but preserve non-equivalents 76 this.comparator = col == null ? BASIC_COLLATOR : ComparatorUtilities.wrapForCodePoints(col); 77 this.forceHex = forceHex == null ? CodePointEscaper.FORCE_ESCAPE : forceHex.freeze(); 78 this.useRangesAbove = useRangesAbove < 0 ? DEFAULT_RANGES_ABOVE : useRangesAbove; 79 } 80 81 public static Comparator<String> getComparatorForLocale(String localeId) { 82 Comparator<String> collator = BASIC_COLLATOR; 83 try { 84 if (localeId != null) { 85 ICUServiceBuilder isb = 86 ICUServiceBuilder.forLocale(CLDRLocale.getInstance(localeId)); 87 collator = (Comparator) isb.getRuleBasedCollator(); 88 } 89 } catch (Exception e) { // for our purposes, better to fall back to the default 90 } 91 return collator; 92 } 93 94 public SimpleUnicodeSetFormatter(Comparator<String> col, UnicodeSet forceHex) { 95 this(col, forceHex, DEFAULT_RANGES_ABOVE); 96 } 97 98 public SimpleUnicodeSetFormatter(Comparator<String> col) { 99 this(col, null, DEFAULT_RANGES_ABOVE); 100 } 101 102 public SimpleUnicodeSetFormatter() { 103 this( 104 (Comparator) ComparatorUtilities.getIcuCollator(ULocale.ROOT, Collator.IDENTICAL), 105 null, 106 DEFAULT_RANGES_ABOVE); 107 } 108 109 static class Lazy { 110 static SimpleUnicodeSetFormatter SINGLETON = new SimpleUnicodeSetFormatter(); 111 112 static SimpleUnicodeSetFormatter getSingleton() { 113 return SINGLETON; 114 } 115 } 116 117 public static SimpleUnicodeSetFormatter getDefault() { 118 return Lazy.getSingleton(); 119 } 120 121 /** Parse as UnicodeSet if of the form […], else parse with default SimpleUnicodeSetFormatter */ 122 public static UnicodeSet parseLenient(String source) { 123 if (source.startsWith("[") && source.endsWith("]")) { 124 return new UnicodeSet(source); 125 } else { 126 return getDefault().parse(source); 127 } 128 } 129 130 @Override 131 public String format(UnicodeSet input) { 132 final boolean allowRanges = input.size() > useRangesAbove; 133 StringBuilder result = new StringBuilder(); 134 Collection<String> sorted = 135 input.addAllTo(allowRanges ? new ArrayList<>() : new TreeSet<>(comparator)); 136 // : transformAndAddAllTo( 137 // input, null, new TreeSet<>(comparator)); // x -> nfc.normalize(x) 138 int firstOfRange = -2; 139 int lastOfRange = -2; 140 for (String item : sorted) { 141 int cp = CharSequences.getSingleCodePoint(item); 142 if (cp == Integer.MAX_VALUE) { // string 143 if (lastOfRange >= 0) { 144 if (firstOfRange != lastOfRange) { 145 result.append( 146 firstOfRange + 1 != lastOfRange 147 ? CodePointEscaper.RANGE_SYNTAX 148 : ' '); 149 appendWithHex(result, lastOfRange, forceHex); 150 } 151 firstOfRange = lastOfRange = -2; 152 } 153 if (result.length() > 0) { 154 result.append(' '); 155 } 156 appendWithHex(result, item, forceHex); 157 } else if (allowRanges && lastOfRange == cp - 1) { 158 ++lastOfRange; 159 } else { 160 if (firstOfRange != lastOfRange) { 161 result.append( 162 firstOfRange + 1 != lastOfRange ? CodePointEscaper.RANGE_SYNTAX : ' '); 163 appendWithHex(result, lastOfRange, forceHex); 164 } 165 if (result.length() > 0) { 166 result.append(' '); 167 } 168 appendWithHex(result, cp, forceHex); 169 firstOfRange = lastOfRange = cp; 170 } 171 } 172 if (firstOfRange != lastOfRange) { 173 result.append(firstOfRange + 1 != lastOfRange ? CodePointEscaper.RANGE_SYNTAX : ' '); 174 appendWithHex(result, lastOfRange, forceHex); 175 } 176 return result.toString(); 177 } 178 appendWithHex( StringBuilder ap, CharSequence s, UnicodeSet forceHex)179 public static final StringBuilder appendWithHex( 180 StringBuilder ap, CharSequence s, UnicodeSet forceHex) { 181 for (int cp : With.codePointArray(s)) { 182 appendWithHex(ap, cp, forceHex); 183 } 184 return ap; 185 } 186 appendWithHex(StringBuilder ap, int cp, UnicodeSet forceHex)187 public static StringBuilder appendWithHex(StringBuilder ap, int cp, UnicodeSet forceHex) { 188 if (!forceHex.contains(cp)) { 189 ap.appendCodePoint(cp); 190 } else { 191 ap.append(CodePointEscaper.codePointToEscaped(cp)); 192 } 193 return ap; 194 } 195 196 static final Splitter SPACE_SPLITTER = Splitter.on(' ').omitEmptyStrings(); 197 198 @Override parse(String input)199 public UnicodeSet parse(String input) { 200 UnicodeSet result = new UnicodeSet(); 201 // Note: could be optimized but probably not worth the effort 202 203 for (String word : SPACE_SPLITTER.split(input)) { 204 // parts between spaces can be single code points, or strings, or ranges of single code 205 // points 206 // points 207 int rangePos = word.indexOf(CodePointEscaper.RANGE_SYNTAX); 208 if (rangePos < 0) { 209 result.add(unescape(word)); 210 } else { 211 int range2Pos = word.indexOf(CodePointEscaper.RANGE_SYNTAX, rangePos + 1); 212 final String before = word.substring(0, rangePos); 213 final String after = word.substring(rangePos + 1); 214 if (rangePos == 0) { 215 throw new IllegalArgumentException( 216 "Must have exactly one character before '➖': " + before + "❌➖" + after); 217 } else if (rangePos == word.length() - 1) { 218 throw new IllegalArgumentException( 219 "Must have exactly one character after '➖': " + before + "➖❌" + after); 220 } else if (range2Pos >= 0) { 221 throw new IllegalArgumentException( 222 "Must not have two '➖' characters: " + before + "➖❌" + after); 223 } 224 // get the code points on either side 225 int first = CharSequences.getSingleCodePoint(unescape(before)); 226 int second = CharSequences.getSingleCodePoint(unescape(after)); 227 if (first == Integer.MAX_VALUE) { 228 throw new IllegalArgumentException( 229 "Must have exactly one character before '➖': " + before + "❌➖" + after); 230 } else if (second == Integer.MAX_VALUE) { 231 throw new IllegalArgumentException( 232 "Must have exactly one character after '➖': " + before + "➖❌" + after); 233 } 234 result.add(first, second); 235 } 236 } 237 return result; 238 } 239 240 /** Unescape a whole string. */ unescape(String word)241 public static CharSequence unescape(String word) { 242 StringBuilder result = new StringBuilder(); 243 for (int i = 0; i < word.length(); ) { 244 int escapeStart = word.indexOf(CodePointEscaper.ESCAPE_START, i); 245 if (escapeStart < 0) { 246 final String toAppend = i == 0 ? word : word.substring(i); 247 final int endStart = toAppend.indexOf(CodePointEscaper.ESCAPE_END); 248 if (endStart >= 0) { 249 throw new IllegalArgumentException( 250 "Missing start escape " 251 + CodePointEscaper.ESCAPE_START 252 + ": " 253 + word.substring(0, endStart) 254 + "❌" 255 + word.substring(endStart)); 256 } 257 // Otherwise we are done, the rest is unescaped. 258 result.append(toAppend); 259 break; 260 } 261 // we have an escape start, so we append what is before that. 262 final String toAppend = word.substring(i, escapeStart); 263 // if we don't find an escape end 264 final int endStart = toAppend.indexOf(CodePointEscaper.ESCAPE_END); 265 if (endStart >= 0) { 266 throw new IllegalArgumentException( 267 "Missing start escape " 268 + CodePointEscaper.ESCAPE_START 269 + ": " 270 + toAppend.substring(0, endStart) 271 + "❌" 272 + toAppend.substring(endStart)); 273 } 274 result.append(toAppend); 275 int interiorStart = escapeStart + 1; 276 int escapeEnd = word.indexOf(CodePointEscaper.ESCAPE_END, interiorStart); 277 if (escapeEnd < 0) { 278 throw new IllegalArgumentException( 279 "Missing end escape " + CodePointEscaper.ESCAPE_END + ": " + word + "❌"); 280 } 281 result.appendCodePoint( 282 CodePointEscaper.rawEscapedToCodePoint( 283 word.substring(interiorStart, escapeEnd))); 284 i = escapeEnd + 1; 285 } 286 return result; 287 } 288 transform(UnicodeSet expected, Function<String, String> function)289 public static UnicodeSet transform(UnicodeSet expected, Function<String, String> function) { 290 UnicodeSet result = new UnicodeSet(); 291 for (String s : expected) { 292 String t = function.apply(s); 293 result.add(t); 294 } 295 return result; 296 } 297 transformAndAddAllTo( UnicodeSet expected, Function<String, String> function, T target)298 public static <T extends Collection<String>> T transformAndAddAllTo( 299 UnicodeSet expected, Function<String, String> function, T target) { 300 for (String s : expected) { 301 String t = function == null ? s : function.apply(s); 302 target.add(t); 303 } 304 return target; 305 } 306 } 307