/**
 * ****************************************************************************** Copyright (C)
 * 1996-2012, International Business Machines Corporation and * others. All Rights Reserved. *
 * ********************************************************************* Author: Mark Davis
 * *********************************************************************
 */
package org.unicode.cldr.util;

import com.ibm.icu.impl.Utility;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.Collator;
import com.ibm.icu.text.StringTransform;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UTF16.StringComparator;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.text.UnicodeSetIterator;
import com.ibm.icu.util.ICUUncheckedIOException;
import com.ibm.icu.util.ULocale;
import java.io.IOException;
import java.text.FieldPosition;
import java.util.Comparator;
import java.util.TreeSet;

/**
 * Provides more flexible formatting of UnicodeSet patterns. <br>
 * Used in the XML for UnicodeSets. <br>
 * For the Survey Tool, should use SimpleUnicodeSetFormatter.java
 *
 * <p>An instance keeps the in-progress output and the pending code point range in fields, so
 * {@link #format(UnicodeSet)} is {@code synchronized}. The setters are not synchronized.
 */
public class UnicodeSetPrettyPrinter implements FormatterParser<UnicodeSet> {
    private static final StringComparator CODEPOINT_ORDER =
            new UTF16.StringComparator(true, false, 0);
    private static final UnicodeSet PATTERN_WHITESPACE =
            new UnicodeSet("[[:Cn:][:Default_Ignorable_Code_Point:][:patternwhitespace:]]")
                    .freeze();
    private static final UnicodeSet SORT_AT_END =
            new UnicodeSet("[[:Cn:][:Cs:][:Co:][:Ideographic:]]").freeze();
    private static final UnicodeSet QUOTED_SYNTAX =
            new UnicodeSet("[\\[\\]\\-\\^\\&\\\\\\{\\}\\$\\:]").addAll(PATTERN_WHITESPACE).freeze();

    /** Sentinel for {@link #firstCodePoint}/{@link #lastCodePoint}: no range is pending. */
    private static final int NO_CODE_POINT = -2;

    // ---- per-run state, reset at the start of every format(UnicodeSet) call ----

    /** True until the first item of the current run has been written. */
    private boolean first = true;

    /** Output buffer for the pattern being built. */
    private StringBuffer target = new StringBuffer();

    /** Start of the pending (not yet written) code point range. */
    private int firstCodePoint = NO_CODE_POINT;

    /** End of the pending (not yet written) code point range. */
    private int lastCodePoint = NO_CODE_POINT;

    /** The item most recently written; the space comparator compares the next item to it. */
    private String lastString = "";

    // ---- configuration ----

    private boolean compressRanges = true;
    private UnicodeSet toQuote = new UnicodeSet(PATTERN_WHITESPACE);
    private StringTransform quoter = null;

    private Comparator<String> ordering;
    private Comparator<String> spaceComp;

    /** Make from root collator obtained from ICU */
    @SuppressWarnings({"rawtypes", "unchecked"}) // Collator is not a Comparator<String>
    public static final UnicodeSetPrettyPrinter ROOT_ICU =
            from(
                    (Comparator) Collator.getInstance(ULocale.ROOT).freeze(),
                    (Comparator)
                            Collator.getInstance(ULocale.ROOT)
                                    .setStrength2(Collator.PRIMARY)
                                    .freeze());

    /**
     * Make from ICU Locale
     *
     * @param localeId locale whose ICU collator is used for ordering (identical strength) and for
     *     spacing (primary strength)
     * @return a new pretty printer with range compression turned off
     */
    @SuppressWarnings({"rawtypes", "unchecked"}) // Collator is not a Comparator<String>
    public static UnicodeSetPrettyPrinter fromIcuLocale(String localeId) {
        Collator col = ComparatorUtilities.getIcuCollator(localeId, Collator.IDENTICAL).freeze();
        Collator spaceCol = col.cloneAsThawed().setStrength2(Collator.PRIMARY).freeze();
        return from((Comparator) col, (Comparator) spaceCol);
    }

    /**
     * Make from CLDR Locale
     *
     * @param localeId locale whose CLDR collator is used for ordering (identical strength) and for
     *     spacing (primary strength)
     * @return a new pretty printer with range compression turned off
     */
    @SuppressWarnings({"rawtypes", "unchecked"}) // Collator is not a Comparator<String>
    public static UnicodeSetPrettyPrinter fromCldrLocale(String localeId) {
        Collator col = ComparatorUtilities.getCldrCollator(localeId, Collator.IDENTICAL).freeze();
        Collator spaceCol = col.cloneAsThawed().setStrength2(Collator.PRIMARY).freeze();
        return from((Comparator) col, (Comparator) spaceCol);
    }

    /**
     * Utility for creating UnicodeSetPrettyPrinter
     *
     * @param col comparator passed to {@link #setOrdering(Comparator)}
     * @param spaceCol comparator passed to {@link #setSpaceComparator(Comparator)}
     * @return a new pretty printer with range compression turned off
     */
    public static UnicodeSetPrettyPrinter from(
            Comparator<String> col, Comparator<String> spaceCol) {
        return new UnicodeSetPrettyPrinter()
                .setOrdering(col)
                .setSpaceComparator(spaceCol)
                .setCompressRanges(false);
    }

    public UnicodeSetPrettyPrinter() {}

    /**
     * @return the transform used to quote characters, or null if none was set
     */
    public StringTransform getQuoter() {
        return quoter;
    }

    /**
     * @param quoter transform applied to characters in the to-quote set; when null, those
     *     characters are written as backslash-u/U hex escapes instead
     * @return this, for chaining
     */
    public UnicodeSetPrettyPrinter setQuoter(StringTransform quoter) {
        this.quoter = quoter;
        return this; // for chaining
    }

    /**
     * @return whether runs of consecutive code points are written as ranges
     */
    public boolean isCompressRanges() {
        return compressRanges;
    }

    /**
     * @param compressRanges if you want abcde instead of a-e, make this false
     * @return this, for chaining
     */
    public UnicodeSetPrettyPrinter setCompressRanges(boolean compressRanges) {
        this.compressRanges = compressRanges;
        return this;
    }

    /**
     * @return the comparator used to order the items, or null if {@link #setOrdering} was never
     *     called
     */
    public Comparator<String> getOrdering() {
        return ordering;
    }

    /**
     * @param ordering the resulting ordering of the list of characters in the pattern; code point
     *     order is used as a tie-breaker, and as the whole ordering when this is null
     * @return this, for chaining
     */
    @SuppressWarnings({"rawtypes", "unchecked"}) // raw parameter kept so callers can pass a Collator
    public UnicodeSetPrettyPrinter setOrdering(Comparator ordering) {
        this.ordering =
                ordering == null
                        ? CODEPOINT_ORDER
                        : new org.unicode.cldr.util.MultiComparator<String>(
                                ordering, CODEPOINT_ORDER);
        return this;
    }

    /**
     * @return the comparator that decides where spaces are inserted, or null if none was set
     */
    public Comparator<String> getSpaceComparator() {
        return spaceComp;
    }

    /**
     * @param spaceComp if the comparison returns non-zero, then a space will be inserted between
     *     characters
     * @return this, for chaining
     */
    @SuppressWarnings({"rawtypes", "unchecked"}) // raw parameter kept so callers can pass a Collator
    public UnicodeSetPrettyPrinter setSpaceComparator(Comparator spaceComp) {
        this.spaceComp = spaceComp;
        return this;
    }

    /**
     * @return the set of characters that are quoted (the live internal set, not a copy)
     */
    public UnicodeSet getToQuote() {
        return toQuote;
    }

    /**
     * a UnicodeSet of extra characters to quote with \\uXXXX-style escaping (will automatically
     * quote pattern whitespace)
     *
     * @param toQuote the extra characters; it is copied, not modified. A null argument leaves the
     *     current setting unchanged.
     * @return this, for chaining
     */
    public UnicodeSetPrettyPrinter setToQuote(UnicodeSet toQuote) {
        if (toQuote != null) {
            toQuote = toQuote.cloneAsThawed();
            toQuote.addAll(PATTERN_WHITESPACE);
            this.toQuote = toQuote;
        }
        return this;
    }

    /**
     * Get the pattern for a particular set.
     *
     * <p>If anything goes wrong while building the pattern, the result of {@code
     * uset.toPattern(false)} is returned instead.
     *
     * @param uset the set to format
     * @return formatted UnicodeSet
     */
    @Override
    public synchronized String format(UnicodeSet uset) {
        try {
            // Start from a clean slate. A previous call that failed part-way may have left a
            // pending range behind; it must not leak into this result.
            first = true;
            firstCodePoint = lastCodePoint = NO_CODE_POINT;
            lastString = "";

            UnicodeSet putAtEnd =
                    new UnicodeSet(uset)
                            .retainAll(SORT_AT_END); // remove all the unassigned gorp for now
            // make sure that comparison separates all strings, even canonically equivalent ones
            TreeSet<String> orderedStrings = new TreeSet<>(ordering);
            for (UnicodeSetIterator it = new UnicodeSetIterator(uset); it.nextRange(); ) {
                if (it.codepoint == UnicodeSetIterator.IS_STRING) {
                    orderedStrings.add(it.string);
                } else {
                    for (int i = it.codepoint; i <= it.codepointEnd; ++i) {
                        if (!putAtEnd.contains(i)) {
                            orderedStrings.add(UTF16.valueOf(i));
                        }
                    }
                }
            }
            target.setLength(0);
            target.append("[");
            for (String item : orderedStrings) {
                appendUnicodeSetItem(item);
            }
            // add back the unassigned gorp; we know that these are only codepoints, not
            // strings, so using it.codepoint is safe
            for (UnicodeSetIterator it = new UnicodeSetIterator(putAtEnd); it.next(); ) {
                appendUnicodeSetItem(it.codepoint);
            }
            flushLast();
            target.append("]");
            return target.toString();
        } catch (Exception e) {
            // Deliberate best-effort: fall back to the plain pattern.
            return uset.toPattern(false);
        }
    }

    /**
     * Appends one item: a string with more than one code point is written immediately; a single
     * code point goes through the range logic.
     */
    private UnicodeSetPrettyPrinter appendUnicodeSetItem(String s) {
        if (UTF16.hasMoreCodePointsThan(s, 1)) {
            flushLast();
            addSpaceAsNeededBefore(s);
            appendQuoted(s);
            lastString = s;
        } else {
            appendUnicodeSetItem(UTF16.charAt(s, 0));
        }
        return this;
    }

    /**
     * Adds a code point to the pending range, writing out the previous range first when this code
     * point does not extend it (or always, when ranges are not compressed).
     */
    private void appendUnicodeSetItem(int cp) {
        if (!compressRanges) {
            flushLast();
        }
        if (cp == lastCodePoint + 1) {
            lastCodePoint = cp; // continue range
        } else { // start range
            flushLast();
            firstCodePoint = lastCodePoint = cp;
        }
    }

    /**
     * Writes a separating space before {@code s} when needed. Nothing is written before the very
     * first item. Otherwise a space is written if the space comparator tells {@code s} apart from
     * the previously written item, or, failing that, if {@code s} starts with an unquoted
     * non-spacing or enclosing mark or a trail surrogate.
     */
    private void addSpaceAsNeededBefore(String s) {
        if (first) {
            first = false;
        } else if (spaceComp != null && spaceComp.compare(s, lastString) != 0) {
            target.append(' ');
        } else {
            int cp = UTF16.charAt(s, 0);
            if (!toQuote.contains(cp) && !QUOTED_SYNTAX.contains(cp)) {
                int type = UCharacter.getType(cp);
                if (type == UCharacter.NON_SPACING_MARK || type == UCharacter.ENCLOSING_MARK) {
                    target.append(' ');
                } else if (type == UCharacter.SURROGATE && cp >= UTF16.TRAIL_SURROGATE_MIN_VALUE) {
                    target.append(' '); // make sure we don't accidentally merge two surrogates
                }
            }
        }
    }

    /** Code point flavor of {@link #addSpaceAsNeededBefore(String)}. */
    private void addSpaceAsNeededBefore(int codepoint) {
        addSpaceAsNeededBefore(UTF16.valueOf(codepoint));
    }

    /**
     * Writes out the pending code point range, if any, and clears it. A single code point is
     * written as is, two adjacent code points as two items, and anything longer as {@code
     * first-last}.
     */
    private void flushLast() {
        if (lastCodePoint < 0) {
            return; // nothing pending
        }
        addSpaceAsNeededBefore(firstCodePoint);
        if (firstCodePoint != lastCodePoint) {
            appendQuoted(firstCodePoint);
            if (firstCodePoint + 1 != lastCodePoint) {
                target.append('-');
            } else {
                // The first code point has just been written, so it is what the second one
                // must be compared against when deciding on a space.
                lastString = UTF16.valueOf(firstCodePoint);
                addSpaceAsNeededBefore(lastCodePoint);
            }
        }
        appendQuoted(lastCodePoint);
        lastString = UTF16.valueOf(lastCodePoint);
        firstCodePoint = lastCodePoint = NO_CODE_POINT;
    }

    /**
     * Writes a string item. If a quoter is set and the string contains a character to quote, the
     * quoter's output for the whole string is written; otherwise the string is written inside
     * braces, one quoted code point at a time.
     */
    private void appendQuoted(String s) {
        // NOTE(review): the quoter branch writes no braces around the string - confirm that the
        // quoter is expected to supply them.
        if (toQuote.containsSome(s) && quoter != null) {
            target.append(quoter.transform(s));
        } else {
            int cp;
            target.append("{");
            for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
                appendQuoted(cp = UTF16.charAt(s, i));
            }
            target.append("}");
        }
    }

    /**
     * Writes one code point. Characters in the to-quote set go through the quoter or, without
     * one, are written as a 4-digit backslash-u or 8-digit backslash-U hex escape. UnicodeSet
     * syntax characters and pattern whitespace get a backslash in front; everything else is
     * written as is.
     *
     * @return this, for chaining
     */
    UnicodeSetPrettyPrinter appendQuoted(int codePoint) {
        if (toQuote.contains(codePoint)) {
            if (quoter != null) {
                target.append(quoter.transform(UTF16.valueOf(codePoint)));
                return this;
            }
            if (codePoint > 0xFFFF) {
                target.append("\\U");
                target.append(Utility.hex(codePoint, 8));
            } else {
                target.append("\\u");
                target.append(Utility.hex(codePoint, 4));
            }
            return this;
        }
        switch (codePoint) {
            case '[': // SET_OPEN:
            case ']': // SET_CLOSE:
            case '-': // HYPHEN:
            case '^': // COMPLEMENT:
            case '&': // INTERSECTION:
            case '\\': // BACKSLASH:
            case '{':
            case '}':
            case '$':
            case ':':
                target.append('\\');
                break;
            default:
                // Escape whitespace
                if (PATTERN_WHITESPACE.contains(codePoint)) {
                    target.append('\\');
                }
                break;
        }
        UTF16.append(target, codePoint);
        return this;
    }

    /**
     * Formats {@code obj} and appends the result to {@code toAppendTo}.
     *
     * @param obj the set to format
     * @param toAppendTo where the pattern is appended
     * @param pos not used
     * @return the Appendable returned by {@code toAppendTo.append(...)}
     * @throws ICUUncheckedIOException if appending fails with an IOException
     */
    public Appendable format(UnicodeSet obj, Appendable toAppendTo, FieldPosition pos) {
        try {
            return toAppendTo.append(format(obj));
        } catch (IOException e) {
            throw new ICUUncheckedIOException(e);
        }
    }

    /**
     * Parses a pattern back into a set.
     *
     * @param formattedString a UnicodeSet pattern
     * @return a new UnicodeSet built from the pattern
     */
    @Override
    public UnicodeSet parse(String formattedString) {
        return new UnicodeSet(formattedString);
    }
}