1 /** 2 ******************************************************************************* 3 * Copyright (C) 1996-2012, International Business Machines Corporation and * 4 * others. All Rights Reserved. * 5 ********************************************************************** 6 * Author: Mark Davis 7 ********************************************************************** 8 */ 9 10 package org.unicode.cldr.util; 11 12 import java.io.IOException; 13 import java.text.FieldPosition; 14 import java.util.Comparator; 15 import java.util.TreeSet; 16 17 import com.ibm.icu.impl.Utility; 18 import com.ibm.icu.lang.UCharacter; 19 import com.ibm.icu.text.StringTransform; 20 import com.ibm.icu.text.UTF16; 21 import com.ibm.icu.text.UTF16.StringComparator; 22 import com.ibm.icu.text.UnicodeSet; 23 import com.ibm.icu.text.UnicodeSetIterator; 24 import com.ibm.icu.util.ICUUncheckedIOException; 25 26 /** Provides more flexible formatting of UnicodeSet patterns. 27 */ 28 public class UnicodeSetPrettyPrinter { 29 private static final StringComparator CODEPOINT_ORDER = new UTF16.StringComparator(true, false, 0); 30 private static final UnicodeSet PATTERN_WHITESPACE = new UnicodeSet("[[:Cn:][:Default_Ignorable_Code_Point:][:patternwhitespace:]]").freeze(); 31 private static final UnicodeSet SORT_AT_END = new UnicodeSet("[[:Cn:][:Cs:][:Co:][:Ideographic:]]").freeze(); 32 private static final UnicodeSet QUOTED_SYNTAX = new UnicodeSet("[\\[\\]\\-\\^\\&\\\\\\{\\}\\$\\:]").addAll(PATTERN_WHITESPACE).freeze(); 33 34 private boolean first = true; 35 private StringBuffer target = new StringBuffer(); 36 private int firstCodePoint = -2; 37 private int lastCodePoint = -2; 38 private boolean compressRanges = true; 39 private String lastString = ""; 40 private UnicodeSet toQuote = new UnicodeSet(PATTERN_WHITESPACE); 41 private StringTransform quoter = null; 42 43 private Comparator<String> ordering; 44 private Comparator<String> spaceComp; 45 UnicodeSetPrettyPrinter()46 public UnicodeSetPrettyPrinter() { 47 } 48 getQuoter()49 public StringTransform getQuoter() { 50 return quoter; 51 } 52 setQuoter(StringTransform quoter)53 public UnicodeSetPrettyPrinter setQuoter(StringTransform quoter) { 54 this.quoter = quoter; 55 return this; // for chaining 56 } 57 isCompressRanges()58 public boolean isCompressRanges() { 59 return compressRanges; 60 } 61 62 /** 63 * @param compressRanges if you want abcde instead of a-e, make this false 64 * @return 65 */ setCompressRanges(boolean compressRanges)66 public UnicodeSetPrettyPrinter setCompressRanges(boolean compressRanges) { 67 this.compressRanges = compressRanges; 68 return this; 69 } 70 getOrdering()71 public Comparator<String> getOrdering() { 72 return ordering; 73 } 74 75 /** 76 * @param ordering the resulting ordering of the list of characters in the pattern 77 * @return 78 */ setOrdering(Comparator ordering)79 public UnicodeSetPrettyPrinter setOrdering(Comparator ordering) { 80 this.ordering = ordering == null ? CODEPOINT_ORDER : new org.unicode.cldr.util.MultiComparator<String>(ordering, CODEPOINT_ORDER); 81 return this; 82 } 83 getSpaceComparator()84 public Comparator<String> getSpaceComparator() { 85 return spaceComp; 86 } 87 88 /** 89 * @param spaceComp if the comparison returns non-zero, then a space will be inserted between characters 90 * @return this, for chaining 91 */ setSpaceComparator(Comparator spaceComp)92 public UnicodeSetPrettyPrinter setSpaceComparator(Comparator spaceComp) { 93 this.spaceComp = spaceComp; 94 return this; 95 } 96 getToQuote()97 public UnicodeSet getToQuote() { 98 return toQuote; 99 } 100 101 /** 102 * a UnicodeSet of extra characters to quote with \\uXXXX-style escaping (will automatically quote pattern whitespace) 103 * @param toQuote 104 */ setToQuote(UnicodeSet toQuote)105 public UnicodeSetPrettyPrinter setToQuote(UnicodeSet toQuote) { 106 if (toQuote != null) { 107 toQuote = toQuote.cloneAsThawed(); 108 toQuote.addAll(PATTERN_WHITESPACE); 109 this.toQuote = toQuote; 110 } 111 return this; 112 } 113 114 /** 115 * Get the pattern for a particular set. 116 * @param uset 117 * @return formatted UnicodeSet 118 */ format(UnicodeSet uset)119 public String format(UnicodeSet uset) { 120 first = true; 121 UnicodeSet putAtEnd = new UnicodeSet(uset).retainAll(SORT_AT_END); // remove all the unassigned gorp for now 122 // make sure that comparison separates all strings, even canonically equivalent ones 123 TreeSet<String> orderedStrings = new TreeSet<>(ordering); 124 for (UnicodeSetIterator it = new UnicodeSetIterator(uset); it.nextRange();) { 125 if (it.codepoint == UnicodeSetIterator.IS_STRING) { 126 orderedStrings.add(it.string); 127 } else { 128 for (int i = it.codepoint; i <= it.codepointEnd; ++i) { 129 if (!putAtEnd.contains(i)) { 130 orderedStrings.add(UTF16.valueOf(i)); 131 } 132 } 133 } 134 } 135 target.setLength(0); 136 target.append("["); 137 for (String item : orderedStrings) { 138 appendUnicodeSetItem(item); 139 } 140 for (UnicodeSetIterator it = new UnicodeSetIterator(putAtEnd); it.next();) { // add back the unassigned gorp 141 appendUnicodeSetItem(it.codepoint); // we know that these are only codepoints, not strings, so this is safe 142 } 143 flushLast(); 144 target.append("]"); 145 String sresult = target.toString(); 146 147 // double check the results. This can be removed once we have more tests. 148 // try { 149 // UnicodeSet doubleCheck = new UnicodeSet(sresult); 150 // if (!uset.equals(doubleCheck)) { 151 // throw new IllegalStateException("Failure to round-trip in pretty-print " + uset + " => " + sresult + Utility.LINE_SEPARATOR + " source-result: " + new UnicodeSet(uset).removeAll(doubleCheck) + Utility.LINE_SEPARATOR + " result-source: " + new UnicodeSet(doubleCheck).removeAll(uset)); 152 // } 153 // } catch (RuntimeException e) { 154 // throw (RuntimeException) new IllegalStateException("Failure to round-trip in pretty-print " + uset).initCause(e); 155 // } 156 return sresult; 157 } 158 appendUnicodeSetItem(String s)159 private UnicodeSetPrettyPrinter appendUnicodeSetItem(String s) { 160 if (UTF16.hasMoreCodePointsThan(s, 1)) { 161 flushLast(); 162 addSpaceAsNeededBefore(s); 163 appendQuoted(s); 164 lastString = s; 165 } else { 166 appendUnicodeSetItem(UTF16.charAt(s, 0)); 167 } 168 return this; 169 } 170 appendUnicodeSetItem(int cp)171 private void appendUnicodeSetItem(int cp) { 172 if (!compressRanges) 173 flushLast(); 174 if (cp == lastCodePoint + 1) { 175 lastCodePoint = cp; // continue range 176 } else { // start range 177 flushLast(); 178 firstCodePoint = lastCodePoint = cp; 179 } 180 } 181 182 /** 183 * 184 */ addSpaceAsNeededBefore(String s)185 private void addSpaceAsNeededBefore(String s) { 186 if (first) { 187 first = false; 188 } else if (spaceComp != null && spaceComp.compare(s, lastString) != 0) { 189 target.append(' '); 190 } else { 191 int cp = UTF16.charAt(s, 0); 192 if (!toQuote.contains(cp) && !QUOTED_SYNTAX.contains(cp)) { 193 int type = UCharacter.getType(cp); 194 if (type == UCharacter.NON_SPACING_MARK || type == UCharacter.ENCLOSING_MARK) { 195 target.append(' '); 196 } else if (type == UCharacter.SURROGATE && cp >= UTF16.TRAIL_SURROGATE_MIN_VALUE) { 197 target.append(' '); // make sure we don't accidentally merge two surrogates 198 } 199 } 200 } 201 } 202 addSpaceAsNeededBefore(int codepoint)203 private void addSpaceAsNeededBefore(int codepoint) { 204 addSpaceAsNeededBefore(UTF16.valueOf(codepoint)); 205 } 206 flushLast()207 private void flushLast() { 208 if (lastCodePoint >= 0) { 209 addSpaceAsNeededBefore(firstCodePoint); 210 if (firstCodePoint != lastCodePoint) { 211 appendQuoted(firstCodePoint); 212 if (firstCodePoint + 1 != lastCodePoint) { 213 target.append('-'); 214 } else { 215 addSpaceAsNeededBefore(lastCodePoint); 216 } 217 } 218 appendQuoted(lastCodePoint); 219 lastString = UTF16.valueOf(lastCodePoint); 220 firstCodePoint = lastCodePoint = -2; 221 } 222 } 223 appendQuoted(String s)224 private void appendQuoted(String s) { 225 if (toQuote.containsSome(s) && quoter != null) { 226 target.append(quoter.transform(s)); 227 } else { 228 int cp; 229 target.append("{"); 230 for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) { 231 appendQuoted(cp = UTF16.charAt(s, i)); 232 } 233 target.append("}"); 234 } 235 } 236 appendQuoted(int codePoint)237 UnicodeSetPrettyPrinter appendQuoted(int codePoint) { 238 if (toQuote.contains(codePoint)) { 239 if (quoter != null) { 240 target.append(quoter.transform(UTF16.valueOf(codePoint))); 241 return this; 242 } 243 if (codePoint > 0xFFFF) { 244 target.append("\\U"); 245 target.append(Utility.hex(codePoint, 8)); 246 } else { 247 target.append("\\u"); 248 target.append(Utility.hex(codePoint, 4)); 249 } 250 return this; 251 } 252 switch (codePoint) { 253 case '[': // SET_OPEN: 254 case ']': // SET_CLOSE: 255 case '-': // HYPHEN: 256 case '^': // COMPLEMENT: 257 case '&': // INTERSECTION: 258 case '\\': //BACKSLASH: 259 case '{': 260 case '}': 261 case '$': 262 case ':': 263 target.append('\\'); 264 break; 265 default: 266 // Escape whitespace 267 if (PATTERN_WHITESPACE.contains(codePoint)) { 268 target.append('\\'); 269 } 270 break; 271 } 272 UTF16.append(target, codePoint); 273 return this; 274 } 275 // Appender append(String s) { 276 // target.append(s); 277 // return this; 278 // } 279 // public String toString() { 280 // return target.toString(); 281 // } 282 format(UnicodeSet obj, Appendable toAppendTo, FieldPosition pos)283 public Appendable format(UnicodeSet obj, Appendable toAppendTo, FieldPosition pos) { 284 try { 285 return toAppendTo.append(format(obj)); 286 } catch (IOException e) { 287 throw new ICUUncheckedIOException(e); 288 } 289 } 290 } 291