• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * ****************************************************************************** Copyright (C)
3  * 1996-2012, International Business Machines Corporation and * others. All Rights Reserved. *
4  * ********************************************************************* Author: Mark Davis
5  * *********************************************************************
6  */
7 package org.unicode.cldr.util;
8 
9 import com.ibm.icu.impl.Utility;
10 import com.ibm.icu.lang.UCharacter;
11 import com.ibm.icu.text.Collator;
12 import com.ibm.icu.text.StringTransform;
13 import com.ibm.icu.text.UTF16;
14 import com.ibm.icu.text.UTF16.StringComparator;
15 import com.ibm.icu.text.UnicodeSet;
16 import com.ibm.icu.text.UnicodeSetIterator;
17 import com.ibm.icu.util.ICUUncheckedIOException;
18 import com.ibm.icu.util.ULocale;
19 import java.io.IOException;
20 import java.text.FieldPosition;
21 import java.util.Comparator;
22 import java.util.TreeSet;
23 
24 /**
25  * Provides more flexible formatting of UnicodeSet patterns. <br>
26  * Used in the XML for UnicodeSets. <br>
27  * For the Survey Tool, should use SimpleUnicodeSetFormatter.java
28  */
29 public class UnicodeSetPrettyPrinter implements FormatterParser<UnicodeSet> {
30     private static final StringComparator CODEPOINT_ORDER =
31             new UTF16.StringComparator(true, false, 0);
32     private static final UnicodeSet PATTERN_WHITESPACE =
33             new UnicodeSet("[[:Cn:][:Default_Ignorable_Code_Point:][:patternwhitespace:]]")
34                     .freeze();
35     private static final UnicodeSet SORT_AT_END =
36             new UnicodeSet("[[:Cn:][:Cs:][:Co:][:Ideographic:]]").freeze();
37     private static final UnicodeSet QUOTED_SYNTAX =
38             new UnicodeSet("[\\[\\]\\-\\^\\&\\\\\\{\\}\\$\\:]").addAll(PATTERN_WHITESPACE).freeze();
39 
40     private boolean first = true;
41     private StringBuffer target = new StringBuffer();
42     private int firstCodePoint = -2;
43     private int lastCodePoint = -2;
44     private boolean compressRanges = true;
45     private String lastString = "";
46     private UnicodeSet toQuote = new UnicodeSet(PATTERN_WHITESPACE);
47     private StringTransform quoter = null;
48 
49     private Comparator<String> ordering;
50     private Comparator<String> spaceComp;
51 
52     /** Make from root collator obtained from ICU */
53     public static final UnicodeSetPrettyPrinter ROOT_ICU =
54             from(
55                     (Comparator) Collator.getInstance(ULocale.ROOT).freeze(),
56                     (Comparator)
57                             Collator.getInstance(ULocale.ROOT)
58                                     .setStrength2(Collator.PRIMARY)
59                                     .freeze());
60 
61     /** Make from ICU Locale */
fromIcuLocale(String localeId)62     public static UnicodeSetPrettyPrinter fromIcuLocale(String localeId) {
63         Collator col = ComparatorUtilities.getIcuCollator(localeId, Collator.IDENTICAL).freeze();
64         Collator spaceCol = col.cloneAsThawed().setStrength2(Collator.PRIMARY).freeze();
65         return from((Comparator) col, (Comparator) spaceCol);
66     }
67 
68     /** Make from CLDR Locale */
fromCldrLocale(String localeId)69     public static UnicodeSetPrettyPrinter fromCldrLocale(String localeId) {
70         Collator col = ComparatorUtilities.getCldrCollator(localeId, Collator.IDENTICAL).freeze();
71         Collator spaceCol = col.cloneAsThawed().setStrength2(Collator.PRIMARY).freeze();
72         return from((Comparator) col, (Comparator) spaceCol);
73     }
74 
75     /** Utility for creating UnicodeSetPrettyPrinter */
from( Comparator<String> col, Comparator<String> spaceCol)76     public static UnicodeSetPrettyPrinter from(
77             Comparator<String> col, Comparator<String> spaceCol) {
78         return new UnicodeSetPrettyPrinter()
79                 .setOrdering(col)
80                 .setSpaceComparator(spaceCol)
81                 .setCompressRanges(false);
82     }
83 
UnicodeSetPrettyPrinter()84     public UnicodeSetPrettyPrinter() {}
85 
getQuoter()86     public StringTransform getQuoter() {
87         return quoter;
88     }
89 
setQuoter(StringTransform quoter)90     public UnicodeSetPrettyPrinter setQuoter(StringTransform quoter) {
91         this.quoter = quoter;
92         return this; // for chaining
93     }
94 
isCompressRanges()95     public boolean isCompressRanges() {
96         return compressRanges;
97     }
98 
99     /**
100      * @param compressRanges if you want abcde instead of a-e, make this false
101      * @return
102      */
setCompressRanges(boolean compressRanges)103     public UnicodeSetPrettyPrinter setCompressRanges(boolean compressRanges) {
104         this.compressRanges = compressRanges;
105         return this;
106     }
107 
getOrdering()108     public Comparator<String> getOrdering() {
109         return ordering;
110     }
111 
112     /**
113      * @param ordering the resulting ordering of the list of characters in the pattern
114      * @return
115      */
setOrdering(Comparator ordering)116     public UnicodeSetPrettyPrinter setOrdering(Comparator ordering) {
117         this.ordering =
118                 ordering == null
119                         ? CODEPOINT_ORDER
120                         : new org.unicode.cldr.util.MultiComparator<String>(
121                                 ordering, CODEPOINT_ORDER);
122         return this;
123     }
124 
getSpaceComparator()125     public Comparator<String> getSpaceComparator() {
126         return spaceComp;
127     }
128 
129     /**
130      * @param spaceComp if the comparison returns non-zero, then a space will be inserted between
131      *     characters
132      * @return this, for chaining
133      */
setSpaceComparator(Comparator spaceComp)134     public UnicodeSetPrettyPrinter setSpaceComparator(Comparator spaceComp) {
135         this.spaceComp = spaceComp;
136         return this;
137     }
138 
getToQuote()139     public UnicodeSet getToQuote() {
140         return toQuote;
141     }
142 
143     /**
144      * a UnicodeSet of extra characters to quote with \\uXXXX-style escaping (will automatically
145      * quote pattern whitespace)
146      *
147      * @param toQuote
148      */
setToQuote(UnicodeSet toQuote)149     public UnicodeSetPrettyPrinter setToQuote(UnicodeSet toQuote) {
150         if (toQuote != null) {
151             toQuote = toQuote.cloneAsThawed();
152             toQuote.addAll(PATTERN_WHITESPACE);
153             this.toQuote = toQuote;
154         }
155         return this;
156     }
157 
158     /**
159      * Get the pattern for a particular set.
160      *
161      * @param uset
162      * @return formatted UnicodeSet
163      */
164     @Override
format(UnicodeSet uset)165     public synchronized String format(UnicodeSet uset) {
166         try {
167             first = true;
168             UnicodeSet putAtEnd =
169                     new UnicodeSet(uset)
170                             .retainAll(SORT_AT_END); // remove all the unassigned gorp for now
171             // make sure that comparison separates all strings, even canonically equivalent ones
172             TreeSet<String> orderedStrings = new TreeSet<>(ordering);
173             for (UnicodeSetIterator it = new UnicodeSetIterator(uset); it.nextRange(); ) {
174                 if (it.codepoint == UnicodeSetIterator.IS_STRING) {
175                     orderedStrings.add(it.string);
176                 } else {
177                     for (int i = it.codepoint; i <= it.codepointEnd; ++i) {
178                         if (!putAtEnd.contains(i)) {
179                             orderedStrings.add(UTF16.valueOf(i));
180                         }
181                     }
182                 }
183             }
184             target.setLength(0);
185             target.append("[");
186             for (String item : orderedStrings) {
187                 appendUnicodeSetItem(item);
188             }
189             for (UnicodeSetIterator it = new UnicodeSetIterator(putAtEnd);
190                     it.next(); ) { // add back the unassigned gorp
191                 appendUnicodeSetItem(
192                         it.codepoint); // we know that these are only codepoints, not strings, so
193                 // this
194                 // is safe
195             }
196             flushLast();
197             target.append("]");
198             String sresult = target.toString();
199 
200             return sresult;
201         } catch (Exception e) {
202             return uset.toPattern(false);
203         }
204     }
205 
appendUnicodeSetItem(String s)206     private UnicodeSetPrettyPrinter appendUnicodeSetItem(String s) {
207         if (UTF16.hasMoreCodePointsThan(s, 1)) {
208             flushLast();
209             addSpaceAsNeededBefore(s);
210             appendQuoted(s);
211             lastString = s;
212         } else {
213             appendUnicodeSetItem(UTF16.charAt(s, 0));
214         }
215         return this;
216     }
217 
appendUnicodeSetItem(int cp)218     private void appendUnicodeSetItem(int cp) {
219         if (!compressRanges) flushLast();
220         if (cp == lastCodePoint + 1) {
221             lastCodePoint = cp; // continue range
222         } else { // start range
223             flushLast();
224             firstCodePoint = lastCodePoint = cp;
225         }
226     }
227 
228     /** */
addSpaceAsNeededBefore(String s)229     private void addSpaceAsNeededBefore(String s) {
230         if (first) {
231             first = false;
232         } else if (spaceComp != null && spaceComp.compare(s, lastString) != 0) {
233             target.append(' ');
234         } else {
235             int cp = UTF16.charAt(s, 0);
236             if (!toQuote.contains(cp) && !QUOTED_SYNTAX.contains(cp)) {
237                 int type = UCharacter.getType(cp);
238                 if (type == UCharacter.NON_SPACING_MARK || type == UCharacter.ENCLOSING_MARK) {
239                     target.append(' ');
240                 } else if (type == UCharacter.SURROGATE && cp >= UTF16.TRAIL_SURROGATE_MIN_VALUE) {
241                     target.append(' '); // make sure we don't accidentally merge two surrogates
242                 }
243             }
244         }
245     }
246 
addSpaceAsNeededBefore(int codepoint)247     private void addSpaceAsNeededBefore(int codepoint) {
248         addSpaceAsNeededBefore(UTF16.valueOf(codepoint));
249     }
250 
flushLast()251     private void flushLast() {
252         if (lastCodePoint >= 0) {
253             addSpaceAsNeededBefore(firstCodePoint);
254             if (firstCodePoint != lastCodePoint) {
255                 appendQuoted(firstCodePoint);
256                 if (firstCodePoint + 1 != lastCodePoint) {
257                     target.append('-');
258                 } else {
259                     addSpaceAsNeededBefore(lastCodePoint);
260                 }
261             }
262             appendQuoted(lastCodePoint);
263             lastString = UTF16.valueOf(lastCodePoint);
264             firstCodePoint = lastCodePoint = -2;
265         }
266     }
267 
appendQuoted(String s)268     private void appendQuoted(String s) {
269         if (toQuote.containsSome(s) && quoter != null) {
270             target.append(quoter.transform(s));
271         } else {
272             int cp;
273             target.append("{");
274             for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
275                 appendQuoted(cp = UTF16.charAt(s, i));
276             }
277             target.append("}");
278         }
279     }
280 
appendQuoted(int codePoint)281     UnicodeSetPrettyPrinter appendQuoted(int codePoint) {
282         if (toQuote.contains(codePoint)) {
283             if (quoter != null) {
284                 target.append(quoter.transform(UTF16.valueOf(codePoint)));
285                 return this;
286             }
287             if (codePoint > 0xFFFF) {
288                 target.append("\\U");
289                 target.append(Utility.hex(codePoint, 8));
290             } else {
291                 target.append("\\u");
292                 target.append(Utility.hex(codePoint, 4));
293             }
294             return this;
295         }
296         switch (codePoint) {
297             case '[': // SET_OPEN:
298             case ']': // SET_CLOSE:
299             case '-': // HYPHEN:
300             case '^': // COMPLEMENT:
301             case '&': // INTERSECTION:
302             case '\\': // BACKSLASH:
303             case '{':
304             case '}':
305             case '$':
306             case ':':
307                 target.append('\\');
308                 break;
309             default:
310                 // Escape whitespace
311                 if (PATTERN_WHITESPACE.contains(codePoint)) {
312                     target.append('\\');
313                 }
314                 break;
315         }
316         UTF16.append(target, codePoint);
317         return this;
318     }
319     //  Appender append(String s) {
320     //  target.append(s);
321     //  return this;
322     //  }
323     //  public String toString() {
324     //  return target.toString();
325     //  }
326 
format(UnicodeSet obj, Appendable toAppendTo, FieldPosition pos)327     public Appendable format(UnicodeSet obj, Appendable toAppendTo, FieldPosition pos) {
328         try {
329             return toAppendTo.append(format(obj));
330         } catch (IOException e) {
331             throw new ICUUncheckedIOException(e);
332         }
333     }
334 
335     @Override
parse(String formattedString)336     public UnicodeSet parse(String formattedString) {
337         return new UnicodeSet(formattedString);
338     }
339 }
340