• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 package org.unicode.cldr.util;
2 
3 import com.ibm.icu.impl.UnicodeMap;
4 import com.ibm.icu.impl.Utility;
5 import com.ibm.icu.lang.UCharacter;
6 import com.ibm.icu.text.UTF16;
7 import com.ibm.icu.text.UnicodeSet;
8 import java.util.Locale;
9 
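/**
 * Provides a set of code point abbreviations, with conversions to and from code points, including
 * hex. An escaped code point is written as an abbreviation or a hex value between
 * {@link #ESCAPE_START} and {@link #ESCAPE_END}.
 *
 * <p>Typically, to test whether a string could have escapes, use either:
 *
 * <ul>
 *   <li>{@code FORCE_ESCAPE.containsSome(string)}, to see whether an unescaped string has
 *       characters that would be escaped, or
 *   <li>{@code string.indexOf(ESCAPE_START) >= 0}, to see whether a string may already contain
 *       escaped characters.
 * </ul>
 */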
17 public enum CodePointEscaper {
18     // These are characters found in CLDR data fields
19     // The long names don't necessarily match the formal Unicode names
20     TAB(9, "tab"),
21     LF(0xA, "line feed"),
22     CR(0xD, "carriage return"),
23     SP(0x20, "space", "ASCII space"),
24     TSP(0x2009, "thin space", "Aka ‘narrow space’"),
25     NBSP(0xA0, "no-break space", "Same as space, but doesn’t line wrap."),
26 
27     NBTSP(
28             0x202F,
29             "no-break thin space",
30             "Same as thin space, but doesn’t line wrap. Aka 'narrow no-break space'"),
31 
32     WNJ(
33             0x200B,
34             "allow line wrap after, aka ZWSP",
35             "Invisible character allowing a line-wrap afterwards. Also known as ‘ZWSP’."),
36     WJ(
37             0x2060,
38             "prevent line wrap",
39             "Keeps adjacent characters from line-wrapping. Also known as ‘word-joiner’."),
40     SHY(
41             0x00AD,
42             "soft hyphen",
43             "Invisible character allowing a line-wrap afterwards, but appears like a hyphen in most languages."),
44 
45     ZWNJ(0x200C, "cursive non-joiner", "Breaks cursive connections, where possible."),
46     ZWJ(0x200D, "cursive joiner", "Forces cursive connections, if possible."),
47 
48     ALM(
49             0x061C,
50             "Arabic letter mark",
51             "For BIDI, invisible character that behaves like Arabic letter."),
52     LRM(
53             0x200E,
54             "left-right mark",
55             "For BIDI, invisible character that behaves like Hebrew letter."),
56     RLM(0x200F, "right-left mark", "For BIDI, invisible character that behaves like Latin letter."),
57 
58     LRO(0x202D, "left-right override"),
59     RLO(0x202E, "right-left override"),
60     PDF(0x202C, "end override"),
61 
62     BOM(0xFEFF, "byte-order mark"),
63 
64     ANS(0x0600, "Arabic number sign"),
65     ASNS(0x0601, "Arabic sanah sign"),
66     AFM(0x602, "Arabic footnote marker"),
67     ASFS(0x603, "Arabic safha sign"),
68     SAM(0x70F, "Syriac abbreviation mark"),
69     KIAQ(0x17B4, "Khmer inherent aq"),
70     KIAA(0x17B5, "Khmer inherent aa"),
71 
72     RANGE('➖', "range syntax mark", "heavy minus sign"),
73     ESCS('❰', "escape start", "heavy open angle bracket"),
74     ESCE('❱', "escape end", "heavy close angle bracket");
75 
76     public static final char RANGE_SYNTAX = (char) RANGE.getCodePoint();
77     public static final char ESCAPE_START = (char) ESCS.getCodePoint();
78     public static final char ESCAPE_END = (char) ESCE.getCodePoint();
79 
/** Reverse mapping from a code point to the constant that names it; frozen once filled. */
private static final UnicodeMap<CodePointEscaper> _fromCodePoint = new UnicodeMap<>();

static {
    for (final CodePointEscaper escaper : values()) {
        final CodePointEscaper existing = _fromCodePoint.get(escaper.codePoint);
        if (existing == null) {
            _fromCodePoint.put(escaper.codePoint, escaper);
            continue;
        }
        // Two constants declare the same code point: refuse to load the enum.
        throw new IllegalArgumentException(
                "Abbreviation code points collide: " + existing.name() + ", " + escaper.name());
    }
    _fromCodePoint.freeze();
}
97 
/** Nonspacing and enclosing marks (general categories Mn and Me). */
public static final UnicodeSet NON_SPACING = new UnicodeSet("[[:Mn:][:Me:]]").freeze();

/** U+FE0F and the code points U+E0020..U+E007F; these are removed from {@link #FORCE_ESCAPE}. */
public static final UnicodeSet EMOJI_INVISIBLES =
        new UnicodeSet("[\\uFE0F\\U000E0020-\\U000E007F]").freeze();

/**
 * Characters that need escaping: the listed property sets plus every named escape, minus
 * {@link #EMOJI_INVISIBLES}. This is the default set used by {@link #toEscaped(String)}.
 */
public static final UnicodeSet FORCE_ESCAPE;

static {
    final UnicodeSet forced = new UnicodeSet("[[:DI:][:Pat_WS:][:WSpace:][:C:][:Z:]]");
    forced.addAll(getNamedEscapes());
    forced.removeAll(EMOJI_INVISIBLES);
    FORCE_ESCAPE = forced.freeze();
}

/** {@link #FORCE_ESCAPE} together with {@link #NON_SPACING}. */
public static final UnicodeSet FORCE_ESCAPE_WITH_NONSPACING =
        new UnicodeSet(FORCE_ESCAPE).addAll(NON_SPACING).freeze();
112 
/** The code point this constant abbreviates. */
private final int codePoint;

/** Short, user-friendly name. */
private final String shortName;

/** Longer description, or "" when none was supplied. */
private final String description;

/** Creates a constant that has no longer description (stored as ""). */
CodePointEscaper(int codePoint, String shortName) {
    this(codePoint, shortName, "");
}

/** Creates a constant with a short name and a longer description. */
CodePointEscaper(int codePoint, String shortName, String description) {
    this.description = description;
    this.shortName = shortName;
    this.codePoint = codePoint;
}
126 
getNamedEscapes()127     public static final UnicodeSet getNamedEscapes() {
128         return _fromCodePoint.keySet().freeze();
129     }
130 
131     /**
132      * Return long names for this character. The set is immutable and ordered, with the first name
133      * being the most user-friendly.
134      */
getShortName()135     public String getShortName() {
136         return shortName;
137     }
138 
139     /**
140      * Return a longer description, if available; otherwise ""
141      *
142      * @return
143      */
getDescription()144     public String getDescription() {
145         return description;
146     }
147 
148     /** Return the code point for this character. */
getCodePoint()149     public int getCodePoint() {
150         return codePoint;
151     }
152 
153     /** Return the string form of the code point for this character. */
getString()154     public String getString() {
155         return UTF16.valueOf(codePoint);
156     }
157 
158     /** Returns the escaped form from the code point for this enum */
codePointToEscaped()159     public String codePointToEscaped() {
160         return ESCAPE_START + rawCodePointToEscaped(codePoint) + ESCAPE_END;
161     }
162 
163     /** Returns a code point from the escaped form <b>of a single code point</b> */
escapedToCodePoint(String value)164     public static int escapedToCodePoint(String value) {
165         if (value == null || value.isEmpty()) {
166             return 0xFFFD;
167         }
168         if (value.codePointAt(0) != CodePointEscaper.ESCAPE_START
169                 || value.codePointAt(value.length() - 1) != CodePointEscaper.ESCAPE_END) {
170             throw new IllegalArgumentException(
171                     "Must be of the form "
172                             + CodePointEscaper.ESCAPE_START
173                             + "…"
174                             + CodePointEscaper.ESCAPE_END);
175         }
176         return rawEscapedToCodePoint(value.substring(1, value.length() - 1));
177     }
178 
179     /** Returns the escaped form from a code point */
codePointToEscaped(int codePoint)180     public static String codePointToEscaped(int codePoint) {
181         return ESCAPE_START + rawCodePointToEscaped(codePoint) + ESCAPE_END;
182     }
183 
184     /** Returns the escaped form from a string */
toEscaped(String unescaped)185     public static String toEscaped(String unescaped) {
186         return toEscaped(unescaped, FORCE_ESCAPE);
187     }
188 
189     /** Returns the escaped form from a string */
toEscaped(String unescaped, UnicodeSet toEscape)190     public static String toEscaped(String unescaped, UnicodeSet toEscape) {
191         if (unescaped == null) {
192             return null;
193         }
194         StringBuilder result = new StringBuilder();
195         unescaped
196                 .codePoints()
197                 .forEach(
198                         cp -> {
199                             if (!toEscape.contains(cp)) {
200                                 result.appendCodePoint(cp);
201                             } else {
202                                 result.append(codePointToEscaped(cp));
203                             }
204                         });
205         return result.toString();
206     }
207 
getEscaped(int cp, UnicodeSet toEscape)208     public static String getEscaped(int cp, UnicodeSet toEscape) {
209         if (!toEscape.contains(cp)) {
210             return UTF16.valueOf(cp);
211         } else {
212             return codePointToEscaped(cp);
213         }
214     }
215 
216     /** Return unescaped string */
toUnescaped(String escaped)217     public static String toUnescaped(String escaped) {
218         if (escaped == null) {
219             return null;
220         }
221         StringBuilder result = null;
222         int donePart = 0;
223         int found = escaped.indexOf(ESCAPE_START);
224         while (found >= 0) {
225             int foundEnd = escaped.indexOf(ESCAPE_END, found);
226             if (foundEnd < 0) {
227                 throw new IllegalArgumentException(
228                         "Malformed escaped string, missing: " + ESCAPE_END);
229             }
230             if (result == null) {
231                 result = new StringBuilder();
232             }
233             result.append(escaped, donePart, found);
234             donePart = ++foundEnd;
235             result.appendCodePoint(escapedToCodePoint(escaped.substring(found, foundEnd)));
236             found = escaped.indexOf(ESCAPE_START, foundEnd);
237         }
238         return donePart == 0
239                 ? escaped
240                 : result.append(escaped, donePart, escaped.length()).toString();
241     }
242 
243     private static final String HAS_NAME = " ≡ ";
244 
toExample(int codePoint)245     public static String toExample(int codePoint) {
246         CodePointEscaper cpe = _fromCodePoint.get(codePoint);
247         if (cpe == null) { // hex
248             final String name = UCharacter.getExtendedName(codePoint);
249             return codePointToEscaped(codePoint)
250                     + HAS_NAME
251                     + (name != null ? name.toLowerCase() : "");
252         } else {
253             return CodePointEscaper.codePointToEscaped(cpe.codePoint)
254                     + HAS_NAME
255                     + cpe.shortName; // TODO show hover with cpe.description
256         }
257     }
258 
259     /**
260      * Returns a code point from an abbreviation string or hex string <b>without the escape
261      * brackets</b>
262      */
rawEscapedToCodePoint(CharSequence value)263     public static int rawEscapedToCodePoint(CharSequence value) {
264         if (value == null || value.length() == 0) {
265             return 0xFFFD;
266         }
267         try {
268             return valueOf(value.toString().toUpperCase(Locale.ROOT)).codePoint;
269         } catch (Exception e) {
270         }
271         int codePoint;
272         try {
273             codePoint = Integer.parseInt(value.toString(), 16);
274         } catch (NumberFormatException e) {
275             throw new IllegalArgumentException("Not a named or hex escape: ❰" + value + "❌❱");
276         }
277         if (codePoint < 0 || codePoint > 0x10FFFF) {
278             throw new IllegalArgumentException("Illegal code point: ❰" + value + "❌❱");
279         }
280         return codePoint;
281     }
282 
283     /**
284      * Returns an abbreviation string or hex string <b>without the escape brackets</b> from a code
285      * point.
286      */
rawCodePointToEscaped(int codePoint)287     public static String rawCodePointToEscaped(int codePoint) {
288         CodePointEscaper result = CodePointEscaper._fromCodePoint.get(codePoint);
289         return result == null
290                 ? Integer.toString(codePoint, 16).toUpperCase(Locale.ROOT)
291                 : result.toString();
292     }
293 
getHtmlRows( UnicodeSet escapesToShow, String tableOptions, String cellOptions)294     public static final String getHtmlRows(
295             UnicodeSet escapesToShow, String tableOptions, String cellOptions) {
296         if (!escapesToShow.strings().isEmpty()) {
297             throw new IllegalArgumentException("No strings allowed in the unicode set.");
298         }
299         StringBuilder result = new StringBuilder("<table" + tableOptions + ">");
300         UnicodeSet remaining = new UnicodeSet(escapesToShow);
301         String tdPlus = "<td" + cellOptions + ">";
302         for (CodePointEscaper cpe : CodePointEscaper.values()) {
303             int cp = cpe.getCodePoint();
304             remaining.remove(cp);
305             if (escapesToShow.contains(cpe.getCodePoint())) {
306                 final String id = cpe.name();
307                 final String shortName = cpe.getShortName();
308                 final String description = cpe.getDescription();
309                 addREsult(result, tdPlus, id, shortName, description);
310             }
311         }
312         for (String cps : remaining) {
313             int cp = cps.codePointAt(0);
314             final String extendedName = UCharacter.getExtendedName(cp);
315             addREsult(
316                     result,
317                     tdPlus,
318                     Utility.hex(cp, 2),
319                     "",
320                     extendedName == null ? "" : extendedName.toLowerCase());
321         }
322         return result.append("</table>").toString();
323     }
324 
addREsult( StringBuilder result, String tdPlus, final String id, final String shortName, final String description)325     public static void addREsult(
326             StringBuilder result,
327             String tdPlus,
328             final String id,
329             final String shortName,
330             final String description) {
331         result.append("<tr>")
332                 .append(tdPlus)
333                 .append(ESCAPE_START)
334                 .append(id)
335                 .append(ESCAPE_END + "</td>")
336                 .append(tdPlus)
337                 .append(shortName)
338                 .append("</td>")
339                 .append(tdPlus)
340                 .append(description)
341                 .append("</td><tr>");
342     }
343 }
344