package org.unicode.cldr.util;

import com.ibm.icu.impl.UnicodeMap;
import com.ibm.icu.impl.Utility;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
import java.util.Locale;

/**
 * Provides a set of code point abbreviations, with conversions between code points and their
 * escaped forms. An escaped form is either an abbreviation (the name of one of the enum constants
 * below) or an uppercase hex number, wrapped in {@link #ESCAPE_START} … {@link #ESCAPE_END}.
 *
 * <ul>
 *   <li>{@link #toEscaped(String)} replaces every code point in {@link #FORCE_ESCAPE} by its
 *       escaped form.
 *   <li>{@link #toUnescaped(String)} replaces every escaped form by the code point it denotes.
 * </ul>
 */
public enum CodePointEscaper {
    // These are characters found in CLDR data fields
    // The long names don't necessarily match the formal Unicode names
    TAB(9, "tab"),
    LF(0xA, "line feed"),
    CR(0xD, "carriage return"),
    SP(0x20, "space", "ASCII space"),
    TSP(0x2009, "thin space", "Aka ‘narrow space’"),
    NBSP(0xA0, "no-break space", "Same as space, but doesn’t line wrap."),

    NBTSP(
            0x202F,
            "no-break thin space",
            "Same as thin space, but doesn’t line wrap. Aka 'narrow no-break space'"),

    WNJ(
            0x200B,
            "allow line wrap after, aka ZWSP",
            "Invisible character allowing a line-wrap afterwards. Also known as ‘ZWSP’."),
    WJ(
            0x2060,
            "prevent line wrap",
            "Keeps adjacent characters from line-wrapping. Also known as ‘word-joiner’."),
    SHY(
            0x00AD,
            "soft hyphen",
            "Invisible character allowing a line-wrap afterwards, but appears like a hyphen in most languages."),

    ZWNJ(0x200C, "cursive non-joiner", "Breaks cursive connections, where possible."),
    ZWJ(0x200D, "cursive joiner", "Forces cursive connections, if possible."),

    ALM(
            0x061C,
            "Arabic letter mark",
            "For BIDI, invisible character that behaves like Arabic letter."),
    // U+200E is a strong left-to-right character, so it acts like a Latin letter;
    // U+200F is a strong right-to-left character, so it acts like a Hebrew letter.
    LRM(
            0x200E,
            "left-right mark",
            "For BIDI, invisible character that behaves like Latin letter."),
    RLM(0x200F, "right-left mark", "For BIDI, invisible character that behaves like Hebrew letter."),

    LRO(0x202D, "left-right override"),
    RLO(0x202E, "right-left override"),
    PDF(0x202C, "end override"),

    BOM(0xFEFF, "byte-order mark"),

    ANS(0x0600, "Arabic number sign"),
    ASNS(0x0601, "Arabic sanah sign"),
    AFM(0x602, "Arabic footnote marker"),
    ASFS(0x603, "Arabic safha sign"),
    SAM(0x70F, "Syriac abbreviation mark"),
    KIAQ(0x17B4, "Khmer inherent aq"),
    KIAA(0x17B5, "Khmer inherent aa"),

    RANGE('➖', "range syntax mark", "heavy minus sign"),
    ESCS('❰', "escape start", "heavy open angle bracket"),
    ESCE('❱', "escape end", "heavy close angle bracket");

    public static final char RANGE_SYNTAX = (char) RANGE.getCodePoint();
    public static final char ESCAPE_START = (char) ESCS.getCodePoint();
    public static final char ESCAPE_END = (char) ESCE.getCodePoint();

    /** Reverse mapping, from code point to the enum constant that abbreviates it. */
    private static final UnicodeMap<CodePointEscaper> _fromCodePoint = new UnicodeMap<>();

    static {
        for (CodePointEscaper abbr : CodePointEscaper.values()) {
            // Two constants sharing a code point would make the reverse mapping ambiguous.
            CodePointEscaper oldValue = _fromCodePoint.get(abbr.codePoint);
            if (oldValue != null) {
                throw new IllegalArgumentException(
                        "Abbreviation code points collide: "
                                + oldValue.name()
                                + ", "
                                + abbr.name());
            }
            _fromCodePoint.put(abbr.codePoint, abbr);
        }
        _fromCodePoint.freeze();
    }

    /** Emoji-related invisible characters; these are excluded from {@link #FORCE_ESCAPE}. */
    public static final UnicodeSet EMOJI_INVISIBLES =
            new UnicodeSet("[\\uFE0F\\U000E0020-\\U000E007F]").freeze();

    /** Characters that need escaping; the default set used by {@link #toEscaped(String)}. */
    public static final UnicodeSet FORCE_ESCAPE =
            new UnicodeSet("[[:DI:][:Pat_WS:][:WSpace:][:C:][:Z:]]")
                    .addAll(getNamedEscapes())
                    .removeAll(EMOJI_INVISIBLES)
                    .freeze();

    /** Characters with general category Mn or Me. */
    public static final UnicodeSet NON_SPACING = new UnicodeSet("[[:Mn:][:Me:]]").freeze();

    /** The union of {@link #FORCE_ESCAPE} and {@link #NON_SPACING}. */
    public static final UnicodeSet FORCE_ESCAPE_WITH_NONSPACING =
            new UnicodeSet(FORCE_ESCAPE).addAll(NON_SPACING).freeze();

    private final int codePoint;
    private final String shortName;
    private final String description;

    /** Creates a constant with an empty description. */
    private CodePointEscaper(int codePoint, String shortName) {
        this(codePoint, shortName, "");
    }

    private CodePointEscaper(int codePoint, String shortName, String description) {
        this.codePoint = codePoint;
        this.shortName = shortName;
        this.description = description;
    }

    /** Returns a frozen set of the code points that have a named abbreviation in this enum. */
    public static final UnicodeSet getNamedEscapes() {
        return _fromCodePoint.keySet().freeze();
    }

    /**
     * Returns the short, human-readable name for this character. It does not necessarily match
     * the formal Unicode name.
     */
    public String getShortName() {
        return shortName;
    }

    /**
     * Returns a longer description, if available.
     *
     * @return the description, or "" if this constant was declared without one
     */
    public String getDescription() {
        return description;
    }

    /** Return the code point for this character. */
    public int getCodePoint() {
        return codePoint;
    }

    /** Return the string form of the code point for this character. */
    public String getString() {
        return UTF16.valueOf(codePoint);
    }

    /** Returns the escaped form from the code point for this enum */
    public String codePointToEscaped() {
        return codePointToEscaped(codePoint);
    }

    /**
     * Returns a code point from the escaped form <b>of a single code point</b>
     *
     * @param value an escaped form, including the surrounding escape brackets
     * @return the code point, or U+FFFD if {@code value} is null or empty
     * @throws IllegalArgumentException if {@code value} is not wrapped in the escape brackets, or
     *     if its content is neither an abbreviation nor a valid hex code point
     */
    public static int escapedToCodePoint(String value) {
        if (value == null || value.isEmpty()) {
            return 0xFFFD;
        }
        if (value.codePointAt(0) != CodePointEscaper.ESCAPE_START
                || value.codePointAt(value.length() - 1) != CodePointEscaper.ESCAPE_END) {
            throw new IllegalArgumentException(
                    "Must be of the form "
                            + CodePointEscaper.ESCAPE_START
                            + "…"
                            + CodePointEscaper.ESCAPE_END);
        }
        return rawEscapedToCodePoint(value.substring(1, value.length() - 1));
    }

    /** Returns the escaped form from a code point */
    public static String codePointToEscaped(int codePoint) {
        return ESCAPE_START + rawCodePointToEscaped(codePoint) + ESCAPE_END;
    }

    /** Returns the escaped form from a string, escaping the code points in FORCE_ESCAPE */
    public static String toEscaped(String unescaped) {
        return toEscaped(unescaped, FORCE_ESCAPE);
    }

    /**
     * Returns the escaped form from a string
     *
     * @param unescaped the text to escape; may be null
     * @param toEscape the code points to replace by their escaped form
     * @return the escaped text, or null if {@code unescaped} is null
     */
    public static String toEscaped(String unescaped, UnicodeSet toEscape) {
        if (unescaped == null) {
            return null;
        }
        StringBuilder result = new StringBuilder();
        unescaped
                .codePoints()
                .forEach(
                        cp -> {
                            if (toEscape.contains(cp)) {
                                result.append(codePointToEscaped(cp));
                            } else {
                                result.appendCodePoint(cp);
                            }
                        });
        return result.toString();
    }

    /**
     * Returns the escaped form of a code point if it is in {@code toEscape}, otherwise the code
     * point itself as a string.
     */
    public static String getEscaped(int cp, UnicodeSet toEscape) {
        return toEscape.contains(cp) ? codePointToEscaped(cp) : UTF16.valueOf(cp);
    }

    /**
     * Return unescaped string
     *
     * @param escaped text that may contain escaped forms; may be null
     * @return the text with each escaped form replaced by its code point; the input itself when it
     *     contains no escape start, or null if the input is null
     * @throws IllegalArgumentException if an escape start has no matching escape end, or if the
     *     content of an escape is neither an abbreviation nor a valid hex code point
     */
    public static String toUnescaped(String escaped) {
        if (escaped == null) {
            return null;
        }
        StringBuilder result = null; // allocated lazily, only once an escape is found
        int donePart = 0; // index just past the last escape already copied
        int found = escaped.indexOf(ESCAPE_START);
        while (found >= 0) {
            int foundEnd = escaped.indexOf(ESCAPE_END, found);
            if (foundEnd < 0) {
                throw new IllegalArgumentException(
                        "Malformed escaped string, missing: " + ESCAPE_END);
            }
            if (result == null) {
                result = new StringBuilder();
            }
            // Copy the literal text before the escape, then the decoded code point.
            result.append(escaped, donePart, found);
            donePart = ++foundEnd;
            result.appendCodePoint(escapedToCodePoint(escaped.substring(found, foundEnd)));
            found = escaped.indexOf(ESCAPE_START, foundEnd);
        }
        return donePart == 0
                ? escaped
                : result.append(escaped, donePart, escaped.length()).toString();
    }

    private static final String HAS_NAME = " ≡ ";

    /**
     * Returns an example line for a code point: its escaped form, followed by " ≡ " and either the
     * short name (for code points with an abbreviation) or the lowercased extended character name.
     */
    public static String toExample(int codePoint) {
        CodePointEscaper cpe = _fromCodePoint.get(codePoint);
        if (cpe == null) { // hex
            final String name = UCharacter.getExtendedName(codePoint);
            // Locale.ROOT keeps the lowercasing independent of the default locale.
            return codePointToEscaped(codePoint)
                    + HAS_NAME
                    + (name != null ? name.toLowerCase(Locale.ROOT) : "");
        } else {
            return codePointToEscaped(cpe.codePoint)
                    + HAS_NAME
                    + cpe.shortName; // TODO show hover with cpe.description
        }
    }

    /**
     * Returns a code point from an abbreviation string or hex string <b>without the escape
     * brackets</b>
     *
     * @param value an abbreviation (matched case-insensitively) or a hex number
     * @return the code point, or U+FFFD if {@code value} is null or empty
     * @throws IllegalArgumentException if {@code value} is neither an abbreviation nor a hex
     *     number, or if the hex number is outside 0..0x10FFFF
     */
    public static int rawEscapedToCodePoint(CharSequence value) {
        if (value == null || value.length() == 0) {
            return 0xFFFD;
        }
        final String text = value.toString();
        try {
            return valueOf(text.toUpperCase(Locale.ROOT)).codePoint;
        } catch (IllegalArgumentException ignored) {
            // Not the name of an abbreviation; fall through and try to read it as hex.
        }
        int codePoint;
        try {
            codePoint = Integer.parseInt(text, 16);
        } catch (NumberFormatException e) {
            throw new IllegalArgumentException(
                    "Not a named or hex escape: ❰" + value + "❌❱", e);
        }
        if (codePoint < 0 || codePoint > 0x10FFFF) {
            throw new IllegalArgumentException("Illegal code point: ❰" + value + "❌❱");
        }
        return codePoint;
    }

    /**
     * Returns an abbreviation string or hex string <b>without the escape brackets</b> from a code
     * point.
     */
    public static String rawCodePointToEscaped(int codePoint) {
        CodePointEscaper result = CodePointEscaper._fromCodePoint.get(codePoint);
        return result == null
                ? Integer.toString(codePoint, 16).toUpperCase(Locale.ROOT)
                : result.name();
    }

    /**
     * Builds an HTML table with one row per code point in {@code escapesToShow}: first the code
     * points that have an abbreviation, in enum declaration order, then the remaining ones shown
     * as hex with their lowercased extended character name as the description.
     *
     * @param escapesToShow the code points to list; must not contain strings
     * @param tableOptions text inserted verbatim after {@code <table}
     * @param cellOptions text inserted verbatim after {@code <td}
     * @return the HTML for the table
     * @throws IllegalArgumentException if {@code escapesToShow} contains strings
     */
    public static final String getHtmlRows(
            UnicodeSet escapesToShow, String tableOptions, String cellOptions) {
        if (!escapesToShow.strings().isEmpty()) {
            throw new IllegalArgumentException("No strings allowed in the unicode set.");
        }
        StringBuilder result = new StringBuilder("<table" + tableOptions + ">");
        UnicodeSet remaining = new UnicodeSet(escapesToShow);
        String tdPlus = "<td" + cellOptions + ">";
        for (CodePointEscaper cpe : CodePointEscaper.values()) {
            int cp = cpe.getCodePoint();
            remaining.remove(cp);
            if (escapesToShow.contains(cp)) {
                addREsult(result, tdPlus, cpe.name(), cpe.getShortName(), cpe.getDescription());
            }
        }
        // Whatever is left has no abbreviation, so it is listed by hex value.
        for (String cps : remaining) {
            int cp = cps.codePointAt(0);
            final String extendedName = UCharacter.getExtendedName(cp);
            // NOTE(review): extended names of unnamed code points appear to be wrapped in angle
            // brackets (e.g. for controls) and are emitted here without HTML escaping — confirm
            // whether callers rely on that.
            addREsult(
                    result,
                    tdPlus,
                    Utility.hex(cp, 2),
                    "",
                    extendedName == null ? "" : extendedName.toLowerCase(Locale.ROOT));
        }
        return result.append("</table>").toString();
    }

    /**
     * Appends one table row with three cells: the id wrapped in the escape brackets, the short
     * name, and the description. The values are appended as given, without HTML escaping.
     *
     * @param result the builder to append to
     * @param tdPlus the opening cell tag, including any cell options
     * @param id the abbreviation or hex value, without escape brackets
     * @param shortName the short name, possibly empty
     * @param description the description, possibly empty
     */
    public static void addREsult(
            StringBuilder result,
            String tdPlus,
            final String id,
            final String shortName,
            final String description) {
        result.append("<tr>")
                .append(tdPlus)
                .append(ESCAPE_START)
                .append(id)
                .append(ESCAPE_END + "</td>")
                .append(tdPlus)
                .append(shortName)
                .append("</td>")
                .append(tdPlus)
                .append(description)
                .append("</td></tr>"); // close the row
    }
}