1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * Copyright (C) 2006-2009, Google, International Business Machines Corporation * 6 * and others. All Rights Reserved. * 7 ******************************************************************************* 8 */ 9 package com.ibm.icu.impl; 10 11 import com.ibm.icu.text.UTF16; 12 import com.ibm.icu.text.UnicodeSet; 13 14 /** 15 * A simple parsing class for patterns and rules. Handles '...' quotations, \\uxxxx and \\Uxxxxxxxx, and symple syntax. 16 * The '' (two quotes) is treated as a single quote, inside or outside a quote 17 * <ul> 18 * <li>Any ignorable characters are ignored in parsing.</li> 19 * <li>Any syntax characters are broken into separate tokens</li> 20 * <li>Quote characters can be specified: '...', "...", and \x </li> 21 * <li>Other characters are treated as literals</li> 22 * </ul> 23 */ 24 public class PatternTokenizer { 25 // settings used in the interpretation of the pattern 26 private UnicodeSet ignorableCharacters = new UnicodeSet(); 27 private UnicodeSet syntaxCharacters = new UnicodeSet(); 28 private UnicodeSet extraQuotingCharacters = new UnicodeSet(); 29 private UnicodeSet escapeCharacters = new UnicodeSet(); 30 private boolean usingSlash = false; 31 private boolean usingQuote = false; 32 33 // transient data, set when needed. Null it out for any changes in the above fields. 34 private transient UnicodeSet needingQuoteCharacters = null; 35 36 // data about the current pattern being parsed. start gets moved as we go along. 37 private int start; 38 private int limit; 39 private String pattern; 40 getIgnorableCharacters()41 public UnicodeSet getIgnorableCharacters() { 42 return (UnicodeSet) ignorableCharacters.clone(); 43 } 44 /** 45 * Sets the characters to be ignored in parsing, eg new UnicodeSet("[:pattern_whitespace:]"); 46 * @param ignorableCharacters Characters to be ignored. 47 * @return A PatternTokenizer object in which characters are specified as ignored characters. 48 */ setIgnorableCharacters(UnicodeSet ignorableCharacters)49 public PatternTokenizer setIgnorableCharacters(UnicodeSet ignorableCharacters) { 50 this.ignorableCharacters = (UnicodeSet) ignorableCharacters.clone(); 51 needingQuoteCharacters = null; 52 return this; 53 } getSyntaxCharacters()54 public UnicodeSet getSyntaxCharacters() { 55 return (UnicodeSet) syntaxCharacters.clone(); 56 } getExtraQuotingCharacters()57 public UnicodeSet getExtraQuotingCharacters() { 58 return (UnicodeSet) extraQuotingCharacters.clone(); 59 } 60 /** 61 * Sets the characters to be interpreted as syntax characters in parsing, eg new UnicodeSet("[:pattern_syntax:]") 62 * @param syntaxCharacters Characters to be set as syntax characters. 63 * @return A PatternTokenizer object in which characters are specified as syntax characters. 64 */ setSyntaxCharacters(UnicodeSet syntaxCharacters)65 public PatternTokenizer setSyntaxCharacters(UnicodeSet syntaxCharacters) { 66 this.syntaxCharacters = (UnicodeSet) syntaxCharacters.clone(); 67 needingQuoteCharacters = null; 68 return this; 69 } 70 /** 71 * Sets the extra characters to be quoted in literals 72 * @param syntaxCharacters Characters to be set as extra quoting characters. 73 * @return A PatternTokenizer object in which characters are specified as extra quoting characters. 74 */ setExtraQuotingCharacters(UnicodeSet syntaxCharacters)75 public PatternTokenizer setExtraQuotingCharacters(UnicodeSet syntaxCharacters) { 76 this.extraQuotingCharacters = (UnicodeSet) syntaxCharacters.clone(); 77 needingQuoteCharacters = null; 78 return this; 79 } 80 getEscapeCharacters()81 public UnicodeSet getEscapeCharacters() { 82 return (UnicodeSet) escapeCharacters.clone(); 83 } 84 /** 85 * Set characters to be escaped in literals, in quoteLiteral and normalize, eg new UnicodeSet("[^\\u0020-\\u007E]"); 86 * @param escapeCharacters Characters to be set as escape characters. 87 * @return A PatternTokenizer object in which characters are specified as escape characters. 88 */ setEscapeCharacters(UnicodeSet escapeCharacters)89 public PatternTokenizer setEscapeCharacters(UnicodeSet escapeCharacters) { 90 this.escapeCharacters = (UnicodeSet) escapeCharacters.clone(); 91 return this; 92 } isUsingQuote()93 public boolean isUsingQuote() { 94 return usingQuote; 95 } setUsingQuote(boolean usingQuote)96 public PatternTokenizer setUsingQuote(boolean usingQuote) { 97 this.usingQuote = usingQuote; 98 needingQuoteCharacters = null; 99 return this; 100 } isUsingSlash()101 public boolean isUsingSlash() { 102 return usingSlash; 103 } setUsingSlash(boolean usingSlash)104 public PatternTokenizer setUsingSlash(boolean usingSlash) { 105 this.usingSlash = usingSlash; 106 needingQuoteCharacters = null; 107 return this; 108 } 109 // public UnicodeSet getQuoteCharacters() { 110 // return (UnicodeSet) quoteCharacters.clone(); 111 // } 112 // public PatternTokenizer setQuoteCharacters(UnicodeSet quoteCharacters) { 113 // this.quoteCharacters = (UnicodeSet) quoteCharacters.clone(); 114 // needingQuoteCharacters = null; 115 // return this; 116 // } getLimit()117 public int getLimit() { 118 return limit; 119 } setLimit(int limit)120 public PatternTokenizer setLimit(int limit) { 121 this.limit = limit; 122 return this; 123 } getStart()124 public int getStart() { 125 return start; 126 } setStart(int start)127 public PatternTokenizer setStart(int start) { 128 this.start = start; 129 return this; 130 } 131 setPattern(CharSequence pattern)132 public PatternTokenizer setPattern(CharSequence pattern) { 133 return setPattern(pattern.toString()); 134 } 135 setPattern(String pattern)136 public PatternTokenizer setPattern(String pattern) { 137 if (pattern == null) { 138 throw new IllegalArgumentException("Inconsistent arguments"); 139 } 140 this.start = 0; 141 this.limit = pattern.length(); 142 this.pattern = pattern; 143 return this; 144 } 145 146 public static final char SINGLE_QUOTE = '\''; 147 public static final char BACK_SLASH = '\\'; 148 private static int NO_QUOTE = -1, IN_QUOTE = -2; 149 quoteLiteral(CharSequence string)150 public String quoteLiteral(CharSequence string) { 151 return quoteLiteral(string.toString()); 152 } 153 154 /** 155 * Quote a literal string, using the available settings. Thus syntax characters, quote characters, and ignorable characters will be put into quotes. 156 * @param string String passed to quote a literal string. 157 * @return A string using the available settings will place syntax, quote, or ignorable characters into quotes. 158 */ quoteLiteral(String string)159 public String quoteLiteral(String string) { 160 if (needingQuoteCharacters == null) { 161 needingQuoteCharacters = new UnicodeSet().addAll(syntaxCharacters).addAll(ignorableCharacters).addAll(extraQuotingCharacters); // .addAll(quoteCharacters) 162 if (usingSlash) needingQuoteCharacters.add(BACK_SLASH); 163 if (usingQuote) needingQuoteCharacters.add(SINGLE_QUOTE); 164 } 165 StringBuffer result = new StringBuffer(); 166 int quotedChar = NO_QUOTE; 167 int cp; 168 for (int i = 0; i < string.length(); i += UTF16.getCharCount(cp)) { 169 cp = UTF16.charAt(string, i); 170 if (escapeCharacters.contains(cp)) { 171 // we may have to fix up previous characters 172 if (quotedChar == IN_QUOTE) { 173 result.append(SINGLE_QUOTE); 174 quotedChar = NO_QUOTE; 175 } 176 appendEscaped(result, cp); 177 continue; 178 } 179 180 if (needingQuoteCharacters.contains(cp)) { 181 // if we have already started a quote 182 if (quotedChar == IN_QUOTE) { 183 UTF16.append(result, cp); 184 if (usingQuote && cp == SINGLE_QUOTE) { // double it 185 result.append(SINGLE_QUOTE); 186 } 187 continue; 188 } 189 // otherwise not already in quote 190 if (usingSlash) { 191 result.append(BACK_SLASH); 192 UTF16.append(result, cp); 193 continue; 194 } 195 if (usingQuote) { 196 if (cp == SINGLE_QUOTE) { // double it and continue 197 result.append(SINGLE_QUOTE); 198 result.append(SINGLE_QUOTE); 199 continue; 200 } 201 result.append(SINGLE_QUOTE); 202 UTF16.append(result, cp); 203 quotedChar = IN_QUOTE; 204 continue; 205 } 206 // we have no choice but to use \\u or \\U 207 appendEscaped(result, cp); 208 continue; 209 } 210 // otherwise cp doesn't need quoting 211 // we may have to fix up previous characters 212 if (quotedChar == IN_QUOTE) { 213 result.append(SINGLE_QUOTE); 214 quotedChar = NO_QUOTE; 215 } 216 UTF16.append(result, cp); 217 } 218 // all done. 219 // we may have to fix up previous characters 220 if (quotedChar == IN_QUOTE) { 221 result.append(SINGLE_QUOTE); 222 } 223 return result.toString(); 224 } 225 appendEscaped(StringBuffer result, int cp)226 private void appendEscaped(StringBuffer result, int cp) { 227 if (cp <= 0xFFFF) { 228 result.append("\\u").append(Utility.hex(cp,4)); 229 } else { 230 result.append("\\U").append(Utility.hex(cp,8)); 231 } 232 } 233 normalize()234 public String normalize() { 235 int oldStart = start; 236 StringBuffer result = new StringBuffer(); 237 StringBuffer buffer = new StringBuffer(); 238 while (true) { 239 buffer.setLength(0); 240 int status = next(buffer); 241 if (status == DONE) { 242 start = oldStart; 243 return result.toString(); 244 } 245 if (status != SYNTAX) { 246 result.append(quoteLiteral(buffer)); 247 } else { 248 result.append(buffer); 249 } 250 } 251 } 252 253 public static final int DONE = 0, SYNTAX = 1, LITERAL = 2, BROKEN_QUOTE = 3, BROKEN_ESCAPE = 4, UNKNOWN = 5; 254 255 private static final int AFTER_QUOTE = -1, NONE = 0, START_QUOTE = 1, NORMAL_QUOTE = 2, SLASH_START = 3, HEX = 4; 256 next(StringBuffer buffer)257 public int next(StringBuffer buffer) { 258 if (start >= limit) return DONE; 259 int status = UNKNOWN; 260 int lastQuote = UNKNOWN; 261 int quoteStatus = NONE; 262 int hexCount = 0; 263 int hexValue = 0; 264 int cp; 265 main: 266 for (int i = start; i < limit; i += UTF16.getCharCount(cp)) { 267 cp = UTF16.charAt(pattern, i); 268 // if we are in a quote, then handle it. 269 switch (quoteStatus) { 270 case SLASH_START: 271 switch (cp) { 272 case 'u': 273 quoteStatus = HEX; 274 hexCount = 4; 275 hexValue = 0; 276 continue main; 277 case 'U': 278 quoteStatus = HEX; 279 hexCount = 8; 280 hexValue = 0; 281 continue main; 282 default: 283 if (usingSlash) { 284 UTF16.append(buffer, cp); 285 quoteStatus = NONE; 286 continue main; 287 } else { 288 buffer.append(BACK_SLASH); 289 quoteStatus = NONE; 290 } 291 } 292 break; // fall through to NONE 293 case HEX: 294 hexValue <<= 4; 295 hexValue += cp; 296 switch (cp) { 297 case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': 298 hexValue -= '0'; break; 299 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': 300 hexValue -= 'a' - 10; break; 301 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': 302 hexValue -= 'A' - 10; break; 303 default: 304 start = i; 305 return BROKEN_ESCAPE; 306 } 307 --hexCount; 308 if (hexCount == 0) { 309 quoteStatus = NONE; 310 UTF16.append(buffer, hexValue); 311 } 312 continue main; 313 case AFTER_QUOTE: 314 // see if we get another quote character 315 // if we just ended a quote BUT the following character is the lastQuote character, then we have a situation like '...''...', so we restart the quote 316 if (cp == lastQuote) { 317 UTF16.append(buffer, cp); 318 quoteStatus = NORMAL_QUOTE; 319 continue main; 320 } 321 quoteStatus = NONE; 322 break; // fall through to NONE 323 case START_QUOTE: 324 // if we are at the very start of a quote, and we hit another quote mark then we emit a literal quote character and end the quote 325 if (cp == lastQuote) { 326 UTF16.append(buffer, cp); 327 quoteStatus = NONE; // get out of quote, with no trace remaining 328 continue; 329 } 330 // otherwise get into quote 331 UTF16.append(buffer, cp); 332 quoteStatus = NORMAL_QUOTE; 333 continue main; 334 case NORMAL_QUOTE: 335 if (cp == lastQuote) { 336 quoteStatus = AFTER_QUOTE; // get out of quote 337 continue main; 338 } 339 UTF16.append(buffer, cp); 340 continue main; 341 } 342 343 if (ignorableCharacters.contains(cp)) { 344 continue; 345 } 346 // do syntax characters 347 if (syntaxCharacters.contains(cp)) { 348 if (status == UNKNOWN) { 349 UTF16.append(buffer, cp); 350 start = i + UTF16.getCharCount(cp); 351 return SYNTAX; 352 } else { // LITERAL, so back up and break 353 start = i; 354 return status; 355 } 356 } 357 // otherwise it is a literal; keep on going 358 status = LITERAL; 359 if (cp == BACK_SLASH) { 360 quoteStatus = SLASH_START; 361 continue; 362 } else if (usingQuote && cp == SINGLE_QUOTE) { 363 lastQuote = cp; 364 quoteStatus = START_QUOTE; 365 continue; 366 } 367 // normal literals 368 UTF16.append(buffer, cp); 369 } 370 // handle final cleanup 371 start = limit; 372 switch (quoteStatus) { 373 case HEX: 374 status = BROKEN_ESCAPE; 375 break; 376 case SLASH_START: 377 if (usingSlash) { 378 status = BROKEN_ESCAPE; 379 } else { 380 buffer.append(BACK_SLASH); 381 } 382 break; 383 case START_QUOTE: case NORMAL_QUOTE: 384 status = BROKEN_QUOTE; 385 break; 386 } 387 return status; 388 } 389 390 391 } 392 //eof 393