1 /* GENERATED SOURCE. DO NOT MODIFY. */ 2 // © 2016 and later: Unicode, Inc. and others. 3 // License & terms of use: http://www.unicode.org/copyright.html#License 4 /* 5 ******************************************************************************* 6 * Copyright (C) 2006-2009, Google, International Business Machines Corporation * 7 * and others. All Rights Reserved. * 8 ******************************************************************************* 9 */ 10 package android.icu.impl; 11 12 import android.icu.text.UTF16; 13 import android.icu.text.UnicodeSet; 14 15 /** 16 * A simple parsing class for patterns and rules. Handles '...' quotations, \\uxxxx and \\Uxxxxxxxx, and symple syntax. 17 * The '' (two quotes) is treated as a single quote, inside or outside a quote 18 * <ul> 19 * <li>Any ignorable characters are ignored in parsing.</li> 20 * <li>Any syntax characters are broken into separate tokens</li> 21 * <li>Quote characters can be specified: '...', "...", and \x </li> 22 * <li>Other characters are treated as literals</li> 23 * </ul> 24 * @hide Only a subset of ICU is exposed in Android 25 */ 26 public class PatternTokenizer { 27 // settings used in the interpretation of the pattern 28 private UnicodeSet ignorableCharacters = new UnicodeSet(); 29 private UnicodeSet syntaxCharacters = new UnicodeSet(); 30 private UnicodeSet extraQuotingCharacters = new UnicodeSet(); 31 private UnicodeSet escapeCharacters = new UnicodeSet(); 32 private boolean usingSlash = false; 33 private boolean usingQuote = false; 34 35 // transient data, set when needed. Null it out for any changes in the above fields. 36 private transient UnicodeSet needingQuoteCharacters = null; 37 38 // data about the current pattern being parsed. start gets moved as we go along. 39 private int start; 40 private int limit; 41 private String pattern; 42 getIgnorableCharacters()43 public UnicodeSet getIgnorableCharacters() { 44 return (UnicodeSet) ignorableCharacters.clone(); 45 } 46 /** 47 * Sets the characters to be ignored in parsing, eg new UnicodeSet("[:pattern_whitespace:]"); 48 * @param ignorableCharacters Characters to be ignored. 49 * @return A PatternTokenizer object in which characters are specified as ignored characters. 50 */ setIgnorableCharacters(UnicodeSet ignorableCharacters)51 public PatternTokenizer setIgnorableCharacters(UnicodeSet ignorableCharacters) { 52 this.ignorableCharacters = (UnicodeSet) ignorableCharacters.clone(); 53 needingQuoteCharacters = null; 54 return this; 55 } getSyntaxCharacters()56 public UnicodeSet getSyntaxCharacters() { 57 return (UnicodeSet) syntaxCharacters.clone(); 58 } getExtraQuotingCharacters()59 public UnicodeSet getExtraQuotingCharacters() { 60 return (UnicodeSet) extraQuotingCharacters.clone(); 61 } 62 /** 63 * Sets the characters to be interpreted as syntax characters in parsing, eg new UnicodeSet("[:pattern_syntax:]") 64 * @param syntaxCharacters Characters to be set as syntax characters. 65 * @return A PatternTokenizer object in which characters are specified as syntax characters. 66 */ setSyntaxCharacters(UnicodeSet syntaxCharacters)67 public PatternTokenizer setSyntaxCharacters(UnicodeSet syntaxCharacters) { 68 this.syntaxCharacters = (UnicodeSet) syntaxCharacters.clone(); 69 needingQuoteCharacters = null; 70 return this; 71 } 72 /** 73 * Sets the extra characters to be quoted in literals 74 * @param syntaxCharacters Characters to be set as extra quoting characters. 75 * @return A PatternTokenizer object in which characters are specified as extra quoting characters. 76 */ setExtraQuotingCharacters(UnicodeSet syntaxCharacters)77 public PatternTokenizer setExtraQuotingCharacters(UnicodeSet syntaxCharacters) { 78 this.extraQuotingCharacters = (UnicodeSet) syntaxCharacters.clone(); 79 needingQuoteCharacters = null; 80 return this; 81 } 82 getEscapeCharacters()83 public UnicodeSet getEscapeCharacters() { 84 return (UnicodeSet) escapeCharacters.clone(); 85 } 86 /** 87 * Set characters to be escaped in literals, in quoteLiteral and normalize, eg new UnicodeSet("[^\\u0020-\\u007E]"); 88 * @param escapeCharacters Characters to be set as escape characters. 89 * @return A PatternTokenizer object in which characters are specified as escape characters. 90 */ setEscapeCharacters(UnicodeSet escapeCharacters)91 public PatternTokenizer setEscapeCharacters(UnicodeSet escapeCharacters) { 92 this.escapeCharacters = (UnicodeSet) escapeCharacters.clone(); 93 return this; 94 } isUsingQuote()95 public boolean isUsingQuote() { 96 return usingQuote; 97 } setUsingQuote(boolean usingQuote)98 public PatternTokenizer setUsingQuote(boolean usingQuote) { 99 this.usingQuote = usingQuote; 100 needingQuoteCharacters = null; 101 return this; 102 } isUsingSlash()103 public boolean isUsingSlash() { 104 return usingSlash; 105 } setUsingSlash(boolean usingSlash)106 public PatternTokenizer setUsingSlash(boolean usingSlash) { 107 this.usingSlash = usingSlash; 108 needingQuoteCharacters = null; 109 return this; 110 } 111 // public UnicodeSet getQuoteCharacters() { 112 // return (UnicodeSet) quoteCharacters.clone(); 113 // } 114 // public PatternTokenizer setQuoteCharacters(UnicodeSet quoteCharacters) { 115 // this.quoteCharacters = (UnicodeSet) quoteCharacters.clone(); 116 // needingQuoteCharacters = null; 117 // return this; 118 // } getLimit()119 public int getLimit() { 120 return limit; 121 } setLimit(int limit)122 public PatternTokenizer setLimit(int limit) { 123 this.limit = limit; 124 return this; 125 } getStart()126 public int getStart() { 127 return start; 128 } setStart(int start)129 public PatternTokenizer setStart(int start) { 130 this.start = start; 131 return this; 132 } 133 setPattern(CharSequence pattern)134 public PatternTokenizer setPattern(CharSequence pattern) { 135 return setPattern(pattern.toString()); 136 } 137 setPattern(String pattern)138 public PatternTokenizer setPattern(String pattern) { 139 if (pattern == null) { 140 throw new IllegalArgumentException("Inconsistent arguments"); 141 } 142 this.start = 0; 143 this.limit = pattern.length(); 144 this.pattern = pattern; 145 return this; 146 } 147 148 public static final char SINGLE_QUOTE = '\''; 149 public static final char BACK_SLASH = '\\'; 150 private static int NO_QUOTE = -1, IN_QUOTE = -2; 151 quoteLiteral(CharSequence string)152 public String quoteLiteral(CharSequence string) { 153 return quoteLiteral(string.toString()); 154 } 155 156 /** 157 * Quote a literal string, using the available settings. Thus syntax characters, quote characters, and ignorable characters will be put into quotes. 158 * @param string String passed to quote a literal string. 159 * @return A string using the available settings will place syntax, quote, or ignorable characters into quotes. 160 */ quoteLiteral(String string)161 public String quoteLiteral(String string) { 162 if (needingQuoteCharacters == null) { 163 needingQuoteCharacters = new UnicodeSet().addAll(syntaxCharacters).addAll(ignorableCharacters).addAll(extraQuotingCharacters); // .addAll(quoteCharacters) 164 if (usingSlash) needingQuoteCharacters.add(BACK_SLASH); 165 if (usingQuote) needingQuoteCharacters.add(SINGLE_QUOTE); 166 } 167 StringBuffer result = new StringBuffer(); 168 int quotedChar = NO_QUOTE; 169 int cp; 170 for (int i = 0; i < string.length(); i += UTF16.getCharCount(cp)) { 171 cp = UTF16.charAt(string, i); 172 if (escapeCharacters.contains(cp)) { 173 // we may have to fix up previous characters 174 if (quotedChar == IN_QUOTE) { 175 result.append(SINGLE_QUOTE); 176 quotedChar = NO_QUOTE; 177 } 178 appendEscaped(result, cp); 179 continue; 180 } 181 182 if (needingQuoteCharacters.contains(cp)) { 183 // if we have already started a quote 184 if (quotedChar == IN_QUOTE) { 185 UTF16.append(result, cp); 186 if (usingQuote && cp == SINGLE_QUOTE) { // double it 187 result.append(SINGLE_QUOTE); 188 } 189 continue; 190 } 191 // otherwise not already in quote 192 if (usingSlash) { 193 result.append(BACK_SLASH); 194 UTF16.append(result, cp); 195 continue; 196 } 197 if (usingQuote) { 198 if (cp == SINGLE_QUOTE) { // double it and continue 199 result.append(SINGLE_QUOTE); 200 result.append(SINGLE_QUOTE); 201 continue; 202 } 203 result.append(SINGLE_QUOTE); 204 UTF16.append(result, cp); 205 quotedChar = IN_QUOTE; 206 continue; 207 } 208 // we have no choice but to use \\u or \\U 209 appendEscaped(result, cp); 210 continue; 211 } 212 // otherwise cp doesn't need quoting 213 // we may have to fix up previous characters 214 if (quotedChar == IN_QUOTE) { 215 result.append(SINGLE_QUOTE); 216 quotedChar = NO_QUOTE; 217 } 218 UTF16.append(result, cp); 219 } 220 // all done. 221 // we may have to fix up previous characters 222 if (quotedChar == IN_QUOTE) { 223 result.append(SINGLE_QUOTE); 224 } 225 return result.toString(); 226 } 227 appendEscaped(StringBuffer result, int cp)228 private void appendEscaped(StringBuffer result, int cp) { 229 if (cp <= 0xFFFF) { 230 result.append("\\u").append(Utility.hex(cp,4)); 231 } else { 232 result.append("\\U").append(Utility.hex(cp,8)); 233 } 234 } 235 normalize()236 public String normalize() { 237 int oldStart = start; 238 StringBuffer result = new StringBuffer(); 239 StringBuffer buffer = new StringBuffer(); 240 while (true) { 241 buffer.setLength(0); 242 int status = next(buffer); 243 if (status == DONE) { 244 start = oldStart; 245 return result.toString(); 246 } 247 if (status != SYNTAX) { 248 result.append(quoteLiteral(buffer)); 249 } else { 250 result.append(buffer); 251 } 252 } 253 } 254 255 public static final int DONE = 0, SYNTAX = 1, LITERAL = 2, BROKEN_QUOTE = 3, BROKEN_ESCAPE = 4, UNKNOWN = 5; 256 257 private static final int AFTER_QUOTE = -1, NONE = 0, START_QUOTE = 1, NORMAL_QUOTE = 2, SLASH_START = 3, HEX = 4; 258 next(StringBuffer buffer)259 public int next(StringBuffer buffer) { 260 if (start >= limit) return DONE; 261 int status = UNKNOWN; 262 int lastQuote = UNKNOWN; 263 int quoteStatus = NONE; 264 int hexCount = 0; 265 int hexValue = 0; 266 int cp; 267 main: 268 for (int i = start; i < limit; i += UTF16.getCharCount(cp)) { 269 cp = UTF16.charAt(pattern, i); 270 // if we are in a quote, then handle it. 271 switch (quoteStatus) { 272 case SLASH_START: 273 switch (cp) { 274 case 'u': 275 quoteStatus = HEX; 276 hexCount = 4; 277 hexValue = 0; 278 continue main; 279 case 'U': 280 quoteStatus = HEX; 281 hexCount = 8; 282 hexValue = 0; 283 continue main; 284 default: 285 if (usingSlash) { 286 UTF16.append(buffer, cp); 287 quoteStatus = NONE; 288 continue main; 289 } else { 290 buffer.append(BACK_SLASH); 291 quoteStatus = NONE; 292 } 293 } 294 break; // fall through to NONE 295 case HEX: 296 hexValue <<= 4; 297 hexValue += cp; 298 switch (cp) { 299 case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': 300 hexValue -= '0'; break; 301 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': 302 hexValue -= 'a' - 10; break; 303 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': 304 hexValue -= 'A' - 10; break; 305 default: 306 start = i; 307 return BROKEN_ESCAPE; 308 } 309 --hexCount; 310 if (hexCount == 0) { 311 quoteStatus = NONE; 312 UTF16.append(buffer, hexValue); 313 } 314 continue main; 315 case AFTER_QUOTE: 316 // see if we get another quote character 317 // if we just ended a quote BUT the following character is the lastQuote character, then we have a situation like '...''...', so we restart the quote 318 if (cp == lastQuote) { 319 UTF16.append(buffer, cp); 320 quoteStatus = NORMAL_QUOTE; 321 continue main; 322 } 323 quoteStatus = NONE; 324 break; // fall through to NONE 325 case START_QUOTE: 326 // if we are at the very start of a quote, and we hit another quote mark then we emit a literal quote character and end the quote 327 if (cp == lastQuote) { 328 UTF16.append(buffer, cp); 329 quoteStatus = NONE; // get out of quote, with no trace remaining 330 continue; 331 } 332 // otherwise get into quote 333 UTF16.append(buffer, cp); 334 quoteStatus = NORMAL_QUOTE; 335 continue main; 336 case NORMAL_QUOTE: 337 if (cp == lastQuote) { 338 quoteStatus = AFTER_QUOTE; // get out of quote 339 continue main; 340 } 341 UTF16.append(buffer, cp); 342 continue main; 343 } 344 345 if (ignorableCharacters.contains(cp)) { 346 continue; 347 } 348 // do syntax characters 349 if (syntaxCharacters.contains(cp)) { 350 if (status == UNKNOWN) { 351 UTF16.append(buffer, cp); 352 start = i + UTF16.getCharCount(cp); 353 return SYNTAX; 354 } else { // LITERAL, so back up and break 355 start = i; 356 return status; 357 } 358 } 359 // otherwise it is a literal; keep on going 360 status = LITERAL; 361 if (cp == BACK_SLASH) { 362 quoteStatus = SLASH_START; 363 continue; 364 } else if (usingQuote && cp == SINGLE_QUOTE) { 365 lastQuote = cp; 366 quoteStatus = START_QUOTE; 367 continue; 368 } 369 // normal literals 370 UTF16.append(buffer, cp); 371 } 372 // handle final cleanup 373 start = limit; 374 switch (quoteStatus) { 375 case HEX: 376 status = BROKEN_ESCAPE; 377 break; 378 case SLASH_START: 379 if (usingSlash) { 380 status = BROKEN_ESCAPE; 381 } else { 382 buffer.append(BACK_SLASH); 383 } 384 break; 385 case START_QUOTE: case NORMAL_QUOTE: 386 status = BROKEN_QUOTE; 387 break; 388 } 389 return status; 390 } 391 392 393 } 394 //eof 395