• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* GENERATED SOURCE. DO NOT MODIFY. */
2 // © 2016 and later: Unicode, Inc. and others.
3 // License & terms of use: http://www.unicode.org/copyright.html#License
4 /*
5  *******************************************************************************
6  * Copyright (C) 2006-2009, Google, International Business Machines Corporation *
7  * and others. All Rights Reserved.                                            *
8  *******************************************************************************
9  */
10 package android.icu.impl;
11 
12 import android.icu.text.UTF16;
13 import android.icu.text.UnicodeSet;
14 
15 /**
16  * A simple parsing class for patterns and rules. Handles '...' quotations, \\uxxxx and \\Uxxxxxxxx, and simple syntax.
17  * The '' (two quotes) is treated as a single quote, inside or outside a quote.
18  * <ul>
19  * <li>Any ignorable characters are ignored in parsing.</li>
20  * <li>Any syntax characters are broken into separate tokens</li>
21  * <li>Quote characters can be specified: '...', "...", and \x </li>
22  * <li>Other characters are treated as literals</li>
23  * </ul>
24  * @hide Only a subset of ICU is exposed in Android
25  */
26 public class PatternTokenizer {
27     // settings used in the interpretation of the pattern
28     private UnicodeSet ignorableCharacters = new UnicodeSet();
29     private UnicodeSet syntaxCharacters = new UnicodeSet();
30     private UnicodeSet extraQuotingCharacters = new UnicodeSet();
31     private UnicodeSet escapeCharacters = new UnicodeSet();
32     private boolean usingSlash = false;
33     private boolean usingQuote = false;
34 
35     // transient data, set when needed. Null it out for any changes in the above fields.
36     private transient UnicodeSet needingQuoteCharacters = null;
37 
38     // data about the current pattern being parsed. start gets moved as we go along.
39     private int start;
40     private int limit;
41     private String pattern;
42 
getIgnorableCharacters()43     public UnicodeSet getIgnorableCharacters() {
44         return (UnicodeSet) ignorableCharacters.clone();
45     }
46     /**
47      * Sets the characters to be ignored in parsing, eg new UnicodeSet("[:pattern_whitespace:]");
48      * @param ignorableCharacters Characters to be ignored.
49      * @return A PatternTokenizer object in which characters are specified as ignored characters.
50      */
setIgnorableCharacters(UnicodeSet ignorableCharacters)51     public PatternTokenizer setIgnorableCharacters(UnicodeSet ignorableCharacters) {
52         this.ignorableCharacters = (UnicodeSet) ignorableCharacters.clone();
53         needingQuoteCharacters = null;
54         return this;
55     }
getSyntaxCharacters()56     public UnicodeSet getSyntaxCharacters() {
57         return (UnicodeSet) syntaxCharacters.clone();
58     }
getExtraQuotingCharacters()59     public UnicodeSet getExtraQuotingCharacters() {
60         return (UnicodeSet) extraQuotingCharacters.clone();
61     }
62     /**
63      *  Sets the characters to be interpreted as syntax characters in parsing, eg new UnicodeSet("[:pattern_syntax:]")
64      * @param syntaxCharacters Characters to be set as syntax characters.
65      * @return A PatternTokenizer object in which characters are specified as syntax characters.
66      */
setSyntaxCharacters(UnicodeSet syntaxCharacters)67     public PatternTokenizer setSyntaxCharacters(UnicodeSet syntaxCharacters) {
68         this.syntaxCharacters = (UnicodeSet) syntaxCharacters.clone();
69         needingQuoteCharacters = null;
70         return this;
71     }
72     /**
73      *  Sets the extra characters to be quoted in literals
74      * @param syntaxCharacters Characters to be set as extra quoting characters.
75      * @return A PatternTokenizer object in which characters are specified as extra quoting characters.
76      */
setExtraQuotingCharacters(UnicodeSet syntaxCharacters)77     public PatternTokenizer setExtraQuotingCharacters(UnicodeSet syntaxCharacters) {
78         this.extraQuotingCharacters = (UnicodeSet) syntaxCharacters.clone();
79         needingQuoteCharacters = null;
80         return this;
81     }
82 
getEscapeCharacters()83     public UnicodeSet getEscapeCharacters() {
84         return (UnicodeSet) escapeCharacters.clone();
85     }
86     /**
87      * Set characters to be escaped in literals, in quoteLiteral and normalize, eg new UnicodeSet("[^\\u0020-\\u007E]");
88      * @param escapeCharacters Characters to be set as escape characters.
89      * @return A PatternTokenizer object in which characters are specified as escape characters.
90      */
setEscapeCharacters(UnicodeSet escapeCharacters)91     public PatternTokenizer setEscapeCharacters(UnicodeSet escapeCharacters) {
92         this.escapeCharacters = (UnicodeSet) escapeCharacters.clone();
93         return this;
94     }
isUsingQuote()95     public boolean isUsingQuote() {
96         return usingQuote;
97     }
setUsingQuote(boolean usingQuote)98     public PatternTokenizer setUsingQuote(boolean usingQuote) {
99         this.usingQuote = usingQuote;
100         needingQuoteCharacters = null;
101         return this;
102     }
isUsingSlash()103     public boolean isUsingSlash() {
104         return usingSlash;
105     }
setUsingSlash(boolean usingSlash)106     public PatternTokenizer setUsingSlash(boolean usingSlash) {
107         this.usingSlash = usingSlash;
108         needingQuoteCharacters = null;
109         return this;
110     }
111     //    public UnicodeSet getQuoteCharacters() {
112 //  return (UnicodeSet) quoteCharacters.clone();
113 //  }
114 //  public PatternTokenizer setQuoteCharacters(UnicodeSet quoteCharacters) {
115 //  this.quoteCharacters = (UnicodeSet) quoteCharacters.clone();
116 //  needingQuoteCharacters = null;
117 //  return this;
118 //  }
getLimit()119     public int getLimit() {
120         return limit;
121     }
setLimit(int limit)122     public PatternTokenizer setLimit(int limit) {
123         this.limit = limit;
124         return this;
125     }
getStart()126     public int getStart() {
127         return start;
128     }
setStart(int start)129     public PatternTokenizer setStart(int start) {
130         this.start = start;
131         return this;
132     }
133 
setPattern(CharSequence pattern)134     public PatternTokenizer setPattern(CharSequence pattern) {
135         return setPattern(pattern.toString());
136     }
137 
setPattern(String pattern)138     public PatternTokenizer setPattern(String pattern) {
139         if (pattern == null) {
140             throw new IllegalArgumentException("Inconsistent arguments");
141         }
142         this.start = 0;
143         this.limit = pattern.length();
144         this.pattern = pattern;
145         return this;
146     }
147 
148     public static final char SINGLE_QUOTE = '\'';
149     public static final char BACK_SLASH = '\\';
150     private static int NO_QUOTE = -1, IN_QUOTE = -2;
151 
quoteLiteral(CharSequence string)152     public String quoteLiteral(CharSequence string) {
153         return quoteLiteral(string.toString());
154     }
155 
156     /**
157      * Quote a literal string, using the available settings. Thus syntax characters, quote characters, and ignorable characters will be put into quotes.
158      * @param string String passed to quote a literal string.
159      * @return A string using the available settings will place syntax, quote, or ignorable characters into quotes.
160      */
quoteLiteral(String string)161     public String quoteLiteral(String string) {
162         if (needingQuoteCharacters == null) {
163             needingQuoteCharacters = new UnicodeSet().addAll(syntaxCharacters).addAll(ignorableCharacters).addAll(extraQuotingCharacters); // .addAll(quoteCharacters)
164             if (usingSlash) needingQuoteCharacters.add(BACK_SLASH);
165             if (usingQuote) needingQuoteCharacters.add(SINGLE_QUOTE);
166         }
167         StringBuffer result = new StringBuffer();
168         int quotedChar = NO_QUOTE;
169         int cp;
170         for (int i = 0; i < string.length(); i += UTF16.getCharCount(cp)) {
171             cp = UTF16.charAt(string, i);
172             if (escapeCharacters.contains(cp)) {
173                 // we may have to fix up previous characters
174                 if (quotedChar == IN_QUOTE) {
175                     result.append(SINGLE_QUOTE);
176                     quotedChar = NO_QUOTE;
177                 }
178                 appendEscaped(result, cp);
179                 continue;
180             }
181 
182             if (needingQuoteCharacters.contains(cp)) {
183                 // if we have already started a quote
184                 if (quotedChar == IN_QUOTE) {
185                     UTF16.append(result, cp);
186                     if (usingQuote && cp == SINGLE_QUOTE) { // double it
187                         result.append(SINGLE_QUOTE);
188                     }
189                     continue;
190                 }
191                 // otherwise not already in quote
192                 if (usingSlash) {
193                     result.append(BACK_SLASH);
194                     UTF16.append(result, cp);
195                     continue;
196                 }
197                 if (usingQuote) {
198                     if (cp == SINGLE_QUOTE) { // double it and continue
199                         result.append(SINGLE_QUOTE);
200                         result.append(SINGLE_QUOTE);
201                         continue;
202                     }
203                     result.append(SINGLE_QUOTE);
204                     UTF16.append(result, cp);
205                     quotedChar = IN_QUOTE;
206                     continue;
207                 }
208                 // we have no choice but to use \\u or \\U
209                 appendEscaped(result, cp);
210                 continue;
211             }
212             // otherwise cp doesn't need quoting
213             // we may have to fix up previous characters
214             if (quotedChar == IN_QUOTE) {
215                 result.append(SINGLE_QUOTE);
216                 quotedChar = NO_QUOTE;
217             }
218             UTF16.append(result, cp);
219         }
220         // all done.
221         // we may have to fix up previous characters
222         if (quotedChar == IN_QUOTE) {
223             result.append(SINGLE_QUOTE);
224         }
225         return result.toString();
226     }
227 
appendEscaped(StringBuffer result, int cp)228     private void appendEscaped(StringBuffer result, int cp) {
229         if (cp <= 0xFFFF) {
230             result.append("\\u").append(Utility.hex(cp,4));
231         } else {
232             result.append("\\U").append(Utility.hex(cp,8));
233         }
234     }
235 
normalize()236     public String normalize() {
237         int oldStart = start;
238         StringBuffer result = new StringBuffer();
239         StringBuffer buffer = new StringBuffer();
240         while (true) {
241             buffer.setLength(0);
242             int status = next(buffer);
243             if (status == DONE) {
244                 start = oldStart;
245                 return result.toString();
246             }
247             if (status != SYNTAX) {
248                 result.append(quoteLiteral(buffer));
249             } else {
250                 result.append(buffer);
251             }
252         }
253     }
254 
255     public static final int DONE = 0, SYNTAX = 1, LITERAL = 2, BROKEN_QUOTE = 3, BROKEN_ESCAPE = 4, UNKNOWN = 5;
256 
257     private static final int AFTER_QUOTE = -1, NONE = 0, START_QUOTE = 1, NORMAL_QUOTE = 2, SLASH_START = 3, HEX = 4;
258 
next(StringBuffer buffer)259     public int next(StringBuffer buffer) {
260         if (start >= limit) return DONE;
261         int status = UNKNOWN;
262         int lastQuote = UNKNOWN;
263         int quoteStatus = NONE;
264         int hexCount = 0;
265         int hexValue = 0;
266         int cp;
267         main:
268             for (int i = start; i < limit; i += UTF16.getCharCount(cp)) {
269                 cp = UTF16.charAt(pattern, i);
270                 // if we are in a quote, then handle it.
271                 switch (quoteStatus) {
272                 case SLASH_START:
273                     switch (cp) {
274                     case 'u':
275                         quoteStatus = HEX;
276                         hexCount = 4;
277                         hexValue = 0;
278                         continue main;
279                     case 'U':
280                         quoteStatus = HEX;
281                         hexCount = 8;
282                         hexValue = 0;
283                         continue main;
284                     default:
285                         if (usingSlash) {
286                             UTF16.append(buffer, cp);
287                             quoteStatus = NONE;
288                             continue main;
289                         } else {
290                             buffer.append(BACK_SLASH);
291                             quoteStatus = NONE;
292                         }
293                     }
294                     break; // fall through to NONE
295                 case HEX:
296                     hexValue <<= 4;
297                     hexValue += cp;
298                     switch (cp) {
299                     case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
300                         hexValue -= '0'; break;
301                     case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
302                         hexValue -= 'a' - 10; break;
303                     case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
304                         hexValue -= 'A' - 10; break;
305                     default:
306                         start = i;
307                     return BROKEN_ESCAPE;
308                     }
309                     --hexCount;
310                     if (hexCount == 0) {
311                         quoteStatus = NONE;
312                         UTF16.append(buffer, hexValue);
313                     }
314                     continue main;
315                 case AFTER_QUOTE:
316                     // see if we get another quote character
317                     // if we just ended a quote BUT the following character is the lastQuote character, then we have a situation like '...''...', so we restart the quote
318                     if (cp == lastQuote) {
319                         UTF16.append(buffer, cp);
320                         quoteStatus = NORMAL_QUOTE;
321                         continue main;
322                     }
323                     quoteStatus = NONE;
324                     break; // fall through to NONE
325                 case START_QUOTE:
326                     // if we are at the very start of a quote, and we hit another quote mark then we emit a literal quote character and end the quote
327                     if (cp == lastQuote) {
328                         UTF16.append(buffer, cp);
329                         quoteStatus = NONE; // get out of quote, with no trace remaining
330                         continue;
331                     }
332                     // otherwise get into quote
333                     UTF16.append(buffer, cp);
334                     quoteStatus = NORMAL_QUOTE;
335                     continue main;
336                 case NORMAL_QUOTE:
337                     if (cp == lastQuote) {
338                         quoteStatus = AFTER_QUOTE; // get out of quote
339                         continue main;
340                     }
341                     UTF16.append(buffer, cp);
342                     continue main;
343                 }
344 
345                 if (ignorableCharacters.contains(cp)) {
346                     continue;
347                 }
348                 // do syntax characters
349                 if (syntaxCharacters.contains(cp)) {
350                     if (status == UNKNOWN) {
351                         UTF16.append(buffer, cp);
352                         start = i + UTF16.getCharCount(cp);
353                         return SYNTAX;
354                     } else { // LITERAL, so back up and break
355                         start = i;
356                         return status;
357                     }
358                 }
359                 // otherwise it is a literal; keep on going
360                 status = LITERAL;
361                 if (cp == BACK_SLASH) {
362                     quoteStatus = SLASH_START;
363                     continue;
364                 } else if (usingQuote && cp == SINGLE_QUOTE) {
365                     lastQuote = cp;
366                     quoteStatus = START_QUOTE;
367                     continue;
368                 }
369                 // normal literals
370                 UTF16.append(buffer, cp);
371             }
372         // handle final cleanup
373         start = limit;
374         switch (quoteStatus) {
375         case HEX:
376             status = BROKEN_ESCAPE;
377             break;
378         case SLASH_START:
379             if (usingSlash) {
380                 status = BROKEN_ESCAPE;
381             } else {
382                 buffer.append(BACK_SLASH);
383             }
384             break;
385         case START_QUOTE: case NORMAL_QUOTE:
386             status = BROKEN_QUOTE;
387             break;
388         }
389         return status;
390     }
391 
392 
393 }
394 //eof
395