• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1  /* GENERATED SOURCE. DO NOT MODIFY. */
2  // © 2016 and later: Unicode, Inc. and others.
3  // License & terms of use: http://www.unicode.org/copyright.html#License
4  /*
5   *******************************************************************************
6   * Copyright (C) 2006-2009, Google, International Business Machines Corporation *
7   * and others. All Rights Reserved.                                            *
8   *******************************************************************************
9   */
10  package ohos.global.icu.impl;
11  
12  import ohos.global.icu.text.UTF16;
13  import ohos.global.icu.text.UnicodeSet;
14  
15  /**
16   * A simple parsing class for patterns and rules. Handles '...' quotations, \\uxxxx and \\Uxxxxxxxx, and simple syntax.
17   * The '' (two quotes) is treated as a single quote, inside or outside a quote
18   * <ul>
19   * <li>Any ignorable characters are ignored in parsing.</li>
20   * <li>Any syntax characters are broken into separate tokens</li>
21   * <li>Quote characters can be specified: '...', "...", and \x </li>
22   * <li>Other characters are treated as literals</li>
23   * </ul>
24   * @hide exposed on OHOS
25   */
26  public class PatternTokenizer {
27      // settings used in the interpretation of the pattern
28      private UnicodeSet ignorableCharacters = new UnicodeSet();
29      private UnicodeSet syntaxCharacters = new UnicodeSet();
30      private UnicodeSet extraQuotingCharacters = new UnicodeSet();
31      private UnicodeSet escapeCharacters = new UnicodeSet();
32      private boolean usingSlash = false;
33      private boolean usingQuote = false;
34  
35      // transient data, set when needed. Null it out for any changes in the above fields.
36      private transient UnicodeSet needingQuoteCharacters = null;
37  
38      // data about the current pattern being parsed. start gets moved as we go along.
39      private int start;
40      private int limit;
41      private String pattern;
42  
getIgnorableCharacters()43      public UnicodeSet getIgnorableCharacters() {
44          return (UnicodeSet) ignorableCharacters.clone();
45      }
46      /**
47       * Sets the characters to be ignored in parsing, eg new UnicodeSet("[:pattern_whitespace:]");
48       * @param ignorableCharacters Characters to be ignored.
49       * @return A PatternTokenizer object in which characters are specified as ignored characters.
50       */
setIgnorableCharacters(UnicodeSet ignorableCharacters)51      public PatternTokenizer setIgnorableCharacters(UnicodeSet ignorableCharacters) {
52          this.ignorableCharacters = (UnicodeSet) ignorableCharacters.clone();
53          needingQuoteCharacters = null;
54          return this;
55      }
getSyntaxCharacters()56      public UnicodeSet getSyntaxCharacters() {
57          return (UnicodeSet) syntaxCharacters.clone();
58      }
getExtraQuotingCharacters()59      public UnicodeSet getExtraQuotingCharacters() {
60          return (UnicodeSet) extraQuotingCharacters.clone();
61      }
62      /**
63       *  Sets the characters to be interpreted as syntax characters in parsing, eg new UnicodeSet("[:pattern_syntax:]")
64       * @param syntaxCharacters Characters to be set as syntax characters.
65       * @return A PatternTokenizer object in which characters are specified as syntax characters.
66       */
setSyntaxCharacters(UnicodeSet syntaxCharacters)67      public PatternTokenizer setSyntaxCharacters(UnicodeSet syntaxCharacters) {
68          this.syntaxCharacters = (UnicodeSet) syntaxCharacters.clone();
69          needingQuoteCharacters = null;
70          return this;
71      }
72      /**
73       *  Sets the extra characters to be quoted in literals
74       * @param syntaxCharacters Characters to be set as extra quoting characters.
75       * @return A PatternTokenizer object in which characters are specified as extra quoting characters.
76       */
setExtraQuotingCharacters(UnicodeSet syntaxCharacters)77      public PatternTokenizer setExtraQuotingCharacters(UnicodeSet syntaxCharacters) {
78          this.extraQuotingCharacters = (UnicodeSet) syntaxCharacters.clone();
79          needingQuoteCharacters = null;
80          return this;
81      }
82  
getEscapeCharacters()83      public UnicodeSet getEscapeCharacters() {
84          return (UnicodeSet) escapeCharacters.clone();
85      }
86      /**
87       * Set characters to be escaped in literals, in quoteLiteral and normalize, eg new UnicodeSet("[^\\u0020-\\u007E]");
88       * @param escapeCharacters Characters to be set as escape characters.
89       * @return A PatternTokenizer object in which characters are specified as escape characters.
90       */
setEscapeCharacters(UnicodeSet escapeCharacters)91      public PatternTokenizer setEscapeCharacters(UnicodeSet escapeCharacters) {
92          this.escapeCharacters = (UnicodeSet) escapeCharacters.clone();
93          return this;
94      }
isUsingQuote()95      public boolean isUsingQuote() {
96          return usingQuote;
97      }
setUsingQuote(boolean usingQuote)98      public PatternTokenizer setUsingQuote(boolean usingQuote) {
99          this.usingQuote = usingQuote;
100          needingQuoteCharacters = null;
101          return this;
102      }
isUsingSlash()103      public boolean isUsingSlash() {
104          return usingSlash;
105      }
setUsingSlash(boolean usingSlash)106      public PatternTokenizer setUsingSlash(boolean usingSlash) {
107          this.usingSlash = usingSlash;
108          needingQuoteCharacters = null;
109          return this;
110      }
111      //    public UnicodeSet getQuoteCharacters() {
112  //  return (UnicodeSet) quoteCharacters.clone();
113  //  }
114  //  public PatternTokenizer setQuoteCharacters(UnicodeSet quoteCharacters) {
115  //  this.quoteCharacters = (UnicodeSet) quoteCharacters.clone();
116  //  needingQuoteCharacters = null;
117  //  return this;
118  //  }
getLimit()119      public int getLimit() {
120          return limit;
121      }
setLimit(int limit)122      public PatternTokenizer setLimit(int limit) {
123          this.limit = limit;
124          return this;
125      }
getStart()126      public int getStart() {
127          return start;
128      }
setStart(int start)129      public PatternTokenizer setStart(int start) {
130          this.start = start;
131          return this;
132      }
133  
setPattern(CharSequence pattern)134      public PatternTokenizer setPattern(CharSequence pattern) {
135          return setPattern(pattern.toString());
136      }
137  
setPattern(String pattern)138      public PatternTokenizer setPattern(String pattern) {
139          if (pattern == null) {
140              throw new IllegalArgumentException("Inconsistent arguments");
141          }
142          this.start = 0;
143          this.limit = pattern.length();
144          this.pattern = pattern;
145          return this;
146      }
147  
148      public static final char SINGLE_QUOTE = '\'';
149      public static final char BACK_SLASH = '\\';
150      private static int NO_QUOTE = -1, IN_QUOTE = -2;
151  
quoteLiteral(CharSequence string)152      public String quoteLiteral(CharSequence string) {
153          return quoteLiteral(string.toString());
154      }
155  
156      /**
157       * Quote a literal string, using the available settings. Thus syntax characters, quote characters, and ignorable characters will be put into quotes.
158       * @param string String passed to quote a literal string.
159       * @return A string using the available settings will place syntax, quote, or ignorable characters into quotes.
160       */
quoteLiteral(String string)161      public String quoteLiteral(String string) {
162          if (needingQuoteCharacters == null) {
163              needingQuoteCharacters = new UnicodeSet().addAll(syntaxCharacters).addAll(ignorableCharacters).addAll(extraQuotingCharacters); // .addAll(quoteCharacters)
164              if (usingSlash) needingQuoteCharacters.add(BACK_SLASH);
165              if (usingQuote) needingQuoteCharacters.add(SINGLE_QUOTE);
166          }
167          StringBuffer result = new StringBuffer();
168          int quotedChar = NO_QUOTE;
169          int cp;
170          for (int i = 0; i < string.length(); i += UTF16.getCharCount(cp)) {
171              cp = UTF16.charAt(string, i);
172              if (escapeCharacters.contains(cp)) {
173                  // we may have to fix up previous characters
174                  if (quotedChar == IN_QUOTE) {
175                      result.append(SINGLE_QUOTE);
176                      quotedChar = NO_QUOTE;
177                  }
178                  appendEscaped(result, cp);
179                  continue;
180              }
181  
182              if (needingQuoteCharacters.contains(cp)) {
183                  // if we have already started a quote
184                  if (quotedChar == IN_QUOTE) {
185                      UTF16.append(result, cp);
186                      if (usingQuote && cp == SINGLE_QUOTE) { // double it
187                          result.append(SINGLE_QUOTE);
188                      }
189                      continue;
190                  }
191                  // otherwise not already in quote
192                  if (usingSlash) {
193                      result.append(BACK_SLASH);
194                      UTF16.append(result, cp);
195                      continue;
196                  }
197                  if (usingQuote) {
198                      if (cp == SINGLE_QUOTE) { // double it and continue
199                          result.append(SINGLE_QUOTE);
200                          result.append(SINGLE_QUOTE);
201                          continue;
202                      }
203                      result.append(SINGLE_QUOTE);
204                      UTF16.append(result, cp);
205                      quotedChar = IN_QUOTE;
206                      continue;
207                  }
208                  // we have no choice but to use \\u or \\U
209                  appendEscaped(result, cp);
210                  continue;
211              }
212              // otherwise cp doesn't need quoting
213              // we may have to fix up previous characters
214              if (quotedChar == IN_QUOTE) {
215                  result.append(SINGLE_QUOTE);
216                  quotedChar = NO_QUOTE;
217              }
218              UTF16.append(result, cp);
219          }
220          // all done.
221          // we may have to fix up previous characters
222          if (quotedChar == IN_QUOTE) {
223              result.append(SINGLE_QUOTE);
224          }
225          return result.toString();
226      }
227  
appendEscaped(StringBuffer result, int cp)228      private void appendEscaped(StringBuffer result, int cp) {
229          if (cp <= 0xFFFF) {
230              result.append("\\u").append(Utility.hex(cp,4));
231          } else {
232              result.append("\\U").append(Utility.hex(cp,8));
233          }
234      }
235  
normalize()236      public String normalize() {
237          int oldStart = start;
238          StringBuffer result = new StringBuffer();
239          StringBuffer buffer = new StringBuffer();
240          while (true) {
241              buffer.setLength(0);
242              int status = next(buffer);
243              if (status == DONE) {
244                  start = oldStart;
245                  return result.toString();
246              }
247              if (status != SYNTAX) {
248                  result.append(quoteLiteral(buffer));
249              } else {
250                  result.append(buffer);
251              }
252          }
253      }
254  
255      public static final int DONE = 0, SYNTAX = 1, LITERAL = 2, BROKEN_QUOTE = 3, BROKEN_ESCAPE = 4, UNKNOWN = 5;
256  
257      private static final int AFTER_QUOTE = -1, NONE = 0, START_QUOTE = 1, NORMAL_QUOTE = 2, SLASH_START = 3, HEX = 4;
258  
next(StringBuffer buffer)259      public int next(StringBuffer buffer) {
260          if (start >= limit) return DONE;
261          int status = UNKNOWN;
262          int lastQuote = UNKNOWN;
263          int quoteStatus = NONE;
264          int hexCount = 0;
265          int hexValue = 0;
266          int cp;
267          main:
268              for (int i = start; i < limit; i += UTF16.getCharCount(cp)) {
269                  cp = UTF16.charAt(pattern, i);
270                  // if we are in a quote, then handle it.
271                  switch (quoteStatus) {
272                  case SLASH_START:
273                      switch (cp) {
274                      case 'u':
275                          quoteStatus = HEX;
276                          hexCount = 4;
277                          hexValue = 0;
278                          continue main;
279                      case 'U':
280                          quoteStatus = HEX;
281                          hexCount = 8;
282                          hexValue = 0;
283                          continue main;
284                      default:
285                          if (usingSlash) {
286                              UTF16.append(buffer, cp);
287                              quoteStatus = NONE;
288                              continue main;
289                          } else {
290                              buffer.append(BACK_SLASH);
291                              quoteStatus = NONE;
292                          }
293                      }
294                      break; // fall through to NONE
295                  case HEX:
296                      hexValue <<= 4;
297                      hexValue += cp;
298                      switch (cp) {
299                      case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
300                          hexValue -= '0'; break;
301                      case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
302                          hexValue -= 'a' - 10; break;
303                      case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
304                          hexValue -= 'A' - 10; break;
305                      default:
306                          start = i;
307                      return BROKEN_ESCAPE;
308                      }
309                      --hexCount;
310                      if (hexCount == 0) {
311                          quoteStatus = NONE;
312                          UTF16.append(buffer, hexValue);
313                      }
314                      continue main;
315                  case AFTER_QUOTE:
316                      // see if we get another quote character
317                      // if we just ended a quote BUT the following character is the lastQuote character, then we have a situation like '...''...', so we restart the quote
318                      if (cp == lastQuote) {
319                          UTF16.append(buffer, cp);
320                          quoteStatus = NORMAL_QUOTE;
321                          continue main;
322                      }
323                      quoteStatus = NONE;
324                      break; // fall through to NONE
325                  case START_QUOTE:
326                      // if we are at the very start of a quote, and we hit another quote mark then we emit a literal quote character and end the quote
327                      if (cp == lastQuote) {
328                          UTF16.append(buffer, cp);
329                          quoteStatus = NONE; // get out of quote, with no trace remaining
330                          continue;
331                      }
332                      // otherwise get into quote
333                      UTF16.append(buffer, cp);
334                      quoteStatus = NORMAL_QUOTE;
335                      continue main;
336                  case NORMAL_QUOTE:
337                      if (cp == lastQuote) {
338                          quoteStatus = AFTER_QUOTE; // get out of quote
339                          continue main;
340                      }
341                      UTF16.append(buffer, cp);
342                      continue main;
343                  }
344  
345                  if (ignorableCharacters.contains(cp)) {
346                      continue;
347                  }
348                  // do syntax characters
349                  if (syntaxCharacters.contains(cp)) {
350                      if (status == UNKNOWN) {
351                          UTF16.append(buffer, cp);
352                          start = i + UTF16.getCharCount(cp);
353                          return SYNTAX;
354                      } else { // LITERAL, so back up and break
355                          start = i;
356                          return status;
357                      }
358                  }
359                  // otherwise it is a literal; keep on going
360                  status = LITERAL;
361                  if (cp == BACK_SLASH) {
362                      quoteStatus = SLASH_START;
363                      continue;
364                  } else if (usingQuote && cp == SINGLE_QUOTE) {
365                      lastQuote = cp;
366                      quoteStatus = START_QUOTE;
367                      continue;
368                  }
369                  // normal literals
370                  UTF16.append(buffer, cp);
371              }
372          // handle final cleanup
373          start = limit;
374          switch (quoteStatus) {
375          case HEX:
376              status = BROKEN_ESCAPE;
377              break;
378          case SLASH_START:
379              if (usingSlash) {
380                  status = BROKEN_ESCAPE;
381              } else {
382                  buffer.append(BACK_SLASH);
383              }
384              break;
385          case START_QUOTE: case NORMAL_QUOTE:
386              status = BROKEN_QUOTE;
387              break;
388          }
389          return status;
390      }
391  
392  
393  }
394  //eof
395