• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
/*
 *******************************************************************************
 * Copyright (C) 2002-2012, International Business Machines Corporation and    *
 * others. All Rights Reserved.                                                *
 *******************************************************************************
 */
7 package org.unicode.cldr.util;
8 
9 import com.ibm.icu.lang.UCharacter;
10 import com.ibm.icu.text.SymbolTable;
11 import com.ibm.icu.text.UTF16;
12 import com.ibm.icu.text.UnicodeMatcher;
13 import com.ibm.icu.text.UnicodeSet;
14 import java.text.ParsePosition;
15 import java.util.HashMap;
16 import java.util.HashSet;
17 import java.util.Map;
18 import java.util.Set;
19 import org.unicode.cldr.util.props.BagFormatter;
20 
21 public class Tokenizer {
22     protected String source;
23 
24     protected StringBuffer buffer = new StringBuffer();
25     protected long number;
26     protected UnicodeSet unicodeSet = null;
27     protected int index;
28     boolean backedup = false;
29     protected int lastIndex = -1;
30     protected int nextIndex;
31     int lastValue = BACKEDUP_TOO_FAR;
32     TokenSymbolTable symbolTable = new TokenSymbolTable();
33 
34     private static final char QUOTE = '\'', BSLASH = '\\';
35     private static final UnicodeSet QUOTERS = new UnicodeSet().add(QUOTE).add(BSLASH);
36     private static final UnicodeSet WHITESPACE =
37             new UnicodeSet("[" + "\\u0009-\\u000D\\u0020\\u0085\\u200E\\u200F\\u2028\\u2029" + "]");
38     private static final UnicodeSet SYNTAX =
39             new UnicodeSet(
40                             "["
41                                     + "\\u0021-\\u002F\\u003A-\\u0040\\u005B-\\u0060\\u007B-\\u007E"
42                                     + "\\u00A1-\\u00A7\\u00A9\\u00AB-\\u00AC\\u00AE"
43                                     + "\\u00B0-\\u00B1\\u00B6\\u00B7\\u00BB\\u00BF\\u00D7\\u00F7"
44                                     + "\\u2010-\\u2027\\u2030-\\u205E\\u2190-\\u2BFF"
45                                     + "\\u3001\\u3003\\u3008-\\u3020\\u3030"
46                                     + "\\uFD3E\\uFD3F\\uFE45\\uFE46"
47                                     + "]")
48                     .removeAll(QUOTERS)
49                     .remove('$');
50     private static final UnicodeSet NEWLINE =
51             new UnicodeSet("[\\u000A\\u000D\\u0085\\u2028\\u2029]");
52     // private static final UnicodeSet DECIMAL = new UnicodeSet("[:Nd:]");
53     private static final UnicodeSet NON_STRING = new UnicodeSet().addAll(WHITESPACE).addAll(SYNTAX);
54 
55     protected UnicodeSet whiteSpace = WHITESPACE;
56     protected UnicodeSet syntax = SYNTAX;
57     private UnicodeSet non_string = NON_STRING;
58 
fixSets()59     private void fixSets() {
60         if (syntax.containsSome(QUOTERS) || syntax.containsSome(whiteSpace)) {
61             syntax = ((UnicodeSet) syntax.clone()).removeAll(QUOTERS).removeAll(whiteSpace);
62         }
63         if (whiteSpace.containsSome(QUOTERS)) {
64             whiteSpace = ((UnicodeSet) whiteSpace.clone()).removeAll(QUOTERS);
65         }
66         non_string = new UnicodeSet(syntax).addAll(whiteSpace);
67     }
68 
setSource(String source)69     public Tokenizer setSource(String source) {
70         this.source = source;
71         this.index = 0;
72         return this; // for chaining
73     }
74 
setIndex(int index)75     public Tokenizer setIndex(int index) {
76         this.index = index;
77         return this; // for chaining
78     }
79 
80     public static final int DONE = -1,
81             NUMBER = -2,
82             STRING = -3,
83             UNICODESET = -4,
84             UNTERMINATED_QUOTE = -5,
85             BACKEDUP_TOO_FAR = -6;
86     private static final int
87             // FIRST = 0,
88             // IN_NUMBER = 1,
89             // IN_SPACE = 2,
90             AFTER_QUOTE = 3, // warning: order is important for switch statement
91             IN_STRING = 4,
92             AFTER_BSLASH = 5,
93             IN_QUOTE = 6;
94 
toString(int type, boolean backedupBefore)95     public String toString(int type, boolean backedupBefore) {
96         String s = backedup ? "@" : "*";
97         switch (type) {
98             case DONE:
99                 return s + "Done" + s;
100             case BACKEDUP_TOO_FAR:
101                 return s + "Illegal Backup" + s;
102             case UNTERMINATED_QUOTE:
103                 return s + "Unterminated Quote=" + getString() + s;
104             case STRING:
105                 return s + "s=" + getString() + s;
106             case NUMBER:
107                 return s + "n=" + getNumber() + s;
108             case UNICODESET:
109                 return s + "n=" + getUnicodeSet() + s;
110             default:
111                 return s + "c=" + usf.getName(type, true) + s;
112         }
113     }
114 
115     private static final BagFormatter usf = new BagFormatter();
116 
backup()117     public void backup() {
118         if (backedup) throw new IllegalArgumentException("backup too far");
119         backedup = true;
120         nextIndex = index;
121         index = lastIndex;
122     }
123 
/*
public int next2() {
    boolean backedupBefore = backedup;
    int result = next();
    System.out.println(toString(result, backedupBefore));
    return result;
}
*/
132 
next()133     public int next() {
134         if (backedup) {
135             backedup = false;
136             index = nextIndex;
137             return lastValue;
138         }
139         int cp = 0;
140         boolean inComment = false;
141         // clean off any leading whitespace or comments
142         while (true) {
143             if (index >= source.length()) return lastValue = DONE;
144             cp = nextChar();
145             if (inComment) {
146                 if (NEWLINE.contains(cp)) inComment = false;
147             } else {
148                 if (cp == '#') inComment = true;
149                 else if (!whiteSpace.contains(cp)) break;
150             }
151         }
152         // record the last index in case we have to backup
153         lastIndex = index;
154 
155         if (cp == '[') {
156             ParsePosition pos = new ParsePosition(index - 1);
157             unicodeSet = new UnicodeSet(source, pos, symbolTable);
158             index = pos.getIndex();
159             return lastValue = UNICODESET;
160         }
161         // get syntax character
162         if (syntax.contains(cp)) return lastValue = cp;
163 
164         // get number, if there is one
165         if (UCharacter.getType(cp) == Character.DECIMAL_DIGIT_NUMBER) {
166             number = UCharacter.getNumericValue(cp);
167             while (index < source.length()) {
168                 cp = nextChar();
169                 if (UCharacter.getType(cp) != Character.DECIMAL_DIGIT_NUMBER) {
170                     index -= UTF16.getCharCount(cp); // BACKUP!
171                     break;
172                 }
173                 number *= 10;
174                 number += UCharacter.getNumericValue(cp);
175             }
176             return lastValue = NUMBER;
177         }
178         buffer.setLength(0);
179         int status = IN_STRING;
180         main:
181         while (true) {
182             switch (status) {
183                 case AFTER_QUOTE: // check for double ''?
184                     if (cp == QUOTE) {
185                         UTF16.append(buffer, QUOTE);
186                         status = IN_QUOTE;
187                         break;
188                     }
189                     // OTHERWISE FALL THROUGH!!!
190                 case IN_STRING:
191                     if (cp == QUOTE) status = IN_QUOTE;
192                     else if (cp == BSLASH) status = AFTER_BSLASH;
193                     else if (non_string.contains(cp)) {
194                         index -= UTF16.getCharCount(cp); // BACKUP!
195                         break main;
196                     } else UTF16.append(buffer, cp);
197                     break;
198                 case IN_QUOTE:
199                     if (cp == QUOTE) status = AFTER_QUOTE;
200                     else UTF16.append(buffer, cp);
201                     break;
202                 case AFTER_BSLASH:
203                     switch (cp) {
204                         case 'n':
205                             cp = '\n';
206                             break;
207                         case 'r':
208                             cp = '\r';
209                             break;
210                         case 't':
211                             cp = '\t';
212                             break;
213                     }
214                     UTF16.append(buffer, cp);
215                     status = IN_STRING;
216                     break;
217                 default:
218                     throw new IllegalArgumentException("Internal Error");
219             }
220             if (index >= source.length()) break;
221             cp = nextChar();
222         }
223         if (status > IN_STRING) return lastValue = UNTERMINATED_QUOTE;
224         return lastValue = STRING;
225     }
226 
getString()227     public String getString() {
228         return buffer.toString();
229     }
230 
231     @Override
toString()232     public String toString() {
233         return source.substring(0, index) + "$$$" + source.substring(index);
234     }
235 
getNumber()236     public long getNumber() {
237         return number;
238     }
239 
getUnicodeSet()240     public UnicodeSet getUnicodeSet() {
241         return unicodeSet;
242     }
243 
nextChar()244     private int nextChar() {
245         int cp = UTF16.charAt(source, index);
246         index += UTF16.getCharCount(cp);
247         return cp;
248     }
249 
getIndex()250     public int getIndex() {
251         return index;
252     }
253 
getSource()254     public String getSource() {
255         return source;
256     }
257 
getSyntax()258     public UnicodeSet getSyntax() {
259         return syntax;
260     }
261 
getWhiteSpace()262     public UnicodeSet getWhiteSpace() {
263         return whiteSpace;
264     }
265 
setSyntax(UnicodeSet set)266     public void setSyntax(UnicodeSet set) {
267         syntax = set;
268         fixSets();
269     }
270 
setWhiteSpace(UnicodeSet set)271     public void setWhiteSpace(UnicodeSet set) {
272         whiteSpace = set;
273         fixSets();
274     }
275 
getLookedUpItems()276     public Set getLookedUpItems() {
277         return symbolTable.itemsLookedUp;
278     }
279 
addSymbol(String var, String value, int start, int limit)280     public void addSymbol(String var, String value, int start, int limit) {
281         // the limit is after the ';', so remove it
282         --limit;
283         char[] body = new char[limit - start];
284         value.getChars(start, limit, body, 0);
285         symbolTable.add(var, body);
286     }
287 
288     public class TokenSymbolTable implements SymbolTable {
289         Map contents = new HashMap();
290         Set itemsLookedUp = new HashSet();
291 
add(String var, char[] body)292         public void add(String var, char[] body) {
293             // start from 1 to avoid the $
294             contents.put(var.substring(1), body);
295         }
296 
297         /* (non-Javadoc)
298          * @see com.ibm.icu.text.SymbolTable#lookup(java.lang.String)
299          */
300         @Override
lookup(String s)301         public char[] lookup(String s) {
302             itemsLookedUp.add('$' + s);
303             return (char[]) contents.get(s);
304         }
305 
306         /* (non-Javadoc)
307          * @see com.ibm.icu.text.SymbolTable#lookupMatcher(int)
308          */
309         @Override
lookupMatcher(int ch)310         public UnicodeMatcher lookupMatcher(int ch) {
311             // TODO Auto-generated method stub
312             return null;
313         }
314 
315         /* (non-Javadoc)
316          * @see com.ibm.icu.text.SymbolTable#parseReference(java.lang.String, java.text.ParsePosition, int)
317          */
318         @Override
parseReference(String text, ParsePosition pos, int limit)319         public String parseReference(String text, ParsePosition pos, int limit) {
320             int cp;
321             int start = pos.getIndex();
322             int i;
323             for (i = start; i < limit; i += UTF16.getCharCount(cp)) {
324                 cp = UTF16.charAt(text, i);
325                 if (!com.ibm.icu.lang.UCharacter.isUnicodeIdentifierPart(cp)) {
326                     break;
327                 }
328             }
329             pos.setIndex(i);
330             return text.substring(start, i);
331         }
332     }
333 }
334