1 /* 2 ******************************************************************************* 3 * Copyright (C) 2002-2012, International Business Machines Corporation and * 4 * others. All Rights Reserved. * 5 ******************************************************************************* 6 */ 7 package org.unicode.cldr.util; 8 9 import java.text.ParsePosition; 10 import java.util.HashMap; 11 import java.util.HashSet; 12 import java.util.Map; 13 import java.util.Set; 14 15 import org.unicode.cldr.util.props.BagFormatter; 16 17 import com.ibm.icu.lang.UCharacter; 18 import com.ibm.icu.text.SymbolTable; 19 import com.ibm.icu.text.UTF16; 20 import com.ibm.icu.text.UnicodeMatcher; 21 import com.ibm.icu.text.UnicodeSet; 22 23 public class Tokenizer { 24 protected String source; 25 26 protected StringBuffer buffer = new StringBuffer(); 27 protected long number; 28 protected UnicodeSet unicodeSet = null; 29 protected int index; 30 boolean backedup = false; 31 protected int lastIndex = -1; 32 protected int nextIndex; 33 int lastValue = BACKEDUP_TOO_FAR; 34 TokenSymbolTable symbolTable = new TokenSymbolTable(); 35 36 private static final char QUOTE = '\'', 37 BSLASH = '\\'; 38 private static final UnicodeSet QUOTERS = new UnicodeSet().add(QUOTE).add(BSLASH); 39 private static final UnicodeSet WHITESPACE = new UnicodeSet("[" + 40 "\\u0009-\\u000D\\u0020\\u0085\\u200E\\u200F\\u2028\\u2029" + 41 "]"); 42 private static final UnicodeSet SYNTAX = new UnicodeSet("[" + 43 "\\u0021-\\u002F\\u003A-\\u0040\\u005B-\\u0060\\u007B-\\u007E" + 44 "\\u00A1-\\u00A7\\u00A9\\u00AB-\\u00AC\\u00AE" + 45 "\\u00B0-\\u00B1\\u00B6\\u00B7\\u00BB\\u00BF\\u00D7\\u00F7" + 46 "\\u2010-\\u2027\\u2030-\\u205E\\u2190-\\u2BFF" + 47 "\\u3001\\u3003\\u3008-\\u3020\\u3030" + 48 "\\uFD3E\\uFD3F\\uFE45\\uFE46" + 49 "]").removeAll(QUOTERS).remove('$'); 50 private static final UnicodeSet NEWLINE = new UnicodeSet("[\\u000A\\u000D\\u0085\\u2028\\u2029]"); 51 //private static final UnicodeSet DECIMAL = new UnicodeSet("[:Nd:]"); 52 private static final UnicodeSet NON_STRING = new UnicodeSet() 53 .addAll(WHITESPACE) 54 .addAll(SYNTAX); 55 56 protected UnicodeSet whiteSpace = WHITESPACE; 57 protected UnicodeSet syntax = SYNTAX; 58 private UnicodeSet non_string = NON_STRING; 59 fixSets()60 private void fixSets() { 61 if (syntax.containsSome(QUOTERS) || syntax.containsSome(whiteSpace)) { 62 syntax = ((UnicodeSet) syntax.clone()).removeAll(QUOTERS).removeAll(whiteSpace); 63 } 64 if (whiteSpace.containsSome(QUOTERS)) { 65 whiteSpace = ((UnicodeSet) whiteSpace.clone()).removeAll(QUOTERS); 66 } 67 non_string = new UnicodeSet(syntax) 68 .addAll(whiteSpace); 69 } 70 setSource(String source)71 public Tokenizer setSource(String source) { 72 this.source = source; 73 this.index = 0; 74 return this; // for chaining 75 } 76 setIndex(int index)77 public Tokenizer setIndex(int index) { 78 this.index = index; 79 return this; // for chaining 80 } 81 82 public static final int DONE = -1, 83 NUMBER = -2, 84 STRING = -3, 85 UNICODESET = -4, 86 UNTERMINATED_QUOTE = -5, 87 BACKEDUP_TOO_FAR = -6; 88 89 private static final int 90 //FIRST = 0, 91 //IN_NUMBER = 1, 92 //IN_SPACE = 2, 93 AFTER_QUOTE = 3, // warning: order is important for switch statement 94 IN_STRING = 4, 95 AFTER_BSLASH = 5, 96 IN_QUOTE = 6; 97 toString(int type, boolean backedupBefore)98 public String toString(int type, boolean backedupBefore) { 99 String s = backedup ? "@" : "*"; 100 switch (type) { 101 case DONE: 102 return s + "Done" + s; 103 case BACKEDUP_TOO_FAR: 104 return s + "Illegal Backup" + s; 105 case UNTERMINATED_QUOTE: 106 return s + "Unterminated Quote=" + getString() + s; 107 case STRING: 108 return s + "s=" + getString() + s; 109 case NUMBER: 110 return s + "n=" + getNumber() + s; 111 case UNICODESET: 112 return s + "n=" + getUnicodeSet() + s; 113 default: 114 return s + "c=" + usf.getName(type, true) + s; 115 } 116 } 117 118 private static final BagFormatter usf = new BagFormatter(); 119 backup()120 public void backup() { 121 if (backedup) throw new IllegalArgumentException("backup too far"); 122 backedup = true; 123 nextIndex = index; 124 index = lastIndex; 125 } 126 127 /* 128 public int next2() { 129 boolean backedupBefore = backedup; 130 int result = next(); 131 System.out.println(toString(result, backedupBefore)); 132 return result; 133 } 134 */ 135 next()136 public int next() { 137 if (backedup) { 138 backedup = false; 139 index = nextIndex; 140 return lastValue; 141 } 142 int cp = 0; 143 boolean inComment = false; 144 // clean off any leading whitespace or comments 145 while (true) { 146 if (index >= source.length()) return lastValue = DONE; 147 cp = nextChar(); 148 if (inComment) { 149 if (NEWLINE.contains(cp)) inComment = false; 150 } else { 151 if (cp == '#') 152 inComment = true; 153 else if (!whiteSpace.contains(cp)) break; 154 } 155 } 156 // record the last index in case we have to backup 157 lastIndex = index; 158 159 if (cp == '[') { 160 ParsePosition pos = new ParsePosition(index - 1); 161 unicodeSet = new UnicodeSet(source, pos, symbolTable); 162 index = pos.getIndex(); 163 return lastValue = UNICODESET; 164 } 165 // get syntax character 166 if (syntax.contains(cp)) return lastValue = cp; 167 168 // get number, if there is one 169 if (UCharacter.getType(cp) == Character.DECIMAL_DIGIT_NUMBER) { 170 number = UCharacter.getNumericValue(cp); 171 while (index < source.length()) { 172 cp = nextChar(); 173 if (UCharacter.getType(cp) != Character.DECIMAL_DIGIT_NUMBER) { 174 index -= UTF16.getCharCount(cp); // BACKUP! 175 break; 176 } 177 number *= 10; 178 number += UCharacter.getNumericValue(cp); 179 } 180 return lastValue = NUMBER; 181 } 182 buffer.setLength(0); 183 int status = IN_STRING; 184 main: while (true) { 185 switch (status) { 186 case AFTER_QUOTE: // check for double ''? 187 if (cp == QUOTE) { 188 UTF16.append(buffer, QUOTE); 189 status = IN_QUOTE; 190 break; 191 } 192 // OTHERWISE FALL THROUGH!!! 193 case IN_STRING: 194 if (cp == QUOTE) 195 status = IN_QUOTE; 196 else if (cp == BSLASH) 197 status = AFTER_BSLASH; 198 else if (non_string.contains(cp)) { 199 index -= UTF16.getCharCount(cp); // BACKUP! 200 break main; 201 } else 202 UTF16.append(buffer, cp); 203 break; 204 case IN_QUOTE: 205 if (cp == QUOTE) 206 status = AFTER_QUOTE; 207 else 208 UTF16.append(buffer, cp); 209 break; 210 case AFTER_BSLASH: 211 switch (cp) { 212 case 'n': 213 cp = '\n'; 214 break; 215 case 'r': 216 cp = '\r'; 217 break; 218 case 't': 219 cp = '\t'; 220 break; 221 } 222 UTF16.append(buffer, cp); 223 status = IN_STRING; 224 break; 225 default: 226 throw new IllegalArgumentException("Internal Error"); 227 } 228 if (index >= source.length()) break; 229 cp = nextChar(); 230 } 231 if (status > IN_STRING) return lastValue = UNTERMINATED_QUOTE; 232 return lastValue = STRING; 233 } 234 getString()235 public String getString() { 236 return buffer.toString(); 237 } 238 toString()239 public String toString() { 240 return source.substring(0, index) + "$$$" + source.substring(index); 241 } 242 getNumber()243 public long getNumber() { 244 return number; 245 } 246 getUnicodeSet()247 public UnicodeSet getUnicodeSet() { 248 return unicodeSet; 249 } 250 nextChar()251 private int nextChar() { 252 int cp = UTF16.charAt(source, index); 253 index += UTF16.getCharCount(cp); 254 return cp; 255 } 256 getIndex()257 public int getIndex() { 258 return index; 259 } 260 getSource()261 public String getSource() { 262 return source; 263 } 264 getSyntax()265 public UnicodeSet getSyntax() { 266 return syntax; 267 } 268 getWhiteSpace()269 public UnicodeSet getWhiteSpace() { 270 return whiteSpace; 271 } 272 setSyntax(UnicodeSet set)273 public void setSyntax(UnicodeSet set) { 274 syntax = set; 275 fixSets(); 276 } 277 setWhiteSpace(UnicodeSet set)278 public void setWhiteSpace(UnicodeSet set) { 279 whiteSpace = set; 280 fixSets(); 281 } 282 getLookedUpItems()283 public Set getLookedUpItems() { 284 return symbolTable.itemsLookedUp; 285 } 286 addSymbol(String var, String value, int start, int limit)287 public void addSymbol(String var, String value, int start, int limit) { 288 // the limit is after the ';', so remove it 289 --limit; 290 char[] body = new char[limit - start]; 291 value.getChars(start, limit, body, 0); 292 symbolTable.add(var, body); 293 } 294 295 public class TokenSymbolTable implements SymbolTable { 296 Map contents = new HashMap(); 297 Set itemsLookedUp = new HashSet(); 298 add(String var, char[] body)299 public void add(String var, char[] body) { 300 // start from 1 to avoid the $ 301 contents.put(var.substring(1), body); 302 } 303 304 /* (non-Javadoc) 305 * @see com.ibm.icu.text.SymbolTable#lookup(java.lang.String) 306 */ lookup(String s)307 public char[] lookup(String s) { 308 itemsLookedUp.add('$' + s); 309 return (char[]) contents.get(s); 310 } 311 312 /* (non-Javadoc) 313 * @see com.ibm.icu.text.SymbolTable#lookupMatcher(int) 314 */ lookupMatcher(int ch)315 public UnicodeMatcher lookupMatcher(int ch) { 316 // TODO Auto-generated method stub 317 return null; 318 } 319 320 /* (non-Javadoc) 321 * @see com.ibm.icu.text.SymbolTable#parseReference(java.lang.String, java.text.ParsePosition, int) 322 */ parseReference(String text, ParsePosition pos, int limit)323 public String parseReference(String text, ParsePosition pos, int limit) { 324 int cp; 325 int start = pos.getIndex(); 326 int i; 327 for (i = start; i < limit; i += UTF16.getCharCount(cp)) { 328 cp = UTF16.charAt(text, i); 329 if (!com.ibm.icu.lang.UCharacter.isUnicodeIdentifierPart(cp)) { 330 break; 331 } 332 } 333 pos.setIndex(i); 334 return text.substring(start, i); 335 } 336 337 } 338 } 339