1 /* 2 ******************************************************************************* 3 * Copyright (C) 2002-2012, International Business Machines Corporation and * 4 * others. All Rights Reserved. * 5 ******************************************************************************* 6 */ 7 package org.unicode.cldr.util; 8 9 import java.text.ParsePosition; 10 import java.util.HashMap; 11 import java.util.HashSet; 12 import java.util.Map; 13 import java.util.Set; 14 15 import org.unicode.cldr.util.props.BagFormatter; 16 17 import com.ibm.icu.lang.UCharacter; 18 import com.ibm.icu.text.SymbolTable; 19 import com.ibm.icu.text.UTF16; 20 import com.ibm.icu.text.UnicodeMatcher; 21 import com.ibm.icu.text.UnicodeSet; 22 23 public class Tokenizer { 24 protected String source; 25 26 protected StringBuffer buffer = new StringBuffer(); 27 protected long number; 28 protected UnicodeSet unicodeSet = null; 29 protected int index; 30 boolean backedup = false; 31 protected int lastIndex = -1; 32 protected int nextIndex; 33 int lastValue = BACKEDUP_TOO_FAR; 34 TokenSymbolTable symbolTable = new TokenSymbolTable(); 35 36 private static final char QUOTE = '\'', 37 BSLASH = '\\'; 38 private static final UnicodeSet QUOTERS = new UnicodeSet().add(QUOTE).add(BSLASH); 39 private static final UnicodeSet WHITESPACE = new UnicodeSet("[" + 40 "\\u0009-\\u000D\\u0020\\u0085\\u200E\\u200F\\u2028\\u2029" + 41 "]"); 42 private static final UnicodeSet SYNTAX = new UnicodeSet("[" + 43 "\\u0021-\\u002F\\u003A-\\u0040\\u005B-\\u0060\\u007B-\\u007E" + 44 "\\u00A1-\\u00A7\\u00A9\\u00AB-\\u00AC\\u00AE" + 45 "\\u00B0-\\u00B1\\u00B6\\u00B7\\u00BB\\u00BF\\u00D7\\u00F7" + 46 "\\u2010-\\u2027\\u2030-\\u205E\\u2190-\\u2BFF" + 47 "\\u3001\\u3003\\u3008-\\u3020\\u3030" + 48 "\\uFD3E\\uFD3F\\uFE45\\uFE46" + 49 "]").removeAll(QUOTERS).remove('$'); 50 private static final UnicodeSet NEWLINE = new UnicodeSet("[\\u000A\\u000D\\u0085\\u2028\\u2029]"); 51 //private static final UnicodeSet DECIMAL = new UnicodeSet("[:Nd:]"); 52 private static final UnicodeSet NON_STRING = new UnicodeSet() 53 .addAll(WHITESPACE) 54 .addAll(SYNTAX); 55 56 protected UnicodeSet whiteSpace = WHITESPACE; 57 protected UnicodeSet syntax = SYNTAX; 58 private UnicodeSet non_string = NON_STRING; 59 fixSets()60 private void fixSets() { 61 if (syntax.containsSome(QUOTERS) || syntax.containsSome(whiteSpace)) { 62 syntax = ((UnicodeSet) syntax.clone()).removeAll(QUOTERS).removeAll(whiteSpace); 63 } 64 if (whiteSpace.containsSome(QUOTERS)) { 65 whiteSpace = ((UnicodeSet) whiteSpace.clone()).removeAll(QUOTERS); 66 } 67 non_string = new UnicodeSet(syntax) 68 .addAll(whiteSpace); 69 } 70 setSource(String source)71 public Tokenizer setSource(String source) { 72 this.source = source; 73 this.index = 0; 74 return this; // for chaining 75 } 76 setIndex(int index)77 public Tokenizer setIndex(int index) { 78 this.index = index; 79 return this; // for chaining 80 } 81 82 public static final int DONE = -1, 83 NUMBER = -2, 84 STRING = -3, 85 UNICODESET = -4, 86 UNTERMINATED_QUOTE = -5, 87 BACKEDUP_TOO_FAR = -6; 88 89 private static final int 90 //FIRST = 0, 91 //IN_NUMBER = 1, 92 //IN_SPACE = 2, 93 AFTER_QUOTE = 3, // warning: order is important for switch statement 94 IN_STRING = 4, 95 AFTER_BSLASH = 5, 96 IN_QUOTE = 6; 97 toString(int type, boolean backedupBefore)98 public String toString(int type, boolean backedupBefore) { 99 String s = backedup ? "@" : "*"; 100 switch (type) { 101 case DONE: 102 return s + "Done" + s; 103 case BACKEDUP_TOO_FAR: 104 return s + "Illegal Backup" + s; 105 case UNTERMINATED_QUOTE: 106 return s + "Unterminated Quote=" + getString() + s; 107 case STRING: 108 return s + "s=" + getString() + s; 109 case NUMBER: 110 return s + "n=" + getNumber() + s; 111 case UNICODESET: 112 return s + "n=" + getUnicodeSet() + s; 113 default: 114 return s + "c=" + usf.getName(type, true) + s; 115 } 116 } 117 118 private static final BagFormatter usf = new BagFormatter(); 119 backup()120 public void backup() { 121 if (backedup) throw new IllegalArgumentException("backup too far"); 122 backedup = true; 123 nextIndex = index; 124 index = lastIndex; 125 } 126 127 /* 128 public int next2() { 129 boolean backedupBefore = backedup; 130 int result = next(); 131 System.out.println(toString(result, backedupBefore)); 132 return result; 133 } 134 */ 135 next()136 public int next() { 137 if (backedup) { 138 backedup = false; 139 index = nextIndex; 140 return lastValue; 141 } 142 int cp = 0; 143 boolean inComment = false; 144 // clean off any leading whitespace or comments 145 while (true) { 146 if (index >= source.length()) return lastValue = DONE; 147 cp = nextChar(); 148 if (inComment) { 149 if (NEWLINE.contains(cp)) inComment = false; 150 } else { 151 if (cp == '#') 152 inComment = true; 153 else if (!whiteSpace.contains(cp)) break; 154 } 155 } 156 // record the last index in case we have to backup 157 lastIndex = index; 158 159 if (cp == '[') { 160 ParsePosition pos = new ParsePosition(index - 1); 161 unicodeSet = new UnicodeSet(source, pos, symbolTable); 162 index = pos.getIndex(); 163 return lastValue = UNICODESET; 164 } 165 // get syntax character 166 if (syntax.contains(cp)) return lastValue = cp; 167 168 // get number, if there is one 169 if (UCharacter.getType(cp) == Character.DECIMAL_DIGIT_NUMBER) { 170 number = UCharacter.getNumericValue(cp); 171 while (index < source.length()) { 172 cp = nextChar(); 173 if (UCharacter.getType(cp) != Character.DECIMAL_DIGIT_NUMBER) { 174 index -= UTF16.getCharCount(cp); // BACKUP! 175 break; 176 } 177 number *= 10; 178 number += UCharacter.getNumericValue(cp); 179 } 180 return lastValue = NUMBER; 181 } 182 buffer.setLength(0); 183 int status = IN_STRING; 184 main: while (true) { 185 switch (status) { 186 case AFTER_QUOTE: // check for double ''? 187 if (cp == QUOTE) { 188 UTF16.append(buffer, QUOTE); 189 status = IN_QUOTE; 190 break; 191 } 192 // OTHERWISE FALL THROUGH!!! 193 case IN_STRING: 194 if (cp == QUOTE) 195 status = IN_QUOTE; 196 else if (cp == BSLASH) 197 status = AFTER_BSLASH; 198 else if (non_string.contains(cp)) { 199 index -= UTF16.getCharCount(cp); // BACKUP! 200 break main; 201 } else 202 UTF16.append(buffer, cp); 203 break; 204 case IN_QUOTE: 205 if (cp == QUOTE) 206 status = AFTER_QUOTE; 207 else 208 UTF16.append(buffer, cp); 209 break; 210 case AFTER_BSLASH: 211 switch (cp) { 212 case 'n': 213 cp = '\n'; 214 break; 215 case 'r': 216 cp = '\r'; 217 break; 218 case 't': 219 cp = '\t'; 220 break; 221 } 222 UTF16.append(buffer, cp); 223 status = IN_STRING; 224 break; 225 default: 226 throw new IllegalArgumentException("Internal Error"); 227 } 228 if (index >= source.length()) break; 229 cp = nextChar(); 230 } 231 if (status > IN_STRING) return lastValue = UNTERMINATED_QUOTE; 232 return lastValue = STRING; 233 } 234 getString()235 public String getString() { 236 return buffer.toString(); 237 } 238 239 @Override toString()240 public String toString() { 241 return source.substring(0, index) + "$$$" + source.substring(index); 242 } 243 getNumber()244 public long getNumber() { 245 return number; 246 } 247 getUnicodeSet()248 public UnicodeSet getUnicodeSet() { 249 return unicodeSet; 250 } 251 nextChar()252 private int nextChar() { 253 int cp = UTF16.charAt(source, index); 254 index += UTF16.getCharCount(cp); 255 return cp; 256 } 257 getIndex()258 public int getIndex() { 259 return index; 260 } 261 getSource()262 public String getSource() { 263 return source; 264 } 265 getSyntax()266 public UnicodeSet getSyntax() { 267 return syntax; 268 } 269 getWhiteSpace()270 public UnicodeSet getWhiteSpace() { 271 return whiteSpace; 272 } 273 setSyntax(UnicodeSet set)274 public void setSyntax(UnicodeSet set) { 275 syntax = set; 276 fixSets(); 277 } 278 setWhiteSpace(UnicodeSet set)279 public void setWhiteSpace(UnicodeSet set) { 280 whiteSpace = set; 281 fixSets(); 282 } 283 getLookedUpItems()284 public Set getLookedUpItems() { 285 return symbolTable.itemsLookedUp; 286 } 287 addSymbol(String var, String value, int start, int limit)288 public void addSymbol(String var, String value, int start, int limit) { 289 // the limit is after the ';', so remove it 290 --limit; 291 char[] body = new char[limit - start]; 292 value.getChars(start, limit, body, 0); 293 symbolTable.add(var, body); 294 } 295 296 public class TokenSymbolTable implements SymbolTable { 297 Map contents = new HashMap(); 298 Set itemsLookedUp = new HashSet(); 299 add(String var, char[] body)300 public void add(String var, char[] body) { 301 // start from 1 to avoid the $ 302 contents.put(var.substring(1), body); 303 } 304 305 /* (non-Javadoc) 306 * @see com.ibm.icu.text.SymbolTable#lookup(java.lang.String) 307 */ 308 @Override lookup(String s)309 public char[] lookup(String s) { 310 itemsLookedUp.add('$' + s); 311 return (char[]) contents.get(s); 312 } 313 314 /* (non-Javadoc) 315 * @see com.ibm.icu.text.SymbolTable#lookupMatcher(int) 316 */ 317 @Override lookupMatcher(int ch)318 public UnicodeMatcher lookupMatcher(int ch) { 319 // TODO Auto-generated method stub 320 return null; 321 } 322 323 /* (non-Javadoc) 324 * @see com.ibm.icu.text.SymbolTable#parseReference(java.lang.String, java.text.ParsePosition, int) 325 */ 326 @Override parseReference(String text, ParsePosition pos, int limit)327 public String parseReference(String text, ParsePosition pos, int limit) { 328 int cp; 329 int start = pos.getIndex(); 330 int i; 331 for (i = start; i < limit; i += UTF16.getCharCount(cp)) { 332 cp = UTF16.charAt(text, i); 333 if (!com.ibm.icu.lang.UCharacter.isUnicodeIdentifierPart(cp)) { 334 break; 335 } 336 } 337 pos.setIndex(i); 338 return text.substring(start, i); 339 } 340 341 } 342 } 343