/*
 *******************************************************************************
 * Copyright (C) 2002-2012, International Business Machines Corporation and    *
 * others. All Rights Reserved.                                                *
 *******************************************************************************
 */
package org.unicode.cldr.util;

import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.SymbolTable;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeMatcher;
import com.ibm.icu.text.UnicodeSet;
import java.text.ParsePosition;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import org.unicode.cldr.util.props.BagFormatter;

/**
 * Splits a source string into tokens: decimal numbers, (possibly quoted or backslash-escaped)
 * strings, UnicodeSet patterns starting with {@code '['}, and single syntax code points.
 * Whitespace and {@code '#'} comments (running to the next newline character) are skipped
 * between tokens.
 *
 * <p>Typical use: call {@link #setSource(String)}, then call {@link #next()} repeatedly until it
 * returns {@link #DONE}. After each call, the token payload is available through
 * {@link #getString()}, {@link #getNumber()} or {@link #getUnicodeSet()}, depending on the
 * returned token type. A single token of look-behind is supported through {@link #backup()}.
 */
public class Tokenizer {
    /** The text being tokenized; set by {@link #setSource(String)}. */
    protected String source;

    /** Accumulates the contents of the most recent STRING / UNTERMINATED_QUOTE token. */
    protected StringBuffer buffer = new StringBuffer();

    /** Value of the most recent NUMBER token. */
    protected long number;

    /** Value of the most recent UNICODESET token. */
    protected UnicodeSet unicodeSet = null;

    /** Offset (in UTF-16 code units) of the next character to read from {@link #source}. */
    protected int index;

    /** True while a {@link #backup()} is pending, i.e. the next {@link #next()} replays a token. */
    boolean backedup = false;

    /** Value of {@link #index} recorded by {@link #next()} just after the first code point of the last token. */
    protected int lastIndex = -1;

    /** Value of {@link #index} to restore when a pending backup is consumed. */
    protected int nextIndex;

    /** Result of the most recent {@link #next()}; replayed after {@link #backup()}. */
    int lastValue = BACKEDUP_TOO_FAR;

    /** Variable table handed to the UnicodeSet parser for {@code $name} references. */
    TokenSymbolTable symbolTable = new TokenSymbolTable();

    private static final char QUOTE = '\'', BSLASH = '\\';
    private static final UnicodeSet QUOTERS = new UnicodeSet().add(QUOTE).add(BSLASH);
    private static final UnicodeSet WHITESPACE =
            new UnicodeSet(
                    "[" + "\\u0009-\\u000D\\u0020\\u0085\\u200E\\u200F\\u2028\\u2029" + "]");
    // Default syntax characters; the quote, the backslash and '$' are explicitly excluded.
    private static final UnicodeSet SYNTAX =
            new UnicodeSet(
                            "["
                                    + "\\u0021-\\u002F\\u003A-\\u0040\\u005B-\\u0060\\u007B-\\u007E"
                                    + "\\u00A1-\\u00A7\\u00A9\\u00AB-\\u00AC\\u00AE"
                                    + "\\u00B0-\\u00B1\\u00B6\\u00B7\\u00BB\\u00BF\\u00D7\\u00F7"
                                    + "\\u2010-\\u2027\\u2030-\\u205E\\u2190-\\u2BFF"
                                    + "\\u3001\\u3003\\u3008-\\u3020\\u3030"
                                    + "\\uFD3E\\uFD3F\\uFE45\\uFE46"
                                    + "]")
                    .removeAll(QUOTERS)
                    .remove('$');
    // Characters that terminate a '#' comment.
    private static final UnicodeSet NEWLINE =
            new UnicodeSet("[\\u000A\\u000D\\u0085\\u2028\\u2029]");
    // private static final UnicodeSet DECIMAL = new UnicodeSet("[:Nd:]");
    private static final UnicodeSet NON_STRING =
            new UnicodeSet().addAll(WHITESPACE).addAll(SYNTAX);

    // NOTE: the shared defaults above are intentionally not frozen. fixSets() relies on
    // clone() producing a modifiable copy before calling removeAll().
    protected UnicodeSet whiteSpace = WHITESPACE;
    protected UnicodeSet syntax = SYNTAX;
    private UnicodeSet non_string = NON_STRING;

    /**
     * Re-establishes the invariants between the character classes after {@link #syntax} or
     * {@link #whiteSpace} has been replaced: syntax contains neither quoting characters nor
     * whitespace, whitespace contains no quoting characters, and {@code non_string} is the
     * union of the two. Sets are cloned before being modified.
     */
    private void fixSets() {
        if (syntax.containsSome(QUOTERS) || syntax.containsSome(whiteSpace)) {
            syntax = ((UnicodeSet) syntax.clone()).removeAll(QUOTERS).removeAll(whiteSpace);
        }
        if (whiteSpace.containsSome(QUOTERS)) {
            whiteSpace = ((UnicodeSet) whiteSpace.clone()).removeAll(QUOTERS);
        }
        non_string = new UnicodeSet(syntax).addAll(whiteSpace);
    }

    /**
     * Sets the text to tokenize and restarts tokenizing from its beginning.
     *
     * <p>Any pending {@link #backup()} and the remembered last token are discarded, so that a
     * token belonging to the previous source can never be replayed against the new one.
     *
     * @param source the text to tokenize
     * @return this tokenizer, for chaining
     */
    public Tokenizer setSource(String source) {
        this.source = source;
        this.index = 0;
        // Drop look-behind state tied to the previous source.
        this.backedup = false;
        this.lastIndex = -1;
        this.lastValue = BACKEDUP_TOO_FAR;
        return this; // for chaining
    }

    /**
     * Moves the read position.
     *
     * <p>A pending {@link #backup()} is cancelled; otherwise the following {@link #next()}
     * would replay the old token and overwrite the position set here.
     *
     * @param index offset into the source, in UTF-16 code units
     * @return this tokenizer, for chaining
     */
    public Tokenizer setIndex(int index) {
        this.index = index;
        this.backedup = false;
        return this; // for chaining
    }

    /** Token types returned by {@link #next()}; any non-negative result is a syntax code point. */
    public static final int DONE = -1,
            NUMBER = -2,
            STRING = -3,
            UNICODESET = -4,
            UNTERMINATED_QUOTE = -5,
            BACKEDUP_TOO_FAR = -6;

    // States of the string scanner in next().
    private static final int
            // FIRST = 0,
            // IN_NUMBER = 1,
            // IN_SPACE = 2,
            AFTER_QUOTE = 3, // warning: order is important for switch statement
            IN_STRING = 4,
            AFTER_BSLASH = 5,
            IN_QUOTE = 6;

    /**
     * Formats a token for debugging output.
     *
     * @param type a value returned by {@link #next()}
     * @param backedupBefore whether a backup was pending before the {@link #next()} call that
     *     produced {@code type}; selects the delimiter {@code "@"} (replayed token) instead of
     *     {@code "*"}
     * @return a short description of the token wrapped in the delimiter
     */
    public String toString(int type, boolean backedupBefore) {
        // Use the caller-supplied flag: the 'backedup' field has already been cleared by next().
        String s = backedupBefore ? "@" : "*";
        switch (type) {
            case DONE:
                return s + "Done" + s;
            case BACKEDUP_TOO_FAR:
                return s + "Illegal Backup" + s;
            case UNTERMINATED_QUOTE:
                return s + "Unterminated Quote=" + getString() + s;
            case STRING:
                return s + "s=" + getString() + s;
            case NUMBER:
                return s + "n=" + getNumber() + s;
            case UNICODESET:
                // NOTE(review): this label duplicates NUMBER's "n=" - possibly a copy/paste slip;
                // left unchanged because it is runtime output.
                return s + "n=" + getUnicodeSet() + s;
            default:
                return s + "c=" + usf.getName(type, true) + s;
        }
    }

    private static final BagFormatter usf = new BagFormatter();

    /**
     * Pushes the last token back so that the following {@link #next()} returns it again.
     * Only one token of backup is supported.
     *
     * @throws IllegalArgumentException if a backup is already pending
     */
    public void backup() {
        if (backedup) {
            throw new IllegalArgumentException("backup too far");
        }
        backedup = true;
        nextIndex = index;
        index = lastIndex;
    }

    /*
    public int next2() {
        boolean backedupBefore = backedup;
        int result = next();
        System.out.println(toString(result, backedupBefore));
        return result;
    }
    */

    /**
     * Reads the next token.
     *
     * @return {@link #DONE} at the end of the source; {@link #UNICODESET} for a set pattern
     *     starting with {@code '['}; {@link #NUMBER} for a run of decimal digits;
     *     {@link #STRING} for any other text; {@link #UNTERMINATED_QUOTE} if the source ends
     *     inside a quote or right after a backslash; the code point itself for a syntax
     *     character; or the previously returned value if a {@link #backup()} was pending
     *     ({@link #BACKEDUP_TOO_FAR} when no token had been read yet)
     */
    public int next() {
        if (backedup) {
            // Replay the token that was pushed back and restore the position after it.
            backedup = false;
            index = nextIndex;
            return lastValue;
        }
        int cp = 0;
        boolean inComment = false;
        // clean off any leading whitespace or comments
        while (true) {
            if (index >= source.length()) {
                return lastValue = DONE;
            }
            cp = nextChar();
            if (inComment) {
                if (NEWLINE.contains(cp)) {
                    inComment = false;
                }
            } else {
                if (cp == '#') {
                    inComment = true;
                } else if (!whiteSpace.contains(cp)) {
                    break;
                }
            }
        }
        // record the last index in case we have to backup
        lastIndex = index;

        // Checked before the syntax test, so '[' always starts a UnicodeSet.
        if (cp == '[') {
            // index is already past the '[' (one code unit), so the pattern starts at index - 1.
            ParsePosition pos = new ParsePosition(index - 1);
            unicodeSet = new UnicodeSet(source, pos, symbolTable);
            index = pos.getIndex();
            return lastValue = UNICODESET;
        }
        // get syntax character
        if (syntax.contains(cp)) {
            return lastValue = cp;
        }

        // get number, if there is one
        if (UCharacter.getType(cp) == Character.DECIMAL_DIGIT_NUMBER) {
            number = UCharacter.getNumericValue(cp);
            while (index < source.length()) {
                cp = nextChar();
                if (UCharacter.getType(cp) != Character.DECIMAL_DIGIT_NUMBER) {
                    index -= UTF16.getCharCount(cp); // BACKUP!
                    break;
                }
                number *= 10;
                number += UCharacter.getNumericValue(cp);
            }
            return lastValue = NUMBER;
        }

        // Otherwise scan a string, honoring '...' quoting ('' is a literal quote)
        // and backslash escapes.
        buffer.setLength(0);
        int status = IN_STRING;
        main:
        while (true) {
            switch (status) {
                case AFTER_QUOTE: // check for double ''?
                    if (cp == QUOTE) {
                        UTF16.append(buffer, QUOTE);
                        status = IN_QUOTE;
                        break;
                    }
                    // OTHERWISE FALL THROUGH!!!
                case IN_STRING:
                    if (cp == QUOTE) {
                        status = IN_QUOTE;
                    } else if (cp == BSLASH) {
                        status = AFTER_BSLASH;
                    } else if (non_string.contains(cp)) {
                        // Whitespace or syntax ends the string; leave it for the next token.
                        index -= UTF16.getCharCount(cp); // BACKUP!
                        break main;
                    } else {
                        UTF16.append(buffer, cp);
                    }
                    break;
                case IN_QUOTE:
                    if (cp == QUOTE) {
                        status = AFTER_QUOTE;
                    } else {
                        UTF16.append(buffer, cp);
                    }
                    break;
                case AFTER_BSLASH:
                    // \n, \r and \t are translated; any other escaped character is taken literally.
                    switch (cp) {
                        case 'n':
                            cp = '\n';
                            break;
                        case 'r':
                            cp = '\r';
                            break;
                        case 't':
                            cp = '\t';
                            break;
                    }
                    UTF16.append(buffer, cp);
                    status = IN_STRING;
                    break;
                default:
                    throw new IllegalArgumentException("Internal Error");
            }
            if (index >= source.length()) {
                break;
            }
            cp = nextChar();
        }
        // Relies on the state ordering: AFTER_BSLASH and IN_QUOTE are greater than IN_STRING.
        if (status > IN_STRING) {
            return lastValue = UNTERMINATED_QUOTE;
        }
        return lastValue = STRING;
    }

    /** Returns the text of the most recent STRING or UNTERMINATED_QUOTE token. */
    public String getString() {
        return buffer.toString();
    }

    /**
     * Returns the source with the marker {@code $$$} inserted at the current position, for
     * debugging. Safe to call before a source is set or while the position is out of range
     * (for example after {@link #backup()} with no token read).
     */
    @Override
    public String toString() {
        if (source == null) {
            return "$$$";
        }
        int split = Math.max(0, Math.min(index, source.length()));
        return source.substring(0, split) + "$$$" + source.substring(split);
    }

    /** Returns the value of the most recent NUMBER token. */
    public long getNumber() {
        return number;
    }

    /** Returns the value of the most recent UNICODESET token. */
    public UnicodeSet getUnicodeSet() {
        return unicodeSet;
    }

    /** Reads the code point at {@link #index} and advances past it. */
    private int nextChar() {
        int cp = UTF16.charAt(source, index);
        index += UTF16.getCharCount(cp);
        return cp;
    }

    /** Returns the current read position, in UTF-16 code units. */
    public int getIndex() {
        return index;
    }

    /** Returns the text being tokenized. */
    public String getSource() {
        return source;
    }

    /** Returns the set of code points currently treated as syntax characters. */
    public UnicodeSet getSyntax() {
        return syntax;
    }

    /** Returns the set of code points currently treated as whitespace. */
    public UnicodeSet getWhiteSpace() {
        return whiteSpace;
    }

    /**
     * Replaces the syntax character set. Quoting characters and whitespace are removed from a
     * copy of the set if it contains any.
     *
     * @param set the new syntax characters
     */
    public void setSyntax(UnicodeSet set) {
        syntax = set;
        fixSets();
    }

    /**
     * Replaces the whitespace character set. Quoting characters are removed from a copy of the
     * set if it contains any, and whitespace is removed from the syntax set.
     *
     * @param set the new whitespace characters
     */
    public void setWhiteSpace(UnicodeSet set) {
        whiteSpace = set;
        fixSets();
    }

    /**
     * Returns the names (each prefixed with {@code '$'}) of all variables that have been looked
     * up in the symbol table so far. The returned set is the live internal set.
     */
    @SuppressWarnings("rawtypes") // raw return type kept for compatibility with existing callers
    public Set getLookedUpItems() {
        return symbolTable.itemsLookedUp;
    }

    /**
     * Defines a variable for use in UnicodeSet patterns.
     *
     * @param var the variable name, including its leading {@code '$'}
     * @param value the text containing the variable's definition
     * @param start start offset of the definition body within {@code value}
     * @param limit offset just past the terminating {@code ';'} of the definition
     */
    public void addSymbol(String var, String value, int start, int limit) {
        // the limit is after the ';', so remove it
        --limit;
        char[] body = new char[limit - start];
        value.getChars(start, limit, body, 0);
        symbolTable.add(var, body);
    }

    /** Symbol table mapping variable names (without the {@code '$'}) to their definitions. */
    public class TokenSymbolTable implements SymbolTable {
        Map<String, char[]> contents = new HashMap<>();
        Set<String> itemsLookedUp = new HashSet<>();

        /**
         * Stores a variable definition.
         *
         * @param var the variable name, including its leading {@code '$'}
         * @param body the variable's value
         */
        public void add(String var, char[] body) {
            // start from 1 to avoid the $
            contents.put(var.substring(1), body);
        }

        /**
         * Records the lookup and returns the variable's value, or {@code null} if undefined.
         *
         * @see com.ibm.icu.text.SymbolTable#lookup(java.lang.String)
         */
        @Override
        public char[] lookup(String s) {
            itemsLookedUp.add('$' + s);
            return contents.get(s);
        }

        /**
         * Always returns {@code null}.
         *
         * @see com.ibm.icu.text.SymbolTable#lookupMatcher(int)
         */
        @Override
        public UnicodeMatcher lookupMatcher(int ch) {
            // TODO Auto-generated method stub
            return null;
        }

        /**
         * Consumes the longest run of Unicode identifier-part code points starting at
         * {@code pos}, advances {@code pos} past it, and returns it (possibly empty).
         *
         * @see com.ibm.icu.text.SymbolTable#parseReference(java.lang.String,
         *     java.text.ParsePosition, int)
         */
        @Override
        public String parseReference(String text, ParsePosition pos, int limit) {
            int start = pos.getIndex();
            int i = start;
            while (i < limit) {
                int cp = UTF16.charAt(text, i);
                if (!UCharacter.isUnicodeIdentifierPart(cp)) {
                    break;
                }
                i += UTF16.getCharCount(cp);
            }
            pos.setIndex(i);
            return text.substring(start, i);
        }
    }
}