// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
**********************************************************************
* Copyright (c) 2003-2011, International Business Machines
* Corporation and others.  All Rights Reserved.
**********************************************************************
* Author: Alan Liu
* Created: September 23 2003
* Since: ICU 2.8
**********************************************************************
*/
package com.ibm.icu.impl;

import java.text.ParsePosition;

import com.ibm.icu.text.SymbolTable;
import com.ibm.icu.text.UTF16;

/**
 * An iterator that returns 32-bit code points.  This class is deliberately
 * <em>not</em> related to any of the JDK or ICU4J character iterator classes
 * in order to minimize complexity.
 *
 * <p>The iterator reads from one of two sources: the main {@code text}
 * (tracked by the caller-supplied {@link ParsePosition}), or, while a variable
 * is being expanded, the variable's value held in an internal buffer.
 *
 * @author Alan Liu
 * @since ICU 2.8
 */
public class RuleCharacterIterator {

    // TODO: Ideas for later.  (Do not implement if not needed, lest the
    // code coverage numbers go down due to unused methods.)
    // 1. Add a copy constructor, equals() method, clone() method.
    // 2. Rather than return DONE, throw an exception if the end
    // is reached -- this is an alternate usage model, probably not useful.
    // 3. Return isEscaped from next().  If this happens,
    // don't keep an isEscaped member variable.

    /**
     * Text being iterated.
     */
    private final String text;

    /**
     * Position of iterator.  This is the caller's object; it is updated in
     * place as the main text is consumed.
     */
    private final ParsePosition pos;

    /**
     * Symbol table used to parse and dereference variables.  May be null.
     */
    private final SymbolTable sym;

    /**
     * Current variable expansion, or null if none.  Never left as an empty
     * or fully-consumed string: it is reset to null as soon as it is exhausted.
     */
    private String buf;

    /**
     * Position within buf.  Meaningless if buf == null.
     */
    private int bufPos;

    /**
     * Flag indicating whether the last character was parsed from an escape.
     */
    private boolean isEscaped;

    /**
     * Value returned when there are no more characters to iterate.
     */
    public static final int DONE = -1;

    /**
     * Bitmask option to enable parsing of variable names.  If
     * {@code (options & PARSE_VARIABLES) != 0}, then an embedded variable will
     * be expanded to its value.  Variables are parsed using the SymbolTable API.
     */
    public static final int PARSE_VARIABLES = 1;

    /**
     * Bitmask option to enable parsing of escape sequences.  If
     * {@code (options & PARSE_ESCAPES) != 0}, then an embedded escape sequence
     * will be expanded to its value.  Escapes are parsed using
     * Utility.unescapeAndLengthAt().
     */
    public static final int PARSE_ESCAPES = 2;

    /**
     * Bitmask option to enable skipping of whitespace.  If
     * {@code (options & SKIP_WHITESPACE) != 0}, then Unicode Pattern_White_Space
     * characters will be silently skipped, as if they were not present in the
     * input.
     */
    public static final int SKIP_WHITESPACE = 4;

    /**
     * Opaque snapshot of the iterator state, for use with
     * {@link #getPos(Position)} and {@link #setPos(Position)}.
     */
    public static final class Position {
        private String buf;
        private int bufPos;
        private int posIndex;
    }

    /**
     * Constructs an iterator over the given text, starting at the given
     * position.
     * @param text the text to be iterated
     * @param sym the symbol table, or null if there is none.  If sym is null,
     * then variables will not be dereferenced, even if the PARSE_VARIABLES
     * option is set.
     * @param pos upon input, the index of the next character to return.  If a
     * variable has been dereferenced, then pos will <em>not</em> increment as
     * characters of the variable value are iterated.
     * @throws IllegalArgumentException if text is null or pos lies beyond the
     * end of text
     */
    public RuleCharacterIterator(String text, SymbolTable sym,
                                 ParsePosition pos) {
        if (text == null || pos.getIndex() > text.length()) {
            throw new IllegalArgumentException();
        }
        this.text = text;
        this.sym = sym;
        this.pos = pos;
        buf = null;
    }

    /**
     * Returns true if this iterator has no more characters to return.
     */
    public boolean atEnd() {
        return buf == null && pos.getIndex() == text.length();
    }

    /**
     * Returns the next character using the given options, or DONE if there
     * are no more characters, and advance the position to the next
     * character.
     * @param options one or more of the following options, bitwise-OR-ed
     * together: PARSE_VARIABLES, PARSE_ESCAPES, SKIP_WHITESPACE.
     * @return the current 32-bit code point, or DONE
     * @throws IllegalArgumentException if a referenced variable is undefined
     * or an escape sequence is malformed
     */
    public int next(int options) {
        int c;
        isEscaped = false;

        for (;;) {
            c = _current();
            _advance(UTF16.getCharCount(c));

            if (c == SymbolTable.SYMBOL_REF && buf == null &&
                (options & PARSE_VARIABLES) != 0 && sym != null) {
                String name = sym.parseReference(text, pos, text.length());
                // If name == null there was an isolated SYMBOL_REF;
                // return it.  Caller must be prepared for this.
                if (name == null) {
                    break;
                }
                bufPos = 0;
                char[] chars = sym.lookup(name);
                if (chars == null) {
                    buf = null;
                    throw new IllegalArgumentException(
                                "Undefined variable: " + name);
                }
                // Handle empty variable value: there is nothing to iterate, so
                // stay on the main text.  Installing an empty buffer here would
                // make the next _current() read past its end.
                buf = (chars.length == 0) ? null : new String(chars);
                continue;
            }

            if ((options & SKIP_WHITESPACE) != 0 &&
                PatternProps.isWhiteSpace(c)) {
                continue;
            }

            if (c == '\\' && (options & PARSE_ESCAPES) != 0) {
                // The backslash has already been consumed; decode what follows
                // it and then skip over the code units the escape occupied.
                int cpAndLength = Utility.unescapeAndLengthAt(
                        getCurrentBuffer(), getCurrentBufferPos());
                if (cpAndLength < 0) {
                    throw new IllegalArgumentException("Invalid escape");
                }
                c = Utility.cpFromCodePointAndLength(cpAndLength);
                jumpahead(Utility.lengthFromCodePointAndLength(cpAndLength));
                isEscaped = true;
            }

            break;
        }

        return c;
    }

    /**
     * Returns true if the last character returned by next() was
     * escaped.  This will only be the case if the option passed in to
     * next() included PARSE_ESCAPES and the next character was an
     * escape sequence.
     */
    public boolean isEscaped() {
        return isEscaped;
    }

    /**
     * Returns true if this iterator is currently within a variable expansion.
     */
    public boolean inVariable() {
        return buf != null;
    }

    /**
     * Returns an object which, when later passed to setPos(), will
     * restore this iterator's position.  Usage idiom:
     *
     * <pre>
     * RuleCharacterIterator iterator = ...;
     * Position pos = iterator.getPos(null); // allocate position object
     * for (;;) {
     *   pos = iterator.getPos(pos); // reuse position object
     *   int c = iterator.next(...);
     *   ...
     * }
     * iterator.setPos(pos);
     * </pre>
     *
     * @param p a position object previously returned by getPos(),
     * or null.  If not null, it will be updated and returned.  If
     * null, a new position object will be allocated and returned.
     * @return a position object which may be passed to setPos(),
     * either `p,' or if `p' == null, a newly-allocated object
     */
    public Position getPos(Position p) {
        if (p == null) {
            p = new Position();
        }
        p.buf = buf;
        p.bufPos = bufPos;
        p.posIndex = pos.getIndex();
        return p;
    }

    /**
     * Restores this iterator to the position it had when getPos()
     * returned the given object.
     * @param p a position object previously returned by getPos()
     */
    public void setPos(Position p) {
        buf = p.buf;
        pos.setIndex(p.posIndex);
        bufPos = p.bufPos;
    }

    /**
     * Skips ahead past any ignored characters, as indicated by the given
     * options.  This is useful in conjunction with the lookahead() method.
     *
     * Currently, this only has an effect for SKIP_WHITESPACE.
     * @param options one or more of the following options, bitwise-OR-ed
     * together: PARSE_VARIABLES, PARSE_ESCAPES, SKIP_WHITESPACE.
     */
    public void skipIgnored(int options) {
        if ((options & SKIP_WHITESPACE) != 0) {
            for (;;) {
                int a = _current();
                if (!PatternProps.isWhiteSpace(a)) {
                    break;
                }
                _advance(UTF16.getCharCount(a));
            }
        }
    }

    /**
     * Returns the string this iterator is currently reading from, without any
     * option processing: the variable value if the iterator is within a
     * variable expansion, otherwise the main text.  The characters still to be
     * returned start at {@link #getCurrentBufferPos()}; within a variable
     * expansion they only extend to the end of that expansion.
     *
     * <p>This method, together with getCurrentBufferPos() (which replace the
     * former lookahead()), is provided so that iterators may interoperate with
     * string-based APIs.  The typical sequence of calls is to call
     * skipIgnored(), then call these methods, then parse that substring, then
     * call jumpahead() to resynchronize the iterator.
     * @return the string from which future calls to next() will read
     */
    public String getCurrentBuffer() {
        if (buf != null) {
            return buf;
        } else {
            return text;
        }
    }

    /**
     * Returns the index, within the string returned by
     * {@link #getCurrentBuffer()}, of the next code unit to be read.
     * @return the current offset into the variable value if within a variable
     * expansion, otherwise the current index into the main text
     */
    public int getCurrentBufferPos() {
        if (buf != null) {
            return bufPos;
        } else {
            return pos.getIndex();
        }
    }

    /**
     * Advances the position by the given number of 16-bit code units.
     * This is useful in conjunction with getCurrentBuffer()+getCurrentBufferPos()
     * (formerly lookahead()).
     *
     * <p>The request is validated before any state is changed, so a rejected
     * call leaves the iterator (and the caller's ParsePosition) where it was.
     * @param count the number of 16-bit code units to jump over
     * @throws IllegalArgumentException if count is negative or would move past
     * the end of the current buffer
     */
    public void jumpahead(int count) {
        if (count < 0) {
            throw new IllegalArgumentException();
        }
        if (buf != null) {
            // Compare against the remaining length rather than adding first,
            // so a huge count cannot overflow past the check.
            if (count > buf.length() - bufPos) {
                throw new IllegalArgumentException();
            }
            bufPos += count;
            if (bufPos == buf.length()) {
                buf = null;
            }
        } else {
            int i = pos.getIndex();
            if (count > text.length() - i) {
                throw new IllegalArgumentException();
            }
            pos.setIndex(i + count);
        }
    }

    /**
     * Returns a string representation of this object, consisting of the
     * characters being iterated, with a '|' marking the current position.
     * Position within an expanded variable is <em>not</em> indicated.
     * @return a string representation of this object
     */
    @Override
    public String toString() {
        int b = pos.getIndex();
        return text.substring(0, b) + '|' + text.substring(b);
    }

    /**
     * Returns the current 32-bit code point without parsing escapes, parsing
     * variables, or skipping whitespace.
     * @return the current 32-bit code point, or DONE if the main text is
     * exhausted and no variable expansion is active
     */
    private int _current() {
        if (buf != null) {
            return UTF16.charAt(buf, bufPos);
        } else {
            int i = pos.getIndex();
            return (i < text.length()) ? UTF16.charAt(text, i) : DONE;
        }
    }

    /**
     * Advances the position by the given amount.  Leaving a variable
     * expansion drops the buffer; advancing in the main text is clamped to
     * the end of the text.
     * @param count the number of 16-bit code units to advance past
     */
    private void _advance(int count) {
        if (buf != null) {
            bufPos += count;
            if (bufPos == buf.length()) {
                buf = null;
            }
        } else {
            pos.setIndex(pos.getIndex() + count);
            if (pos.getIndex() > text.length()) {
                pos.setIndex(text.length());
            }
        }
    }
}