/*
 * Copyright (c) 1994, 2020, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation. Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

package java.util;

import java.lang.*;

/**
 * The string tokenizer class allows an application to break a
 * string into tokens. The tokenization method is much simpler than
 * the one used by the {@code StreamTokenizer} class. The
 * {@code StringTokenizer} methods do not distinguish among
 * identifiers, numbers, and quoted strings, nor do they recognize
 * and skip comments.
 * <p>
 * The set of delimiters (the characters that separate tokens) may
 * be specified either at creation time or on a per-token basis.
 * <p>
 * An instance of {@code StringTokenizer} behaves in one of two
 * ways, depending on whether it was created with the
 * {@code returnDelims} flag having the value {@code true}
 * or {@code false}:
 * <ul>
 * <li>If the flag is {@code false}, delimiter characters serve to
 *     separate tokens. A token is a maximal sequence of consecutive
 *     characters that are not delimiters.
 * <li>If the flag is {@code true}, delimiter characters are themselves
 *     considered to be tokens. A token is thus either one delimiter
 *     character, or a maximal sequence of consecutive characters that are
 *     not delimiters.
 * </ul><p>
 * A {@code StringTokenizer} object internally maintains a current
 * position within the string to be tokenized. Some operations advance this
 * current position past the characters processed.<p>
 * A token is returned by taking a substring of the string that was used to
 * create the {@code StringTokenizer} object.
 * <p>
 * The following is one example of the use of the tokenizer. The code:
 * <blockquote><pre>
 *     StringTokenizer st = new StringTokenizer("this is a test");
 *     while (st.hasMoreTokens()) {
 *         System.out.println(st.nextToken());
 *     }
 * </pre></blockquote>
 * <p>
 * prints the following output:
 * <blockquote><pre>
 *     this
 *     is
 *     a
 *     test
 * </pre></blockquote>
 *
 * <p>
 * {@code StringTokenizer} is a legacy class that is retained for
 * compatibility reasons although its use is discouraged in new code. It is
 * recommended that anyone seeking this functionality use the {@code split}
 * method of {@code String} or the java.util.regex package instead.
 * <p>
 * The following example illustrates how the {@code String.split}
 * method can be used to break up a string into its basic tokens:
 * <blockquote><pre>
 *     String[] result = "this is a test".split("\\s");
 *     for (int x=0; x&lt;result.length; x++)
 *         System.out.println(result[x]);
 * </pre></blockquote>
 * <p>
 * prints the following output:
 * <blockquote><pre>
 *     this
 *     is
 *     a
 *     test
 * </pre></blockquote>
 *
 * @see     java.io.StreamTokenizer
 * @since   1.0
 */
public class StringTokenizer implements Enumeration<Object> {
    /** Index in {@code str} from which the next scan starts. */
    private int currentPosition;

    /**
     * Position computed by {@code hasMoreTokens()} for reuse by the next
     * {@code nextToken()} call, or -1 when no such position is cached.
     */
    private int newPosition;

    /** Length of {@code str}; scanning stops at this index. */
    private final int maxPosition;

    /** The string being tokenized. */
    private final String str;

    /** The current delimiter set; may be {@code null}. */
    private String delimiters;

    /** Whether delimiters are themselves returned as tokens. */
    private final boolean retDelims;

    /** Set when {@code nextToken(String)} installs a new delimiter set. */
    private boolean delimsChanged;

    /**
     * maxDelimCodePoint stores the value of the delimiter character with the
     * highest value. It is used to optimize the detection of delimiter
     * characters.
     *
     * It is unlikely to provide any optimization benefit in the
     * hasSurrogates case because most string characters will be
     * smaller than the limit, but we keep it so that the two code
     * paths remain similar.
     */
    private int maxDelimCodePoint;

    /**
     * If delimiters include any surrogates (including surrogate
     * pairs), hasSurrogates is true and the tokenizer uses the
     * different code path. This is because String.indexOf(int)
     * doesn't handle unpaired surrogates as a single character.
     */
    private boolean hasSurrogates = false;

    /**
     * When hasSurrogates is true, delimiters are converted to code
     * points and isDelimiter(int) is used to determine if the given
     * codepoint is a delimiter.
     */
    private int[] delimiterCodePoints;

    /**
     * Set maxDelimCodePoint to the highest char in the delimiter set, and
     * recompute hasSurrogates and delimiterCodePoints for that set.
     */
    private void setMaxDelimCodePoint() {
        // Recompute from scratch: this method also runs when nextToken(String)
        // replaces the delimiter set, and state derived from the previous set
        // must not leak into the new one.
        hasSurrogates = false;
        delimiterCodePoints = null;

        if (delimiters == null) {
            maxDelimCodePoint = 0;
            return;
        }

        int m = 0;
        int c;
        int count = 0;
        for (int i = 0; i < delimiters.length(); i += Character.charCount(c)) {
            c = delimiters.charAt(i);
            if (c >= Character.MIN_HIGH_SURROGATE && c <= Character.MAX_LOW_SURROGATE) {
                // Any surrogate (paired or not) forces the code point path.
                c = delimiters.codePointAt(i);
                hasSurrogates = true;
            }
            if (m < c) {
                m = c;
            }
            count++;
        }
        maxDelimCodePoint = m;

        if (hasSurrogates) {
            // count is the number of code points seen by the loop above.
            delimiterCodePoints = new int[count];
            for (int i = 0, j = 0; i < count; i++, j += Character.charCount(c)) {
                c = delimiters.codePointAt(j);
                delimiterCodePoints[i] = c;
            }
        }
    }

    /**
     * Constructs a string tokenizer for the specified string. All
     * characters in the {@code delim} argument are the delimiters
     * for separating tokens.
     * <p>
     * If the {@code returnDelims} flag is {@code true}, then
     * the delimiter characters are also returned as tokens. Each
     * delimiter is returned as a string of length one. If the flag is
     * {@code false}, the delimiter characters are skipped and only
     * serve as separators between tokens.
     * <p>
     * Note that if {@code delim} is {@code null}, this constructor does
     * not throw an exception. However, trying to invoke other methods on the
     * resulting {@code StringTokenizer} may result in a
     * {@code NullPointerException}.
     *
     * @param   str            a string to be parsed.
     * @param   delim          the delimiters.
     * @param   returnDelims   flag indicating whether to return the delimiters
     *                         as tokens.
     * @throws    NullPointerException if str is {@code null}
     */
    public StringTokenizer(String str, String delim, boolean returnDelims) {
        currentPosition = 0;
        newPosition = -1;
        delimsChanged = false;
        this.str = str;
        maxPosition = str.length();
        delimiters = delim;
        retDelims = returnDelims;
        setMaxDelimCodePoint();
    }

    /**
     * Constructs a string tokenizer for the specified string. The
     * characters in the {@code delim} argument are the delimiters
     * for separating tokens. Delimiter characters themselves will not
     * be treated as tokens.
     * <p>
     * Note that if {@code delim} is {@code null}, this constructor does
     * not throw an exception. However, trying to invoke other methods on the
     * resulting {@code StringTokenizer} may result in a
     * {@code NullPointerException}.
     *
     * @param   str     a string to be parsed.
     * @param   delim   the delimiters.
     * @throws    NullPointerException if str is {@code null}
     */
    public StringTokenizer(String str, String delim) {
        this(str, delim, false);
    }

    /**
     * Constructs a string tokenizer for the specified string. The
     * tokenizer uses the default delimiter set, which is
     * <code>"&nbsp;&#92;t&#92;n&#92;r&#92;f"</code>: the space character,
     * the tab character, the newline character, the carriage-return character,
     * and the form-feed character. Delimiter characters themselves will
     * not be treated as tokens.
     *
     * @param   str   a string to be parsed.
     * @throws    NullPointerException if str is {@code null}
     */
    public StringTokenizer(String str) {
        this(str, " \t\n\r\f", false);
    }

    /**
     * Skips delimiters starting from the specified position. If retDelims
     * is false, returns the index of the first non-delimiter character at or
     * after startPos. If retDelims is true, startPos is returned.
     *
     * @throws NullPointerException if the current delimiter set is
     *         {@code null}
     */
    private int skipDelimiters(int startPos) {
        if (delimiters == null) {
            throw new NullPointerException();
        }

        int position = startPos;
        while (!retDelims && position < maxPosition) {
            if (!hasSurrogates) {
                char c = str.charAt(position);
                if ((c > maxDelimCodePoint) || (delimiters.indexOf(c) < 0)) {
                    break;
                }
                position++;
            } else {
                int c = str.codePointAt(position);
                if ((c > maxDelimCodePoint) || !isDelimiter(c)) {
                    break;
                }
                position += Character.charCount(c);
            }
        }
        return position;
    }

    /**
     * Skips ahead from startPos and returns the index of the next delimiter
     * character encountered, or maxPosition if no such delimiter is found.
     * When retDelims is true and the character at startPos is itself a
     * delimiter, the index just past that single delimiter is returned so
     * that the delimiter forms a token of its own.
     */
    private int scanToken(int startPos) {
        int position = startPos;
        while (position < maxPosition) {
            if (!hasSurrogates) {
                char c = str.charAt(position);
                if ((c <= maxDelimCodePoint) && (delimiters.indexOf(c) >= 0)) {
                    break;
                }
                position++;
            } else {
                int c = str.codePointAt(position);
                if ((c <= maxDelimCodePoint) && isDelimiter(c)) {
                    break;
                }
                position += Character.charCount(c);
            }
        }
        if (retDelims && (startPos == position)) {
            // Nothing was consumed, so startPos may hold a delimiter that has
            // to be returned as a one-delimiter token.
            if (!hasSurrogates) {
                char c = str.charAt(position);
                if ((c <= maxDelimCodePoint) && (delimiters.indexOf(c) >= 0)) {
                    position++;
                }
            } else {
                int c = str.codePointAt(position);
                if ((c <= maxDelimCodePoint) && isDelimiter(c)) {
                    position += Character.charCount(c);
                }
            }
        }
        return position;
    }

    /**
     * Returns whether the given code point is one of the entries of
     * delimiterCodePoints. Only used on the hasSurrogates code path.
     */
    private boolean isDelimiter(int codePoint) {
        for (int delimiterCodePoint : delimiterCodePoints) {
            if (delimiterCodePoint == codePoint) {
                return true;
            }
        }
        return false;
    }

    /**
     * Tests if there are more tokens available from this tokenizer's string.
     * If this method returns {@code true}, then a subsequent call to
     * {@code nextToken} with no argument will successfully return a token.
     *
     * @return  {@code true} if and only if there is at least one token
     *          in the string after the current position; {@code false}
     *          otherwise.
     */
    public boolean hasMoreTokens() {
        /*
         * Temporarily store this position and use it in the following
         * nextToken() method only if the delimiters haven't been changed in
         * that nextToken() invocation.
         */
        newPosition = skipDelimiters(currentPosition);
        return (newPosition < maxPosition);
    }

    /**
     * Returns the next token from this string tokenizer.
     *
     * @return     the next token from this string tokenizer.
     * @throws     NoSuchElementException  if there are no more tokens in this
     *               tokenizer's string.
     */
    public String nextToken() {
        /*
         * If the next position was already computed in hasMoreTokens() and
         * the delimiters have NOT changed between that computation and this
         * invocation, then use the computed value; otherwise skip delimiters
         * again using the current delimiter set.
         */
        currentPosition = (newPosition >= 0 && !delimsChanged) ?
            newPosition : skipDelimiters(currentPosition);

        /* Reset these anyway */
        delimsChanged = false;
        newPosition = -1;

        if (currentPosition >= maxPosition) {
            throw new NoSuchElementException();
        }
        int start = currentPosition;
        currentPosition = scanToken(currentPosition);
        return str.substring(start, currentPosition);
    }

    /**
     * Returns the next token in this string tokenizer's string. First,
     * the set of characters considered to be delimiters by this
     * {@code StringTokenizer} object is changed to be the characters in
     * the string {@code delim}. Then the next token in the string
     * after the current position is returned. The current position is
     * advanced beyond the recognized token. The new delimiter set
     * remains the default after this call.
     *
     * @param      delim   the new delimiters.
     * @return     the next token, after switching to the new delimiter set.
     * @throws     NoSuchElementException  if there are no more tokens in this
     *               tokenizer's string.
     * @throws    NullPointerException if delim is {@code null}
     */
    public String nextToken(String delim) {
        delimiters = delim;

        /* delimiter string specified, so set the appropriate flag. */
        delimsChanged = true;

        setMaxDelimCodePoint();
        return nextToken();
    }

    /**
     * Returns the same value as the {@code hasMoreTokens}
     * method. It exists so that this class can implement the
     * {@code Enumeration} interface.
     *
     * @return  {@code true} if there are more tokens;
     *          {@code false} otherwise.
     * @see     java.util.Enumeration
     * @see     java.util.StringTokenizer#hasMoreTokens()
     */
    @Override
    public boolean hasMoreElements() {
        return hasMoreTokens();
    }

    /**
     * Returns the same value as the {@code nextToken} method,
     * except that its declared return value is {@code Object} rather than
     * {@code String}. It exists so that this class can implement the
     * {@code Enumeration} interface.
     *
     * @return     the next token in the string.
     * @throws     NoSuchElementException  if there are no more tokens in this
     *               tokenizer's string.
     * @see        java.util.Enumeration
     * @see        java.util.StringTokenizer#nextToken()
     */
    @Override
    public Object nextElement() {
        return nextToken();
    }

    /**
     * Calculates the number of times that this tokenizer's
     * {@code nextToken} method can be called before it generates an
     * exception. The current position is not advanced.
     *
     * @return  the number of tokens remaining in the string using the current
     *          delimiter set.
     * @see     java.util.StringTokenizer#nextToken()
     */
    public int countTokens() {
        int count = 0;
        int currpos = currentPosition;
        while (currpos < maxPosition) {
            currpos = skipDelimiters(currpos);
            if (currpos >= maxPosition) {
                break;
            }
            currpos = scanToken(currpos);
            count++;
        }
        return count;
    }
}