/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.lang3.text;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.ListIterator;
import java.util.NoSuchElementException;
import java.util.StringTokenizer;

import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.lang3.StringUtils;

/**
 * Tokenizes a string based on delimiters (separators),
 * supporting quoting and ignored character concepts.
 * <p>
 * This class can split a String into many smaller strings. It aims
 * to do a similar job to {@link java.util.StringTokenizer StringTokenizer},
 * however it offers much more control and flexibility including implementing
 * the {@link ListIterator} interface. By default, it is set up
 * like {@link StringTokenizer}.
 * </p>
 * <p>
 * The input String is split into a number of <i>tokens</i>.
 * Each token is separated from the next String by a <i>delimiter</i>.
 * One or more delimiter characters must be specified.
 * </p>
 * <p>
 * Each token may be surrounded by quotes.
 * The <i>quote</i> matcher specifies the quote character(s).
 * A quote may be escaped within a quoted section by duplicating itself.
 * </p>
 * <p>
 * Between each token and the delimiter are potentially characters that need trimming.
 * The <i>trimmer</i> matcher specifies these characters.
 * One usage might be to trim whitespace characters.
 * </p>
 * <p>
 * At any point outside the quotes there might potentially be invalid characters.
 * The <i>ignored</i> matcher specifies these characters to be removed.
 * One usage might be to remove new line characters.
 * </p>
 * <p>
 * Empty tokens may be removed or returned as null.
 * </p>
 * <pre>
 * "a,b,c"         - Three tokens "a","b","c"   (comma delimiter)
 * " a, b , c "    - Three tokens "a","b","c"   (default CSV processing trims whitespace)
 * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched)
 * </pre>
 *
 * <table>
 *  <caption>StrTokenizer properties and options</caption>
 *  <tr>
 *   <th>Property</th><th>Type</th><th>Default</th>
 *  </tr>
 *  <tr>
 *   <td>delim</td><td>CharSetMatcher</td><td>{ \t\n\r\f}</td>
 *  </tr>
 *  <tr>
 *   <td>quote</td><td>NoneMatcher</td><td>{}</td>
 *  </tr>
 *  <tr>
 *   <td>ignore</td><td>NoneMatcher</td><td>{}</td>
 *  </tr>
 *  <tr>
 *   <td>emptyTokenAsNull</td><td>boolean</td><td>false</td>
 *  </tr>
 *  <tr>
 *   <td>ignoreEmptyTokens</td><td>boolean</td><td>true</td>
 *  </tr>
 * </table>
 *
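 * <p>
 * As a minimal usage sketch (illustrative only; it relies on the default CSV
 * settings described above), the tokenizer can be driven like an iterator:
 * </p>
 * <pre>
 * StrTokenizer tok = StrTokenizer.getCSVInstance("a, \"b,c\", d");
 * while (tok.hasNext()) {
 *     String token = tok.next();   // yields "a", then "b,c", then "d"
 * }
 * </pre>
 *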
 * @since 2.2
 * @deprecated As of 3.6, use Apache Commons Text
 * <a href="https://commons.apache.org/proper/commons-text/javadocs/api-release/org/apache/commons/text/StringTokenizer.html">
 * StringTokenizer</a> instead
 */
@Deprecated
public class StrTokenizer implements ListIterator<String>, Cloneable {

    private static final StrTokenizer CSV_TOKENIZER_PROTOTYPE;
    private static final StrTokenizer TSV_TOKENIZER_PROTOTYPE;
    static {
        CSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
        CSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.commaMatcher());
        CSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
        CSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
        CSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
        CSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
        CSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);

        TSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
        TSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.tabMatcher());
        TSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
        TSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
        TSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
        TSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
        TSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
    }

    /** The text to work on. */
    private char[] chars;
    /** The parsed tokens */
    private String[] tokens;
    /** The current iteration position */
    private int tokenPos;

    /** The delimiter matcher */
    private StrMatcher delimMatcher = StrMatcher.splitMatcher();
    /** The quote matcher */
    private StrMatcher quoteMatcher = StrMatcher.noneMatcher();
    /** The ignored matcher */
    private StrMatcher ignoredMatcher = StrMatcher.noneMatcher();
    /** The trimmer matcher */
    private StrMatcher trimmerMatcher = StrMatcher.noneMatcher();

    /** Whether to return empty tokens as null */
    private boolean emptyAsNull;
    /** Whether to ignore empty tokens */
    private boolean ignoreEmptyTokens = true;


    /**
     * Returns a clone of {@code CSV_TOKENIZER_PROTOTYPE}.
     *
     * @return a clone of {@code CSV_TOKENIZER_PROTOTYPE}.
     */
    private static StrTokenizer getCSVClone() {
        return (StrTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings.
     * The default for CSV processing will be to trim whitespace from both ends
     * (which can be overridden with the setTrimmer method).
     * <p>
     * You must call a "reset" method to set the string which you want to parse.
     * </p>
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StrTokenizer getCSVInstance() {
        return getCSVClone();
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings,
     * initializing it with the given input.  The default for CSV processing
     * will be to trim whitespace from both ends (which can be overridden with
     * the setTrimmer method).
     *
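     * <p>
     * For example (a sketch of typical output; quoting and trimming follow the
     * CSV defaults described above):
     * </p>
     * <pre>
     * StrTokenizer.getCSVInstance(" a, \"b, c\" ,d ").getTokenList();
     * // returns ["a", "b, c", "d"]
     * </pre>
     *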
     * @param input  the text to parse
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StrTokenizer getCSVInstance(final String input) {
        final StrTokenizer tok = getCSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings,
     * initializing it with the given input.  The default for CSV processing
     * will be to trim whitespace from both ends (which can be overridden with
     * the setTrimmer method).
     *
     * @param input  the text to parse
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StrTokenizer getCSVInstance(final char[] input) {
        final StrTokenizer tok = getCSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Returns a clone of {@code TSV_TOKENIZER_PROTOTYPE}.
     *
     * @return a clone of {@code TSV_TOKENIZER_PROTOTYPE}.
     */
    private static StrTokenizer getTSVClone() {
        return (StrTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
    }


    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings.
     * The default for TSV processing will be to trim whitespace from both ends
     * (which can be overridden with the setTrimmer method).
     * <p>
     * You must call a "reset" method to set the string which you want to parse.
     * </p>
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StrTokenizer getTSVInstance() {
        return getTSVClone();
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings.
     * The default for TSV processing will be to trim whitespace from both ends
     * (which can be overridden with the setTrimmer method).
     * @param input  the string to parse
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StrTokenizer getTSVInstance(final String input) {
        final StrTokenizer tok = getTSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings.
     * The default for TSV processing will be to trim whitespace from both ends
     * (which can be overridden with the setTrimmer method).
     * @param input  the string to parse
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StrTokenizer getTSVInstance(final char[] input) {
        final StrTokenizer tok = getTSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Constructs a tokenizer splitting on space, tab, newline and formfeed
     * as per StringTokenizer, but with no text to tokenize.
     * <p>
     * This constructor is normally used with {@link #reset(String)}.
     * </p>
     */
    public StrTokenizer() {
        this.chars = null;
    }

    /**
     * Constructs a tokenizer splitting on space, tab, newline and formfeed
     * as per StringTokenizer.
     *
     * @param input  the string which is to be parsed
     */
    public StrTokenizer(final String input) {
        if (input != null) {
            chars = input.toCharArray();
        } else {
            chars = null;
        }
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character.
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter character
     */
    public StrTokenizer(final String input, final char delim) {
        this(input);
        setDelimiterChar(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter string.
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter string
     */
    public StrTokenizer(final String input, final String delim) {
        this(input);
        setDelimiterString(delim);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher.
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter matcher
     */
    public StrTokenizer(final String input, final StrMatcher delim) {
        this(input);
        setDelimiterMatcher(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character
     * and handling quotes using the specified quote character.
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter character
     * @param quote  the field quoted string character
     */
    public StrTokenizer(final String input, final char delim, final char quote) {
        this(input, delim);
        setQuoteChar(quote);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher
     * and handling quotes using the specified quote matcher.
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter matcher
     * @param quote  the field quoted string matcher
     */
    public StrTokenizer(final String input, final StrMatcher delim, final StrMatcher quote) {
        this(input, delim);
        setQuoteMatcher(quote);
    }

    /**
     * Constructs a tokenizer splitting on space, tab, newline and formfeed
     * as per StringTokenizer.
     *
     * @param input  the string which is to be parsed, cloned internally
     */
    public StrTokenizer(final char[] input) {
        this.chars = ArrayUtils.clone(input);
    }

    /**
     * Constructs a tokenizer splitting on the specified character.
     *
     * @param input  the string which is to be parsed, cloned internally
     * @param delim  the field delimiter character
     */
    public StrTokenizer(final char[] input, final char delim) {
        this(input);
        setDelimiterChar(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified string.
     *
     * @param input  the string which is to be parsed, cloned internally
     * @param delim  the field delimiter string
     */
    public StrTokenizer(final char[] input, final String delim) {
        this(input);
        setDelimiterString(delim);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher.
     *
     * @param input  the string which is to be parsed, cloned internally
     * @param delim  the field delimiter matcher
     */
    public StrTokenizer(final char[] input, final StrMatcher delim) {
        this(input);
        setDelimiterMatcher(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character
     * and handling quotes using the specified quote character.
     *
     * @param input  the string which is to be parsed, cloned internally
     * @param delim  the field delimiter character
     * @param quote  the field quoted string character
     */
    public StrTokenizer(final char[] input, final char delim, final char quote) {
        this(input, delim);
        setQuoteChar(quote);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher
     * and handling quotes using the specified quote matcher.
     *
     * @param input  the string which is to be parsed, cloned internally
     * @param delim  the field delimiter matcher
     * @param quote  the field quoted string matcher
     */
    public StrTokenizer(final char[] input, final StrMatcher delim, final StrMatcher quote) {
        this(input, delim);
        setQuoteMatcher(quote);
    }

    // API
    /**
     * Gets the number of tokens found in the String.
     *
     * @return the number of matched tokens
     */
    public int size() {
        checkTokenized();
        return tokens.length;
    }

    /**
     * Gets the next token from the String.
     * Equivalent to {@link #next()} except it returns null rather than
     * throwing {@link NoSuchElementException} when no tokens remain.
     *
     * @return the next sequential token, or null when no more tokens are found
     */
    public String nextToken() {
        if (hasNext()) {
            return tokens[tokenPos++];
        }
        return null;
    }

    /**
     * Gets the previous token from the String.
     *
     * @return the previous sequential token, or null when no more tokens are found
     */
    public String previousToken() {
        if (hasPrevious()) {
            return tokens[--tokenPos];
        }
        return null;
    }

    /**
     * Gets a copy of the full token list as an independent modifiable array.
     *
     * @return the tokens as a String array
     */
    public String[] getTokenArray() {
        checkTokenized();
        return tokens.clone();
    }

    /**
     * Gets a copy of the full token list as an independent modifiable list.
     *
     * @return the tokens as a String list
     */
    public List<String> getTokenList() {
        checkTokenized();
        final List<String> list = new ArrayList<>(tokens.length);
        list.addAll(Arrays.asList(tokens));
        return list;
    }

    /**
     * Resets this tokenizer, forgetting all parsing and iteration already completed.
     * <p>
     * This method allows the same tokenizer to be reused for the same String.
     * </p>
     *
     * @return this, to enable chaining
     */
    public StrTokenizer reset() {
        tokenPos = 0;
        tokens = null;
        return this;
    }

    /**
     * Resets this tokenizer, giving it a new input string to parse.
     * In this manner you can re-use a tokenizer with the same settings
     * on multiple input lines.
     *
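     * <p>
     * A sketch of reusing one configured tokenizer across several input lines
     * ({@code lines} and {@code process} are placeholders for the caller's own
     * data and handling):
     * </p>
     * <pre>
     * StrTokenizer tok = new StrTokenizer("", ';');
     * for (String line : lines) {
     *     process(tok.reset(line).getTokenArray());
     * }
     * </pre>
     *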
     * @param input  the new string to tokenize, null sets no text to parse
     * @return this, to enable chaining
     */
    public StrTokenizer reset(final String input) {
        reset();
        if (input != null) {
            this.chars = input.toCharArray();
        } else {
            this.chars = null;
        }
        return this;
    }

    /**
     * Resets this tokenizer, giving it a new input character array to parse.
     * In this manner you can re-use a tokenizer with the same settings
     * on multiple input lines.
     *
     * @param input  the new character array to tokenize, cloned internally, null sets no text to parse
     * @return this, to enable chaining
     */
    public StrTokenizer reset(final char[] input) {
        reset();
        this.chars = ArrayUtils.clone(input);
        return this;
    }

    /**
     * Checks whether there are any more tokens.
     *
     * @return true if there are more tokens
     */
    @Override
    public boolean hasNext() {
        checkTokenized();
        return tokenPos < tokens.length;
    }

    /**
     * Gets the next token.
     *
     * @return the next String token
     * @throws NoSuchElementException if there are no more elements
     */
    @Override
    public String next() {
        if (hasNext()) {
            return tokens[tokenPos++];
        }
        throw new NoSuchElementException();
    }

    /**
     * Gets the index of the next token to return.
     *
     * @return the next token index
     */
    @Override
    public int nextIndex() {
        return tokenPos;
    }

    /**
     * Checks whether there are any previous tokens that can be iterated to.
     *
     * @return true if there are previous tokens
     */
    @Override
    public boolean hasPrevious() {
        checkTokenized();
        return tokenPos > 0;
    }

    /**
     * Gets the token previous to the last returned token.
     *
     * @return the previous token
     * @throws NoSuchElementException if there are no previous elements
     */
    @Override
    public String previous() {
        if (hasPrevious()) {
            return tokens[--tokenPos];
        }
        throw new NoSuchElementException();
    }

    /**
     * Gets the index of the previous token.
     *
     * @return the previous token index
     */
    @Override
    public int previousIndex() {
        return tokenPos - 1;
    }

    /**
     * Unsupported ListIterator operation.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public void remove() {
        throw new UnsupportedOperationException("remove() is unsupported");
    }

    /**
     * Unsupported ListIterator operation.
     *
     * @param obj  this parameter is ignored.
     * @throws UnsupportedOperationException always
     */
    @Override
    public void set(final String obj) {
        throw new UnsupportedOperationException("set() is unsupported");
    }

    /**
     * Unsupported ListIterator operation.
     *
     * @param obj  this parameter is ignored.
     * @throws UnsupportedOperationException always
     */
    @Override
    public void add(final String obj) {
        throw new UnsupportedOperationException("add() is unsupported");
    }

    /**
     * Checks if tokenization has been done, and if not then does it.
     */
    private void checkTokenized() {
        if (tokens == null) {
            if (chars == null) {
                // still call tokenize as subclass may do some work
                final List<String> split = tokenize(null, 0, 0);
                tokens = split.toArray(ArrayUtils.EMPTY_STRING_ARRAY);
            } else {
                final List<String> split = tokenize(chars, 0, chars.length);
                tokens = split.toArray(ArrayUtils.EMPTY_STRING_ARRAY);
            }
        }
    }

    /**
     * Internal method that performs the tokenization.
     * <p>
     * Most users of this class do not need to call this method. This method
     * will be called automatically by other (public) methods when required.
     * </p>
     * <p>
     * This method exists to allow subclasses to add code before or after the
     * tokenization. For example, a subclass could alter the character array,
     * offset or count to be parsed, or call the tokenizer multiple times on
     * multiple strings. It is also possible to filter the results.
     * </p>
     * <p>
     * {@link StrTokenizer} will always pass a zero offset and a count
     * equal to the length of the array to this method, however a subclass
     * may pass other values, or even an entirely different array.
     * </p>
     *
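     * <p>
     * A sketch of the kind of subclass this enables (a hypothetical
     * lower-casing tokenizer, shown for illustration only):
     * </p>
     * <pre>
     * class LowerCaseTokenizer extends StrTokenizer {
     *     &#64;Override
     *     protected List&lt;String&gt; tokenize(char[] srcChars, int offset, int count) {
     *         List&lt;String&gt; filtered = new ArrayList&lt;&gt;();
     *         for (String token : super.tokenize(srcChars, offset, count)) {
     *             filtered.add(token == null ? null : token.toLowerCase());
     *         }
     *         return filtered;
     *     }
     * }
     * </pre>
     *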
     * @param srcChars  the character array being tokenized, may be null
     * @param offset  the start position within the character array, must be valid
     * @param count  the number of characters to tokenize, must be valid
     * @return the modifiable list of String tokens, unmodifiable if null array or zero count
     */
    protected List<String> tokenize(final char[] srcChars, final int offset, final int count) {
        if (ArrayUtils.isEmpty(srcChars)) {
            return Collections.emptyList();
        }
        final StrBuilder buf = new StrBuilder();
        final List<String> tokenList = new ArrayList<>();
        int pos = offset;

        // loop around the entire buffer
        while (pos >= 0 && pos < count) {
            // find next token
            pos = readNextToken(srcChars, pos, count, buf, tokenList);

            // handle case where end of string is a delimiter
            if (pos >= count) {
                addToken(tokenList, StringUtils.EMPTY);
            }
        }
        return tokenList;
    }

    /**
     * Adds a token to a list, paying attention to the parameters we've set.
     *
     * @param list  the list to add to
     * @param tok  the token to add
     */
    private void addToken(final List<String> list, String tok) {
        if (StringUtils.isEmpty(tok)) {
            if (isIgnoreEmptyTokens()) {
                return;
            }
            if (isEmptyTokenAsNull()) {
                tok = null;
            }
        }
        list.add(tok);
    }

    /**
     * Reads character by character through the String to get the next token.
     *
     * @param srcChars  the character array being tokenized
     * @param start  the first character of field
     * @param len  the length of the character array being tokenized
     * @param workArea  a temporary work area
     * @param tokenList  the list of parsed tokens
     * @return the starting position of the next field (the character
     *  immediately after the delimiter), or -1 if end of string found
     */
    private int readNextToken(final char[] srcChars, int start, final int len, final StrBuilder workArea, final List<String> tokenList) {
        // skip all leading whitespace, unless it is the
        // field delimiter or the quote character
        while (start < len) {
            final int removeLen = Math.max(
                    getIgnoredMatcher().isMatch(srcChars, start, start, len),
                    getTrimmerMatcher().isMatch(srcChars, start, start, len));
            if (removeLen == 0 ||
                getDelimiterMatcher().isMatch(srcChars, start, start, len) > 0 ||
                getQuoteMatcher().isMatch(srcChars, start, start, len) > 0) {
                break;
            }
            start += removeLen;
        }

        // handle reaching end
        if (start >= len) {
            addToken(tokenList, StringUtils.EMPTY);
            return -1;
        }

        // handle empty token
        final int delimLen = getDelimiterMatcher().isMatch(srcChars, start, start, len);
        if (delimLen > 0) {
            addToken(tokenList, StringUtils.EMPTY);
            return start + delimLen;
        }

        // handle found token
        final int quoteLen = getQuoteMatcher().isMatch(srcChars, start, start, len);
        if (quoteLen > 0) {
            return readWithQuotes(srcChars, start + quoteLen, len, workArea, tokenList, start, quoteLen);
        }
        return readWithQuotes(srcChars, start, len, workArea, tokenList, 0, 0);
    }

    /**
     * Reads a possibly quoted string token.
     *
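     * <p>
     * For example (a sketch assuming a double-quote quote matcher and a comma
     * delimiter), a doubled quote inside a quoted section yields one literal
     * quote character:
     * </p>
     * <pre>
     * input:  "a ""quoted"" value",next
     * tokens: a "quoted" value  |  next
     * </pre>
     *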
     * @param srcChars  the character array being tokenized
     * @param start  the first character of field
     * @param len  the length of the character array being tokenized
     * @param workArea  a temporary work area
     * @param tokenList  the list of parsed tokens
     * @param quoteStart  the start position of the matched quote, 0 if no quoting
     * @param quoteLen  the length of the matched quote, 0 if no quoting
     * @return the starting position of the next field (the character
     *  immediately after the delimiter), or -1 if the end of the string is reached
     */
    private int readWithQuotes(final char[] srcChars, final int start, final int len, final StrBuilder workArea,
                               final List<String> tokenList, final int quoteStart, final int quoteLen) {
        // Loop until we've found the end of the quoted
        // string or the end of the input
        workArea.clear();
        int pos = start;
        boolean quoting = quoteLen > 0;
        int trimStart = 0;

        while (pos < len) {
            // quoting mode can occur several times throughout a string
            // we must switch between quoting and non-quoting until we
            // encounter a non-quoted delimiter, or end of string
            if (quoting) {
                // In quoting mode

                // If we've found a quote character, see if it's
                // followed by a second quote.  If so, then we need
                // to actually put the quote character into the token
                // rather than end the token.
                if (isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
                    if (isQuote(srcChars, pos + quoteLen, len, quoteStart, quoteLen)) {
                        // matched pair of quotes, thus an escaped quote
                        workArea.append(srcChars, pos, quoteLen);
                        pos += quoteLen * 2;
                        trimStart = workArea.size();
                        continue;
                    }

                    // end of quoting
                    quoting = false;
                    pos += quoteLen;
                    continue;
                }

            } else {
                // Not in quoting mode

                // check for delimiter, and thus end of token
                final int delimLen = getDelimiterMatcher().isMatch(srcChars, pos, start, len);
                if (delimLen > 0) {
                    // return condition when end of token found
                    addToken(tokenList, workArea.substring(0, trimStart));
                    return pos + delimLen;
                }

                // check for quote, and thus back into quoting mode
                if (quoteLen > 0 && isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
                    quoting = true;
                    pos += quoteLen;
                    continue;
                }

                // check for ignored (outside quotes), and ignore
                final int ignoredLen = getIgnoredMatcher().isMatch(srcChars, pos, start, len);
                if (ignoredLen > 0) {
                    pos += ignoredLen;
                    continue;
                }

                // check for trimmed character
                // don't yet know if it's at the end, so copy to workArea
                // use trimStart to keep track of trim at the end
                final int trimmedLen = getTrimmerMatcher().isMatch(srcChars, pos, start, len);
                if (trimmedLen > 0) {
                    workArea.append(srcChars, pos, trimmedLen);
                    pos += trimmedLen;
                    continue;
                }
            }
            // copy regular character from inside quotes
            workArea.append(srcChars[pos++]);
            trimStart = workArea.size();
        }

        // return condition when end of string found
        addToken(tokenList, workArea.substring(0, trimStart));
        return -1;
    }

    /**
     * Checks if the characters at the index specified match the quote
     * already matched in readNextToken().
     *
     * @param srcChars  the character array being tokenized
     * @param pos  the position to check for a quote
     * @param len  the length of the character array being tokenized
     * @param quoteStart  the start position of the matched quote, 0 if no quoting
     * @param quoteLen  the length of the matched quote, 0 if no quoting
     * @return true if a quote is matched
     */
    private boolean isQuote(final char[] srcChars, final int pos, final int len, final int quoteStart, final int quoteLen) {
        for (int i = 0; i < quoteLen; i++) {
            if (pos + i >= len || srcChars[pos + i] != srcChars[quoteStart + i]) {
                return false;
            }
        }
        return true;
    }

    /**
     * Gets the field delimiter matcher.
     *
     * @return the delimiter matcher in use
     */
    public StrMatcher getDelimiterMatcher() {
        return this.delimMatcher;
    }

    /**
     * Sets the field delimiter matcher.
     * <p>
     * The delimiter is used to separate one token from another.
     * </p>
     *
     * @param delim  the delimiter matcher to use
     * @return this, to enable chaining
     */
    public StrTokenizer setDelimiterMatcher(final StrMatcher delim) {
        if (delim == null) {
            this.delimMatcher = StrMatcher.noneMatcher();
        } else {
            this.delimMatcher = delim;
        }
        return this;
    }

    /**
     * Sets the field delimiter character.
     *
     * @param delim  the delimiter character to use
     * @return this, to enable chaining
     */
    public StrTokenizer setDelimiterChar(final char delim) {
        return setDelimiterMatcher(StrMatcher.charMatcher(delim));
    }

    /**
     * Sets the field delimiter string.
     *
     * @param delim  the delimiter string to use
     * @return this, to enable chaining
     */
    public StrTokenizer setDelimiterString(final String delim) {
        return setDelimiterMatcher(StrMatcher.stringMatcher(delim));
    }

    /**
     * Gets the quote matcher currently in use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data.
     * The default is not to use quoting (the none matcher).
     * </p>
     *
     * @return the quote matcher in use
     */
    public StrMatcher getQuoteMatcher() {
        return quoteMatcher;
    }

    /**
     * Sets the quote matcher to use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data.
     * </p>
     *
     * @param quote  the quote matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StrTokenizer setQuoteMatcher(final StrMatcher quote) {
        if (quote != null) {
            this.quoteMatcher = quote;
        }
        return this;
    }

    /**
     * Sets the quote character to use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data.
     * </p>
     *
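     * <p>
     * For example (a sketch using the constructors declared above):
     * </p>
     * <pre>
     * new StrTokenizer("one,'two,three',four", ',', '\'').getTokenList();
     * // returns ["one", "two,three", "four"]
     * </pre>
     *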
     * @param quote  the quote character to use
     * @return this, to enable chaining
     */
    public StrTokenizer setQuoteChar(final char quote) {
        return setQuoteMatcher(StrMatcher.charMatcher(quote));
    }

    // Ignored
    /**
     * Gets the ignored character matcher.
     * <p>
     * These characters are ignored when parsing the String, unless they are
     * within a quoted region.
     * The default value is not to ignore anything.
     * </p>
     *
     * @return the ignored matcher in use
     */
    public StrMatcher getIgnoredMatcher() {
        return ignoredMatcher;
    }

    /**
     * Sets the matcher for characters to ignore.
     * <p>
     * These characters are ignored when parsing the String, unless they are
     * within a quoted region.
     * </p>
     *
     * @param ignored  the ignored matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StrTokenizer setIgnoredMatcher(final StrMatcher ignored) {
        if (ignored != null) {
            this.ignoredMatcher = ignored;
        }
        return this;
    }

    /**
     * Sets the character to ignore.
     * <p>
     * This character is ignored when parsing the String, unless it is
     * within a quoted region.
     * </p>
     *
     * @param ignored  the ignored character to use
     * @return this, to enable chaining
     */
    public StrTokenizer setIgnoredChar(final char ignored) {
        return setIgnoredMatcher(StrMatcher.charMatcher(ignored));
    }

    /**
     * Gets the trimmer character matcher.
     * <p>
     * These characters are trimmed off on each side of the delimiter
     * until the token or quote is found.
     * The default value is not to trim anything.
     * </p>
     *
     * @return the trimmer matcher in use
     */
    public StrMatcher getTrimmerMatcher() {
        return trimmerMatcher;
    }

    /**
     * Sets the matcher for characters to trim.
     * <p>
     * These characters are trimmed off on each side of the delimiter
     * until the token or quote is found.
     * </p>
     *
     * @param trimmer  the trimmer matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StrTokenizer setTrimmerMatcher(final StrMatcher trimmer) {
        if (trimmer != null) {
            this.trimmerMatcher = trimmer;
        }
        return this;
    }

    /**
     * Gets whether the tokenizer currently returns empty tokens as null.
     * The default for this property is false.
     *
     * @return true if empty tokens are returned as null
     */
    public boolean isEmptyTokenAsNull() {
        return this.emptyAsNull;
    }

    /**
     * Sets whether the tokenizer should return empty tokens as null.
     * The default for this property is false.
     *
     * @param emptyAsNull  whether empty tokens are returned as null
     * @return this, to enable chaining
     */
    public StrTokenizer setEmptyTokenAsNull(final boolean emptyAsNull) {
        this.emptyAsNull = emptyAsNull;
        return this;
    }

    /**
     * Gets whether the tokenizer currently ignores empty tokens.
     * The default for this property is true.
     *
     * @return true if empty tokens are not returned
     */
    public boolean isIgnoreEmptyTokens() {
        return ignoreEmptyTokens;
    }

    /**
     * Sets whether the tokenizer should ignore and not return empty tokens.
     * The default for this property is true.
     *
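     * <p>
     * For example (a sketch using the default settings and then disabling
     * this property):
     * </p>
     * <pre>
     * new StrTokenizer("a,,c", ',').getTokenList();
     * // returns ["a", "c"]
     * new StrTokenizer("a,,c", ',').setIgnoreEmptyTokens(false).getTokenList();
     * // returns ["a", "", "c"]
     * </pre>
     *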
     * @param ignoreEmptyTokens  whether empty tokens are not returned
     * @return this, to enable chaining
     */
    public StrTokenizer setIgnoreEmptyTokens(final boolean ignoreEmptyTokens) {
        this.ignoreEmptyTokens = ignoreEmptyTokens;
        return this;
    }

    /**
     * Gets the String content that the tokenizer is parsing.
     *
     * @return the string content being parsed
     */
    public String getContent() {
        if (chars == null) {
            return null;
        }
        return new String(chars);
    }

    /**
     * Creates a new instance of this Tokenizer. The new instance is reset so
     * that it will be at the start of the token list.
     * If a {@link CloneNotSupportedException} is caught, {@code null} is returned.
     *
     * @return a new instance of this Tokenizer which has been reset.
     */
    @Override
    public Object clone() {
        try {
            return cloneReset();
        } catch (final CloneNotSupportedException ex) {
            return null;
        }
    }

    /**
     * Creates a new instance of this Tokenizer. The new instance is reset so that
     * it will be at the start of the token list.
     *
     * @return a new instance of this Tokenizer which has been reset.
     * @throws CloneNotSupportedException if there is a problem cloning
     */
    Object cloneReset() throws CloneNotSupportedException {
        // this method exists to enable 100% test coverage
        final StrTokenizer cloned = (StrTokenizer) super.clone();
        if (cloned.chars != null) {
            cloned.chars = cloned.chars.clone();
        }
        cloned.reset();
        return cloned;
    }

    /**
     * Returns a string representation of this tokenizer, including the list
     * of tokens if tokenization has already taken place.
     *
     * @return the string representation
     */
    @Override
    public String toString() {
        if (tokens == null) {
            return "StrTokenizer[not tokenized yet]";
        }
        return "StrTokenizer" + getTokenList();
    }

}