• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #ifndef BASE_STRING_TOKENIZER_H_
6 #define BASE_STRING_TOKENIZER_H_
7 #pragma once
8 
9 #include <algorithm>
10 #include <string>
11 
12 #include "base/string_piece.h"
13 
14 // StringTokenizerT is a simple string tokenizer class.  It works like an
15 // iterator that with each step (see the GetNext method) updates members that
16 // refer to the next token in the input string.  The user may optionally
17 // configure the tokenizer to return delimiters.
18 //
19 // Warning: be careful not to pass a C string into the 2-arg constructor:
20 // StringTokenizer t("this is a test", " ");  // WRONG
21 // This will create a temporary std::string, save the begin() and end()
22 // iterators, and then the string will be freed before we actually start
23 // tokenizing it.
24 // Instead, use a std::string or use the 3 arg constructor of CStringTokenizer.
25 //
26 //
27 // EXAMPLE 1:
28 //
29 //   char input[] = "this is a test";
30 //   CStringTokenizer t(input, input + strlen(input), " ");
31 //   while (t.GetNext()) {
32 //     printf("%s\n", t.token().c_str());
33 //   }
34 //
35 // Output:
36 //
37 //   this
38 //   is
39 //   a
40 //   test
41 //
42 //
43 // EXAMPLE 2:
44 //
45 //   std::string input = "no-cache=\"foo, bar\", private";
46 //   StringTokenizer t(input, ", ");
47 //   t.set_quote_chars("\"");
48 //   while (t.GetNext()) {
49 //     printf("%s\n", t.token().c_str());
50 //   }
51 //
52 // Output:
53 //
54 //   no-cache="foo, bar"
55 //   private
56 //
57 //
58 // EXAMPLE 3:
59 //
60 //   bool next_is_option = false, next_is_value = false;
61 //   std::string input = "text/html; charset=UTF-8; foo=bar";
62 //   StringTokenizer t(input, "; =");
63 //   t.set_options(StringTokenizer::RETURN_DELIMS);
64 //   while (t.GetNext()) {
65 //     if (t.token_is_delim()) {
66 //       switch (*t.token_begin()) {
67 //         case ';':
68 //           next_is_option = true;
69 //           break;
70 //         case '=':
71 //           next_is_value = true;
72 //           break;
73 //       }
74 //     } else {
75 //       const char* label;
76 //       if (next_is_option) {
77 //         label = "option-name";
78 //         next_is_option = false;
79 //       } else if (next_is_value) {
80 //         label = "option-value";
81 //         next_is_value = false;
82 //       } else {
83 //         label = "mime-type";
84 //       }
85 //       printf("%s: %s\n", label, t.token().c_str());
86 //     }
87 //   }
88 //
89 //
90 template <class str, class const_iterator>
91 class StringTokenizerT {
92  public:
93   typedef typename str::value_type char_type;
94 
95   // Options that may be pass to set_options()
96   enum {
97     // Specifies the delimiters should be returned as tokens
98     RETURN_DELIMS = 1 << 0,
99   };
100 
101   // The string object must live longer than the tokenizer.  (In particular this
102   // should not be constructed with a temporary.)
StringTokenizerT(const str & string,const str & delims)103   StringTokenizerT(const str& string,
104                    const str& delims) {
105     Init(string.begin(), string.end(), delims);
106   }
107 
StringTokenizerT(const_iterator string_begin,const_iterator string_end,const str & delims)108   StringTokenizerT(const_iterator string_begin,
109                    const_iterator string_end,
110                    const str& delims) {
111     Init(string_begin, string_end, delims);
112   }
113 
114   // Set the options for this tokenizer.  By default, this is 0.
set_options(int options)115   void set_options(int options) { options_ = options; }
116 
117   // Set the characters to regard as quotes.  By default, this is empty.  When
118   // a quote char is encountered, the tokenizer will switch into a mode where
119   // it ignores delimiters that it finds.  It switches out of this mode once it
120   // finds another instance of the quote char.  If a backslash is encountered
121   // within a quoted string, then the next character is skipped.
set_quote_chars(const str & quotes)122   void set_quote_chars(const str& quotes) { quotes_ = quotes; }
123 
124   // Call this method to advance the tokenizer to the next delimiter.  This
125   // returns false if the tokenizer is complete.  This method must be called
126   // before calling any of the token* methods.
GetNext()127   bool GetNext() {
128     if (quotes_.empty() && options_ == 0)
129       return QuickGetNext();
130     else
131       return FullGetNext();
132   }
133 
134   // Start iterating through tokens from the beginning of the string.
Reset()135   void Reset() {
136     token_end_ = start_pos_;
137   }
138 
139   // Returns true if token is a delimiter.  When the tokenizer is constructed
140   // with the RETURN_DELIMS option, this method can be used to check if the
141   // returned token is actually a delimiter.
token_is_delim()142   bool token_is_delim() const { return token_is_delim_; }
143 
144   // If GetNext() returned true, then these methods may be used to read the
145   // value of the token.
token_begin()146   const_iterator token_begin() const { return token_begin_; }
token_end()147   const_iterator token_end() const { return token_end_; }
token()148   str token() const { return str(token_begin_, token_end_); }
token_piece()149   base::StringPiece token_piece() const {
150     return base::StringPiece(&*token_begin_,
151                              std::distance(token_begin_, token_end_));
152   }
153 
154  private:
Init(const_iterator string_begin,const_iterator string_end,const str & delims)155   void Init(const_iterator string_begin,
156             const_iterator string_end,
157             const str& delims) {
158     start_pos_ = string_begin;
159     token_begin_ = string_begin;
160     token_end_ = string_begin;
161     end_ = string_end;
162     delims_ = delims;
163     options_ = 0;
164     token_is_delim_ = false;
165   }
166 
167   // Implementation of GetNext() for when we have no quote characters. We have
168   // two separate implementations because AdvanceOne() is a hot spot in large
169   // text files with large tokens.
QuickGetNext()170   bool QuickGetNext() {
171     token_is_delim_ = false;
172     for (;;) {
173       token_begin_ = token_end_;
174       if (token_end_ == end_)
175         return false;
176       ++token_end_;
177       if (delims_.find(*token_begin_) == str::npos)
178         break;
179       // else skip over delimiter.
180     }
181     while (token_end_ != end_ && delims_.find(*token_end_) == str::npos)
182       ++token_end_;
183     return true;
184   }
185 
186   // Implementation of GetNext() for when we have to take quotes into account.
FullGetNext()187   bool FullGetNext() {
188     AdvanceState state;
189     token_is_delim_ = false;
190     for (;;) {
191       token_begin_ = token_end_;
192       if (token_end_ == end_)
193         return false;
194       ++token_end_;
195       if (AdvanceOne(&state, *token_begin_))
196         break;
197       if (options_ & RETURN_DELIMS) {
198         token_is_delim_ = true;
199         return true;
200       }
201       // else skip over delimiter.
202     }
203     while (token_end_ != end_ && AdvanceOne(&state, *token_end_))
204       ++token_end_;
205     return true;
206   }
207 
IsDelim(char_type c)208   bool IsDelim(char_type c) const {
209     return delims_.find(c) != str::npos;
210   }
211 
IsQuote(char_type c)212   bool IsQuote(char_type c) const {
213     return quotes_.find(c) != str::npos;
214   }
215 
216   struct AdvanceState {
217     bool in_quote;
218     bool in_escape;
219     char_type quote_char;
AdvanceStateAdvanceState220     AdvanceState() : in_quote(false), in_escape(false) {}
221   };
222 
223   // Returns true if a delimiter was not hit.
AdvanceOne(AdvanceState * state,char_type c)224   bool AdvanceOne(AdvanceState* state, char_type c) {
225     if (state->in_quote) {
226       if (state->in_escape) {
227         state->in_escape = false;
228       } else if (c == '\\') {
229         state->in_escape = true;
230       } else if (c == state->quote_char) {
231         state->in_quote = false;
232       }
233     } else {
234       if (IsDelim(c))
235         return false;
236       state->in_quote = IsQuote(state->quote_char = c);
237     }
238     return true;
239   }
240 
241   const_iterator start_pos_;
242   const_iterator token_begin_;
243   const_iterator token_end_;
244   const_iterator end_;
245   str delims_;
246   str quotes_;
247   int options_;
248   bool token_is_delim_;
249 };
250 
251 typedef StringTokenizerT<std::string, std::string::const_iterator>
252     StringTokenizer;
253 typedef StringTokenizerT<std::wstring, std::wstring::const_iterator>
254     WStringTokenizer;
255 typedef StringTokenizerT<std::string, const char*> CStringTokenizer;
256 
257 #endif  // BASE_STRING_TOKENIZER_H_
258