// Copyright (c) 2011 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef BASE_STRINGS_STRING_TOKENIZER_H_
#define BASE_STRINGS_STRING_TOKENIZER_H_

#include <algorithm>
#include <string>

#include "base/strings/string_piece.h"

namespace base {

// StringTokenizerT is a simple string tokenizer class. It works like an
// iterator that with each step (see the GetNext method) updates members that
// refer to the next token in the input string. The user may optionally
// configure the tokenizer to return delimiters.
//
// EXAMPLE 1:
//
//   char input[] = "this is a test";
//   CStringTokenizer t(input, input + strlen(input), " ");
//   while (t.GetNext()) {
//     printf("%s\n", t.token().c_str());
//   }
//
// Output:
//
//   this
//   is
//   a
//   test
//
//
// EXAMPLE 2:
//
//   std::string input = "no-cache=\"foo, bar\", private";
//   StringTokenizer t(input, ", ");
//   t.set_quote_chars("\"");
//   while (t.GetNext()) {
//     printf("%s\n", t.token().c_str());
//   }
//
// Output:
//
//   no-cache="foo, bar"
//   private
//
//
// EXAMPLE 3:
//
//   bool next_is_option = false, next_is_value = false;
//   std::string input = "text/html; charset=UTF-8; foo=bar";
//   StringTokenizer t(input, "; =");
//   t.set_options(StringTokenizer::RETURN_DELIMS);
//   while (t.GetNext()) {
//     if (t.token_is_delim()) {
//       switch (*t.token_begin()) {
//         case ';':
//           next_is_option = true;
//           break;
//         case '=':
//           next_is_value = true;
//           break;
//       }
//     } else {
//       const char* label;
//       if (next_is_option) {
//         label = "option-name";
//         next_is_option = false;
//       } else if (next_is_value) {
//         label = "option-value";
//         next_is_value = false;
//       } else {
//         label = "mime-type";
//       }
//       printf("%s: 
%s\n", label, t.token().c_str()); 79 // } 80 // } 81 // 82 // 83 template <class str, class const_iterator> 84 class StringTokenizerT { 85 public: 86 typedef typename str::value_type char_type; 87 88 // Options that may be pass to set_options() 89 enum { 90 // Specifies the delimiters should be returned as tokens 91 RETURN_DELIMS = 1 << 0, 92 }; 93 94 // The string object must live longer than the tokenizer. In particular, this 95 // should not be constructed with a temporary. The deleted rvalue constructor 96 // blocks the most obvious instances of this (e.g. passing a string literal to 97 // the constructor), but caution must still be exercised. StringTokenizerT(const str & string,const str & delims)98 StringTokenizerT(const str& string, 99 const str& delims) { 100 Init(string.begin(), string.end(), delims); 101 } 102 103 // Don't allow temporary strings to be used with string tokenizer, since 104 // Init() would otherwise save iterators to a temporary string. 105 StringTokenizerT(str&&, const str& delims) = delete; 106 StringTokenizerT(const_iterator string_begin,const_iterator string_end,const str & delims)107 StringTokenizerT(const_iterator string_begin, 108 const_iterator string_end, 109 const str& delims) { 110 Init(string_begin, string_end, delims); 111 } 112 113 // Set the options for this tokenizer. By default, this is 0. set_options(int options)114 void set_options(int options) { options_ = options; } 115 116 // Set the characters to regard as quotes. By default, this is empty. When 117 // a quote char is encountered, the tokenizer will switch into a mode where 118 // it ignores delimiters that it finds. It switches out of this mode once it 119 // finds another instance of the quote char. If a backslash is encountered 120 // within a quoted string, then the next character is skipped. 
set_quote_chars(const str & quotes)121 void set_quote_chars(const str& quotes) { quotes_ = quotes; } 122 123 // Call this method to advance the tokenizer to the next delimiter. This 124 // returns false if the tokenizer is complete. This method must be called 125 // before calling any of the token* methods. GetNext()126 bool GetNext() { 127 if (quotes_.empty() && options_ == 0) 128 return QuickGetNext(); 129 else 130 return FullGetNext(); 131 } 132 133 // Start iterating through tokens from the beginning of the string. Reset()134 void Reset() { 135 token_end_ = start_pos_; 136 } 137 138 // Returns true if token is a delimiter. When the tokenizer is constructed 139 // with the RETURN_DELIMS option, this method can be used to check if the 140 // returned token is actually a delimiter. token_is_delim()141 bool token_is_delim() const { return token_is_delim_; } 142 143 // If GetNext() returned true, then these methods may be used to read the 144 // value of the token. token_begin()145 const_iterator token_begin() const { return token_begin_; } token_end()146 const_iterator token_end() const { return token_end_; } token()147 str token() const { return str(token_begin_, token_end_); } token_piece()148 BasicStringPiece<str> token_piece() const { 149 return BasicStringPiece<str>(&*token_begin_, 150 std::distance(token_begin_, token_end_)); 151 } 152 153 private: Init(const_iterator string_begin,const_iterator string_end,const str & delims)154 void Init(const_iterator string_begin, 155 const_iterator string_end, 156 const str& delims) { 157 start_pos_ = string_begin; 158 token_begin_ = string_begin; 159 token_end_ = string_begin; 160 end_ = string_end; 161 delims_ = delims; 162 options_ = 0; 163 token_is_delim_ = false; 164 } 165 166 // Implementation of GetNext() for when we have no quote characters. We have 167 // two separate implementations because AdvanceOne() is a hot spot in large 168 // text files with large tokens. 
QuickGetNext()169 bool QuickGetNext() { 170 token_is_delim_ = false; 171 for (;;) { 172 token_begin_ = token_end_; 173 if (token_end_ == end_) 174 return false; 175 ++token_end_; 176 if (delims_.find(*token_begin_) == str::npos) 177 break; 178 // else skip over delimiter. 179 } 180 while (token_end_ != end_ && delims_.find(*token_end_) == str::npos) 181 ++token_end_; 182 return true; 183 } 184 185 // Implementation of GetNext() for when we have to take quotes into account. FullGetNext()186 bool FullGetNext() { 187 AdvanceState state; 188 token_is_delim_ = false; 189 for (;;) { 190 token_begin_ = token_end_; 191 if (token_end_ == end_) 192 return false; 193 ++token_end_; 194 if (AdvanceOne(&state, *token_begin_)) 195 break; 196 if (options_ & RETURN_DELIMS) { 197 token_is_delim_ = true; 198 return true; 199 } 200 // else skip over delimiter. 201 } 202 while (token_end_ != end_ && AdvanceOne(&state, *token_end_)) 203 ++token_end_; 204 return true; 205 } 206 IsDelim(char_type c)207 bool IsDelim(char_type c) const { 208 return delims_.find(c) != str::npos; 209 } 210 IsQuote(char_type c)211 bool IsQuote(char_type c) const { 212 return quotes_.find(c) != str::npos; 213 } 214 215 struct AdvanceState { 216 bool in_quote; 217 bool in_escape; 218 char_type quote_char; AdvanceStateAdvanceState219 AdvanceState() : in_quote(false), in_escape(false), quote_char('\0') {} 220 }; 221 222 // Returns true if a delimiter was not hit. 
AdvanceOne(AdvanceState * state,char_type c)223 bool AdvanceOne(AdvanceState* state, char_type c) { 224 if (state->in_quote) { 225 if (state->in_escape) { 226 state->in_escape = false; 227 } else if (c == '\\') { 228 state->in_escape = true; 229 } else if (c == state->quote_char) { 230 state->in_quote = false; 231 } 232 } else { 233 if (IsDelim(c)) 234 return false; 235 state->in_quote = IsQuote(state->quote_char = c); 236 } 237 return true; 238 } 239 240 const_iterator start_pos_; 241 const_iterator token_begin_; 242 const_iterator token_end_; 243 const_iterator end_; 244 str delims_; 245 str quotes_; 246 int options_; 247 bool token_is_delim_; 248 }; 249 250 typedef StringTokenizerT<std::string, std::string::const_iterator> 251 StringTokenizer; 252 typedef StringTokenizerT<std::wstring, std::wstring::const_iterator> 253 WStringTokenizer; 254 typedef StringTokenizerT<std::string, const char*> CStringTokenizer; 255 256 } // namespace base 257 258 #endif // BASE_STRINGS_STRING_TOKENIZER_H_ 259