1 // Copyright (c) 2010 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #ifndef BASE_STRING_TOKENIZER_H_ 6 #define BASE_STRING_TOKENIZER_H_ 7 #pragma once 8 9 #include <algorithm> 10 #include <string> 11 12 #include "base/string_piece.h" 13 14 // StringTokenizerT is a simple string tokenizer class. It works like an 15 // iterator that with each step (see the Advance method) updates members that 16 // refer to the next token in the input string. The user may optionally 17 // configure the tokenizer to return delimiters. 18 // 19 // Warning: be careful not to pass a C string into the 2-arg constructor: 20 // StringTokenizer t("this is a test", " "); // WRONG 21 // This will create a temporary std::string, save the begin() and end() 22 // iterators, and then the string will be freed before we actually start 23 // tokenizing it. 24 // Instead, use a std::string or use the 3 arg constructor of CStringTokenizer. 25 // 26 // 27 // EXAMPLE 1: 28 // 29 // char input[] = "this is a test"; 30 // CStringTokenizer t(input, input + strlen(input), " "); 31 // while (t.GetNext()) { 32 // printf("%s\n", t.token().c_str()); 33 // } 34 // 35 // Output: 36 // 37 // this 38 // is 39 // a 40 // test 41 // 42 // 43 // EXAMPLE 2: 44 // 45 // std::string input = "no-cache=\"foo, bar\", private"; 46 // StringTokenizer t(input, ", "); 47 // t.set_quote_chars("\""); 48 // while (t.GetNext()) { 49 // printf("%s\n", t.token().c_str()); 50 // } 51 // 52 // Output: 53 // 54 // no-cache="foo, bar" 55 // private 56 // 57 // 58 // EXAMPLE 3: 59 // 60 // bool next_is_option = false, next_is_value = false; 61 // std::string input = "text/html; charset=UTF-8; foo=bar"; 62 // StringTokenizer t(input, "; ="); 63 // t.set_options(StringTokenizer::RETURN_DELIMS); 64 // while (t.GetNext()) { 65 // if (t.token_is_delim()) { 66 // switch (*t.token_begin()) { 67 // case ';': 68 // next_is_option = true; 69 // break; 70 // case '=': 71 // next_is_value = true; 72 // break; 73 // } 74 // } else { 75 // const char* label; 76 // if (next_is_option) { 77 // label = "option-name"; 78 // next_is_option = false; 79 // } else if (next_is_value) { 80 // label = "option-value"; 81 // next_is_value = false; 82 // } else { 83 // label = "mime-type"; 84 // } 85 // printf("%s: %s\n", label, t.token().c_str()); 86 // } 87 // } 88 // 89 // 90 template <class str, class const_iterator> 91 class StringTokenizerT { 92 public: 93 typedef typename str::value_type char_type; 94 95 // Options that may be pass to set_options() 96 enum { 97 // Specifies the delimiters should be returned as tokens 98 RETURN_DELIMS = 1 << 0, 99 }; 100 101 // The string object must live longer than the tokenizer. (In particular this 102 // should not be constructed with a temporary.) StringTokenizerT(const str & string,const str & delims)103 StringTokenizerT(const str& string, 104 const str& delims) { 105 Init(string.begin(), string.end(), delims); 106 } 107 StringTokenizerT(const_iterator string_begin,const_iterator string_end,const str & delims)108 StringTokenizerT(const_iterator string_begin, 109 const_iterator string_end, 110 const str& delims) { 111 Init(string_begin, string_end, delims); 112 } 113 114 // Set the options for this tokenizer. By default, this is 0. set_options(int options)115 void set_options(int options) { options_ = options; } 116 117 // Set the characters to regard as quotes. By default, this is empty. When 118 // a quote char is encountered, the tokenizer will switch into a mode where 119 // it ignores delimiters that it finds. It switches out of this mode once it 120 // finds another instance of the quote char. If a backslash is encountered 121 // within a quoted string, then the next character is skipped. set_quote_chars(const str & quotes)122 void set_quote_chars(const str& quotes) { quotes_ = quotes; } 123 124 // Call this method to advance the tokenizer to the next delimiter. This 125 // returns false if the tokenizer is complete. This method must be called 126 // before calling any of the token* methods. GetNext()127 bool GetNext() { 128 if (quotes_.empty() && options_ == 0) 129 return QuickGetNext(); 130 else 131 return FullGetNext(); 132 } 133 134 // Start iterating through tokens from the beginning of the string. Reset()135 void Reset() { 136 token_end_ = start_pos_; 137 } 138 139 // Returns true if token is a delimiter. When the tokenizer is constructed 140 // with the RETURN_DELIMS option, this method can be used to check if the 141 // returned token is actually a delimiter. token_is_delim()142 bool token_is_delim() const { return token_is_delim_; } 143 144 // If GetNext() returned true, then these methods may be used to read the 145 // value of the token. token_begin()146 const_iterator token_begin() const { return token_begin_; } token_end()147 const_iterator token_end() const { return token_end_; } token()148 str token() const { return str(token_begin_, token_end_); } token_piece()149 base::StringPiece token_piece() const { 150 return base::StringPiece(&*token_begin_, 151 std::distance(token_begin_, token_end_)); 152 } 153 154 private: Init(const_iterator string_begin,const_iterator string_end,const str & delims)155 void Init(const_iterator string_begin, 156 const_iterator string_end, 157 const str& delims) { 158 start_pos_ = string_begin; 159 token_begin_ = string_begin; 160 token_end_ = string_begin; 161 end_ = string_end; 162 delims_ = delims; 163 options_ = 0; 164 token_is_delim_ = false; 165 } 166 167 // Implementation of GetNext() for when we have no quote characters. We have 168 // two separate implementations because AdvanceOne() is a hot spot in large 169 // text files with large tokens. QuickGetNext()170 bool QuickGetNext() { 171 token_is_delim_ = false; 172 for (;;) { 173 token_begin_ = token_end_; 174 if (token_end_ == end_) 175 return false; 176 ++token_end_; 177 if (delims_.find(*token_begin_) == str::npos) 178 break; 179 // else skip over delimiter. 180 } 181 while (token_end_ != end_ && delims_.find(*token_end_) == str::npos) 182 ++token_end_; 183 return true; 184 } 185 186 // Implementation of GetNext() for when we have to take quotes into account. FullGetNext()187 bool FullGetNext() { 188 AdvanceState state; 189 token_is_delim_ = false; 190 for (;;) { 191 token_begin_ = token_end_; 192 if (token_end_ == end_) 193 return false; 194 ++token_end_; 195 if (AdvanceOne(&state, *token_begin_)) 196 break; 197 if (options_ & RETURN_DELIMS) { 198 token_is_delim_ = true; 199 return true; 200 } 201 // else skip over delimiter. 202 } 203 while (token_end_ != end_ && AdvanceOne(&state, *token_end_)) 204 ++token_end_; 205 return true; 206 } 207 IsDelim(char_type c)208 bool IsDelim(char_type c) const { 209 return delims_.find(c) != str::npos; 210 } 211 IsQuote(char_type c)212 bool IsQuote(char_type c) const { 213 return quotes_.find(c) != str::npos; 214 } 215 216 struct AdvanceState { 217 bool in_quote; 218 bool in_escape; 219 char_type quote_char; AdvanceStateAdvanceState220 AdvanceState() : in_quote(false), in_escape(false) {} 221 }; 222 223 // Returns true if a delimiter was not hit. AdvanceOne(AdvanceState * state,char_type c)224 bool AdvanceOne(AdvanceState* state, char_type c) { 225 if (state->in_quote) { 226 if (state->in_escape) { 227 state->in_escape = false; 228 } else if (c == '\\') { 229 state->in_escape = true; 230 } else if (c == state->quote_char) { 231 state->in_quote = false; 232 } 233 } else { 234 if (IsDelim(c)) 235 return false; 236 state->in_quote = IsQuote(state->quote_char = c); 237 } 238 return true; 239 } 240 241 const_iterator start_pos_; 242 const_iterator token_begin_; 243 const_iterator token_end_; 244 const_iterator end_; 245 str delims_; 246 str quotes_; 247 int options_; 248 bool token_is_delim_; 249 }; 250 251 typedef StringTokenizerT<std::string, std::string::const_iterator> 252 StringTokenizer; 253 typedef StringTokenizerT<std::wstring, std::wstring::const_iterator> 254 WStringTokenizer; 255 typedef StringTokenizerT<std::string, const char*> CStringTokenizer; 256 257 #endif // BASE_STRING_TOKENIZER_H_ 258