1 // Copyright 2011 The Chromium Authors 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #ifndef BASE_STRINGS_STRING_TOKENIZER_H_ 6 #define BASE_STRINGS_STRING_TOKENIZER_H_ 7 8 #include <algorithm> 9 #include <string> 10 11 #include "base/check.h" 12 #include "base/strings/string_piece.h" 13 #include "base/strings/string_util.h" 14 15 namespace base { 16 17 // StringTokenizerT is a simple string tokenizer class. It works like an 18 // iterator that with each step (see the Advance method) updates members that 19 // refer to the next token in the input string. The user may optionally 20 // configure the tokenizer to return delimiters. For the optional 21 // WhitespacePolicy parameter, kSkipOver will cause the tokenizer to skip 22 // over whitespace characters. The tokenizer never stops on a whitespace 23 // character. 24 // 25 // EXAMPLE 1: 26 // 27 // char input[] = "this is a test"; 28 // CStringTokenizer t(input, input + strlen(input), " "); 29 // while (t.GetNext()) { 30 // printf("%s\n", t.token().c_str()); 31 // } 32 // 33 // Output: 34 // 35 // this 36 // is 37 // a 38 // test 39 // 40 // 41 // EXAMPLE 2: 42 // 43 // std::string input = "no-cache=\"foo, bar\", private"; 44 // StringTokenizer t(input, ", "); 45 // t.set_quote_chars("\""); 46 // while (t.GetNext()) { 47 // printf("%s\n", t.token().c_str()); 48 // } 49 // 50 // Output: 51 // 52 // no-cache="foo, bar" 53 // private 54 // 55 // 56 // EXAMPLE 3: 57 // 58 // bool next_is_option = false, next_is_value = false; 59 // std::string input = "text/html; charset=UTF-8; foo=bar"; 60 // StringTokenizer t(input, "; ="); 61 // t.set_options(StringTokenizer::RETURN_DELIMS); 62 // while (t.GetNext()) { 63 // if (t.token_is_delim()) { 64 // switch (*t.token_begin()) { 65 // case ';': 66 // next_is_option = true; 67 // break; 68 // case '=': 69 // next_is_value = true; 70 // break; 71 // } 72 // } else { 73 // const char* label; 74 // if (next_is_option) { 75 // label = "option-name"; 76 // next_is_option = false; 77 // } else if (next_is_value) { 78 // label = "option-value"; 79 // next_is_value = false; 80 // } else { 81 // label = "mime-type"; 82 // } 83 // printf("%s: %s\n", label, t.token().c_str()); 84 // } 85 // } 86 // 87 // 88 // EXAMPLE 4: 89 // 90 // std::string input = "this, \t is, \t a, \t test"; 91 // StringTokenizer t(input, ",", 92 // StringTokenizer::WhitespacePolicy::kSkipOver); 93 // while (t.GetNext()) { 94 // printf("%s\n", t.token().c_str()); 95 // } 96 // 97 // Output: 98 // 99 // this 100 // is 101 // a 102 // test 103 // 104 // 105 template <class str, class const_iterator> 106 class StringTokenizerT { 107 public: 108 typedef typename str::value_type char_type; 109 110 // Options that may be pass to set_options() 111 enum { 112 // Specifies the delimiters should be returned as tokens 113 RETURN_DELIMS = 1 << 0, 114 115 // Specifies that empty tokens should be returned. Treats the beginning and 116 // ending of the string as implicit delimiters, though doesn't return them 117 // as tokens if RETURN_DELIMS is also used. 118 RETURN_EMPTY_TOKENS = 1 << 1, 119 }; 120 121 // Policy indicating what to do with whitespace characters. Whitespace is 122 // defined to be the characters indicated here: 123 // https://www.w3schools.com/jsref/jsref_regexp_whitespace.asp 124 enum class WhitespacePolicy { 125 // Whitespace should be treated the same as any other non-delimiter 126 // character. 127 kIncludeInTokens, 128 // Whitespace is skipped over and not included in the resulting token. 129 // Whitespace will also delimit other tokens, however it is never returned 130 // even if RETURN_DELIMS is set. If quote chars are set (See set_quote_chars 131 // below) Whitespace will be included in a token when processing quotes. 132 kSkipOver, 133 }; 134 135 // The string object must live longer than the tokenizer. In particular, this 136 // should not be constructed with a temporary. The deleted rvalue constructor 137 // blocks the most obvious instances of this (e.g. passing a string literal to 138 // the constructor), but caution must still be exercised. 139 StringTokenizerT( 140 const str& string, 141 const str& delims, 142 WhitespacePolicy whitespace_policy = WhitespacePolicy::kIncludeInTokens) { 143 Init(string.begin(), string.end(), delims, whitespace_policy); 144 } 145 146 // Don't allow temporary strings to be used with string tokenizer, since 147 // Init() would otherwise save iterators to a temporary string. 148 StringTokenizerT(str&&, const str& delims) = delete; 149 150 StringTokenizerT( 151 const_iterator string_begin, 152 const_iterator string_end, 153 const str& delims, 154 WhitespacePolicy whitespace_policy = WhitespacePolicy::kIncludeInTokens) { 155 Init(string_begin, string_end, delims, whitespace_policy); 156 } 157 158 // Set the options for this tokenizer. By default, this is 0. set_options(int options)159 void set_options(int options) { options_ = options; } 160 161 // Set the characters to regard as quotes. By default, this is empty. When 162 // a quote char is encountered, the tokenizer will switch into a mode where 163 // it ignores delimiters that it finds. It switches out of this mode once it 164 // finds another instance of the quote char. If a backslash is encountered 165 // within a quoted string, then the next character is skipped. set_quote_chars(const str & quotes)166 void set_quote_chars(const str& quotes) { quotes_ = quotes; } 167 168 // Call this method to advance the tokenizer to the next delimiter. This 169 // returns false if the tokenizer is complete. This method must be called 170 // before calling any of the token* methods. GetNext()171 bool GetNext() { 172 if (quotes_.empty() && options_ == 0) 173 return QuickGetNext(); 174 else 175 return FullGetNext(); 176 } 177 178 // Start iterating through tokens from the beginning of the string. Reset()179 void Reset() { 180 token_end_ = start_pos_; 181 } 182 183 // Returns true if token is a delimiter. When the tokenizer is constructed 184 // with the RETURN_DELIMS option, this method can be used to check if the 185 // returned token is actually a delimiter. Returns true before the first 186 // time GetNext() has been called, and after GetNext() returns false. token_is_delim()187 bool token_is_delim() const { return token_is_delim_; } 188 189 // If GetNext() returned true, then these methods may be used to read the 190 // value of the token. token_begin()191 const_iterator token_begin() const { return token_begin_; } token_end()192 const_iterator token_end() const { return token_end_; } token()193 str token() const { return str(token_begin_, token_end_); } token_piece()194 BasicStringPiece<char_type> token_piece() const { 195 return MakeBasicStringPiece<char_type>(token_begin_, token_end_); 196 } 197 198 private: Init(const_iterator string_begin,const_iterator string_end,const str & delims,WhitespacePolicy whitespace_policy)199 void Init(const_iterator string_begin, 200 const_iterator string_end, 201 const str& delims, 202 WhitespacePolicy whitespace_policy) { 203 start_pos_ = string_begin; 204 token_begin_ = string_begin; 205 token_end_ = string_begin; 206 end_ = string_end; 207 delims_ = delims; 208 options_ = 0; 209 token_is_delim_ = true; 210 whitespace_policy_ = whitespace_policy; 211 } 212 ShouldSkip(char_type c)213 bool ShouldSkip(char_type c) const { 214 return whitespace_policy_ == WhitespacePolicy::kSkipOver && 215 IsAsciiWhitespace(c); 216 } 217 218 // Skip over any contiguous whitespace characters according to the whitespace 219 // policy. SkipWhitespace()220 void SkipWhitespace() { 221 while (token_end_ != end_ && ShouldSkip(*token_end_)) 222 ++token_end_; 223 } 224 225 // Implementation of GetNext() for when we have no quote characters. We have 226 // two separate implementations because AdvanceOne() is a hot spot in large 227 // text files with large tokens. QuickGetNext()228 bool QuickGetNext() { 229 token_is_delim_ = false; 230 for (;;) { 231 token_begin_ = token_end_; 232 if (token_end_ == end_) { 233 token_is_delim_ = true; 234 return false; 235 } 236 ++token_end_; 237 if (delims_.find(*token_begin_) == str::npos && 238 !ShouldSkip(*token_begin_)) { 239 break; 240 } 241 // else skip over delimiter or skippable character. 242 } 243 while (token_end_ != end_ && delims_.find(*token_end_) == str::npos && 244 !ShouldSkip(*token_end_)) { 245 ++token_end_; 246 } 247 return true; 248 } 249 250 // Implementation of GetNext() for when we have to take quotes into account. FullGetNext()251 bool FullGetNext() { 252 AdvanceState state; 253 254 SkipWhitespace(); 255 for (;;) { 256 if (token_is_delim_) { 257 // Last token was a delimiter. Note: This is also the case at the start. 258 // 259 // ... D T T T T D ... 260 // ^ ^ 261 // | | 262 // | |token_end_| : The next character to look at or |end_|. 263 // | 264 // |token_begin_| : Points to delimiter or |token_end_|. 265 // 266 // The next token is always a non-delimiting token. It could be empty, 267 // however. 268 token_is_delim_ = false; 269 token_begin_ = token_end_; 270 271 // Slurp all non-delimiter characters into the token. 272 while (token_end_ != end_ && AdvanceOne(&state, *token_end_)) { 273 ++token_end_; 274 } 275 276 // If it's non-empty, or empty tokens were requested, return the token. 277 if (token_begin_ != token_end_ || (options_ & RETURN_EMPTY_TOKENS)) 278 return true; 279 } 280 281 DCHECK(!token_is_delim_); 282 // Last token was a regular token. 283 // 284 // ... T T T D T T ... 285 // ^ ^ 286 // | | 287 // | token_end_ : The next character to look at. Always one 288 // | char beyond the token boundary. 289 // | 290 // token_begin_ : Points to beginning of token. Note: token could 291 // be empty, in which case 292 // token_begin_ == token_end_. 293 // 294 // The next token is always a delimiter. It could be |end_| however, but 295 // |end_| is also an implicit delimiter. 296 token_is_delim_ = true; 297 token_begin_ = token_end_; 298 299 if (token_end_ == end_) 300 return false; 301 302 // Look at the delimiter. 303 ++token_end_; 304 if (options_ & RETURN_DELIMS) 305 return true; 306 } 307 308 return false; 309 } 310 IsDelim(char_type c)311 bool IsDelim(char_type c) const { return delims_.find(c) != str::npos; } 312 IsQuote(char_type c)313 bool IsQuote(char_type c) const { return quotes_.find(c) != str::npos; } 314 315 struct AdvanceState { 316 bool in_quote; 317 bool in_escape; 318 char_type quote_char; AdvanceStateAdvanceState319 AdvanceState() : in_quote(false), in_escape(false), quote_char('\0') {} 320 }; 321 322 // Returns true if a delimiter or, depending on policy, whitespace was not 323 // hit. AdvanceOne(AdvanceState * state,char_type c)324 bool AdvanceOne(AdvanceState* state, char_type c) { 325 if (state->in_quote) { 326 if (state->in_escape) { 327 state->in_escape = false; 328 } else if (c == '\\') { 329 state->in_escape = true; 330 } else if (c == state->quote_char) { 331 state->in_quote = false; 332 } 333 } else { 334 if (IsDelim(c) || ShouldSkip(c)) 335 return false; 336 state->in_quote = IsQuote(state->quote_char = c); 337 } 338 return true; 339 } 340 341 const_iterator start_pos_; 342 const_iterator token_begin_; 343 const_iterator token_end_; 344 const_iterator end_; 345 str delims_; 346 str quotes_; 347 int options_; 348 bool token_is_delim_; 349 WhitespacePolicy whitespace_policy_; 350 }; 351 352 typedef StringTokenizerT<std::string, std::string::const_iterator> 353 StringTokenizer; 354 typedef StringTokenizerT<std::u16string, std::u16string::const_iterator> 355 String16Tokenizer; 356 typedef StringTokenizerT<std::string, const char*> CStringTokenizer; 357 358 } // namespace base 359 360 #endif // BASE_STRINGS_STRING_TOKENIZER_H_ 361