// Copyright (c) 2011 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef BASE_STRINGS_STRING_TOKENIZER_H_
#define BASE_STRINGS_STRING_TOKENIZER_H_

#include <algorithm>
#include <iterator>
#include <string>
#include <string_view>

namespace base {

// StringTokenizerT is a simple string tokenizer class.  It works like an
// iterator that with each step (see the GetNext method) updates members that
// refer to the next token in the input string.  The user may optionally
// configure the tokenizer to return delimiters.
//
// EXAMPLE 1:
//
//   char input[] = "this is a test";
//   CStringTokenizer t(input, input + strlen(input), " ");
//   while (t.GetNext()) {
//     printf("%s\n", t.token().c_str());
//   }
//
// Output:
//
//   this
//   is
//   a
//   test
//
//
// EXAMPLE 2:
//
//   std::string input = "no-cache=\"foo, bar\", private";
//   StringTokenizer t(input, ", ");
//   t.set_quote_chars("\"");
//   while (t.GetNext()) {
//     printf("%s\n", t.token().c_str());
//   }
//
// Output:
//
//   no-cache="foo, bar"
//   private
//
//
// EXAMPLE 3:
//
//   bool next_is_option = false, next_is_value = false;
//   std::string input = "text/html; charset=UTF-8; foo=bar";
//   StringTokenizer t(input, "; =");
//   t.set_options(StringTokenizer::RETURN_DELIMS);
//   while (t.GetNext()) {
//     if (t.token_is_delim()) {
//       switch (*t.token_begin()) {
//         case ';':
//           next_is_option = true;
//           break;
//         case '=':
//           next_is_value = true;
//           break;
//       }
//     } else {
//       const char* label;
//       if (next_is_option) {
//         label = "option-name";
//         next_is_option = false;
//       } else if (next_is_value) {
//         label = "option-value";
//         next_is_value = false;
//       } else {
//         label = "mime-type";
//       }
//       printf("%s: %s\n", label, t.token().c_str());
//     }
//   }
//
//
template <class str, class const_iterator>
class StringTokenizerT {
 public:
  using char_type = typename str::value_type;

  // Options that may be passed to set_options()
  enum {
    // Specifies the delimiters should be returned as tokens
    RETURN_DELIMS = 1 << 0,
  };

  // The string object must live longer than the tokenizer. In particular, this
  // should not be constructed with a temporary. The deleted rvalue constructor
  // blocks the most obvious instances of this (e.g. passing a string literal to
  // the constructor), but caution must still be exercised.
  StringTokenizerT(const str& string, const str& delims) {
    Init(string.begin(), string.end(), delims);
  }

  // Don't allow temporary strings to be used with string tokenizer, since
  // Init() would otherwise save iterators to a temporary string.
  StringTokenizerT(str&&, const str& delims) = delete;

  // Tokenizes the half-open range [string_begin, string_end). Only the
  // iterators are stored, so the underlying characters must outlive the
  // tokenizer. |delims| is copied.
  StringTokenizerT(const_iterator string_begin,
                   const_iterator string_end,
                   const str& delims) {
    Init(string_begin, string_end, delims);
  }

  // Set the options for this tokenizer.  By default, this is 0.
  void set_options(int options) { options_ = options; }

  // Set the characters to regard as quotes.  By default, this is empty.  When
  // a quote char is encountered, the tokenizer will switch into a mode where
  // it ignores delimiters that it finds.  It switches out of this mode once it
  // finds another instance of the quote char.  If a backslash is encountered
  // within a quoted string, then the next character is skipped.
  void set_quote_chars(const str& quotes) { quotes_ = quotes; }

  // Call this method to advance the tokenizer to the next token.  This
  // returns false if the tokenizer is complete.  This method must be called
  // before calling any of the token* methods.
  bool GetNext() {
    // The quick path is only valid when neither quoting nor any option is
    // active; everything else goes through the stateful scanner.
    if (quotes_.empty() && options_ == 0)
      return QuickGetNext();
    return FullGetNext();
  }

  // Start iterating through tokens from the beginning of the string.  The
  // current token is rewound to an empty range at the start as well, so the
  // token accessors never observe a reversed [token_begin, token_end) range
  // between Reset() and the next GetNext().
  void Reset() {
    token_begin_ = start_pos_;
    token_end_ = start_pos_;
    token_is_delim_ = false;
  }

  // Returns true if token is a delimiter.  When the tokenizer is constructed
  // with the RETURN_DELIMS option, this method can be used to check if the
  // returned token is actually a delimiter.
  bool token_is_delim() const { return token_is_delim_; }

  // If GetNext() returned true, then these methods may be used to read the
  // value of the token.
  const_iterator token_begin() const { return token_begin_; }
  const_iterator token_end() const { return token_end_; }
  str token() const { return str(token_begin_, token_end_); }

  // Returns a non-owning view of the current token.  An empty token range
  // yields a default-constructed (empty) view: the range may sit at the end
  // of the input, where dereferencing |token_begin_| to take its address
  // would not be valid.
  std::basic_string_view<typename str::value_type> token_piece() const {
    using view_type = std::basic_string_view<typename str::value_type>;
    if (token_begin_ == token_end_)
      return view_type();
    return view_type(&*token_begin_,
                     static_cast<typename view_type::size_type>(
                         std::distance(token_begin_, token_end_)));
  }

 private:
  // Shared constructor body: records the input range, copies the delimiter
  // set, and puts the tokenizer in its initial state (no options, empty
  // current token at the start of the input).
  void Init(const_iterator string_begin,
            const_iterator string_end,
            const str& delims) {
    start_pos_ = string_begin;
    token_begin_ = string_begin;
    token_end_ = string_begin;
    end_ = string_end;
    delims_ = delims;
    options_ = 0;
    token_is_delim_ = false;
  }

  // Implementation of GetNext() for when we have no quote characters.  We have
  // two separate implementations because AdvanceOne() is a hot spot in large
  // text files with large tokens.
  bool QuickGetNext() {
    token_is_delim_ = false;
    // Skip leading delimiters; stop on the first non-delimiter character,
    // which becomes the first character of the token.
    for (;;) {
      token_begin_ = token_end_;
      if (token_end_ == end_)
        return false;
      ++token_end_;
      if (!IsDelim(*token_begin_))
        break;
      // else skip over delimiter.
    }
    // Extend the token up to (not including) the next delimiter or the end.
    while (token_end_ != end_ && !IsDelim(*token_end_))
      ++token_end_;
    return true;
  }

  // Implementation of GetNext() for when we have to take quotes into account.
  bool FullGetNext() {
    // Quote/escape tracking starts fresh for every token.
    AdvanceState state;
    token_is_delim_ = false;
    for (;;) {
      token_begin_ = token_end_;
      if (token_end_ == end_)
        return false;
      ++token_end_;
      if (AdvanceOne(&state, *token_begin_))
        break;
      if (options_ & RETURN_DELIMS) {
        // Hand the single delimiter character back as its own token.
        token_is_delim_ = true;
        return true;
      }
      // else skip over delimiter.
    }
    while (token_end_ != end_ && AdvanceOne(&state, *token_end_))
      ++token_end_;
    return true;
  }

  // Returns true if |c| is one of the configured delimiter characters.
  bool IsDelim(char_type c) const { return delims_.find(c) != str::npos; }

  // Returns true if |c| is one of the configured quote characters.
  bool IsQuote(char_type c) const { return quotes_.find(c) != str::npos; }

  // Scanner state carried across the characters of a single token.
  struct AdvanceState {
    bool in_quote;         // Currently inside a quoted run.
    bool in_escape;        // Previous character was a backslash inside quotes.
    char_type quote_char;  // Last unquoted non-delimiter character seen; while
                           // |in_quote| is set, the character that closes the
                           // quoted run.
    AdvanceState() : in_quote(false), in_escape(false), quote_char('\0') {}
  };

  // Feeds one character to the scanner.  Returns true if a delimiter was not
  // hit.
  bool AdvanceOne(AdvanceState* state, char_type c) {
    if (state->in_quote) {
      if (state->in_escape) {
        // The escaped character is consumed without being interpreted.
        state->in_escape = false;
      } else if (c == '\\') {
        state->in_escape = true;
      } else if (c == state->quote_char) {
        state->in_quote = false;
      }
    } else {
      if (IsDelim(c))
        return false;
      // Remember the character unconditionally; it only acts as the closing
      // quote if it is actually one of the quote characters.
      state->quote_char = c;
      state->in_quote = IsQuote(c);
    }
    return true;
  }

  const_iterator start_pos_;
  const_iterator token_begin_;
  const_iterator token_end_;
  const_iterator end_;
  str delims_;
  str quotes_;
  int options_;
  bool token_is_delim_;
};

using StringTokenizer =
    StringTokenizerT<std::string, std::string::const_iterator>;
using WStringTokenizer =
    StringTokenizerT<std::u16string, std::u16string::const_iterator>;
using CStringTokenizer = StringTokenizerT<std::string, const char*>;

}  // namespace base

#endif  // BASE_STRINGS_STRING_TOKENIZER_H_