//
// Copyright 2017 The Abseil Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// -----------------------------------------------------------------------------
// File: str_split.h
// -----------------------------------------------------------------------------
//
// This file contains functions for splitting strings. It defines the main
// `StrSplit()` function, several delimiters for determining the boundaries on
// which to split the string, and predicates for filtering delimited results.
// `StrSplit()` adapts the returned collection to the type specified by the
// caller.
//
// Example:
//
//   // Splits the given string on commas. Returns the results in a
//   // vector of strings.
//   std::vector<std::string> v = absl::StrSplit("a,b,c", ',');
//   // Can also use ","
//   // v[0] == "a", v[1] == "b", v[2] == "c"
//
// See StrSplit() below for more information.
35 #ifndef ABSL_STRINGS_STR_SPLIT_H_ 36 #define ABSL_STRINGS_STR_SPLIT_H_ 37 38 #include <algorithm> 39 #include <cstddef> 40 #include <map> 41 #include <set> 42 #include <string> 43 #include <utility> 44 #include <vector> 45 46 #include "absl/base/internal/raw_logging.h" 47 #include "absl/strings/internal/str_split_internal.h" 48 #include "absl/strings/string_view.h" 49 #include "absl/strings/strip.h" 50 51 namespace absl { 52 ABSL_NAMESPACE_BEGIN 53 54 //------------------------------------------------------------------------------ 55 // Delimiters 56 //------------------------------------------------------------------------------ 57 // 58 // `StrSplit()` uses delimiters to define the boundaries between elements in the 59 // provided input. Several `Delimiter` types are defined below. If a string 60 // (`const char*`, `std::string`, or `absl::string_view`) is passed in place of 61 // an explicit `Delimiter` object, `StrSplit()` treats it the same way as if it 62 // were passed a `ByString` delimiter. 63 // 64 // A `Delimiter` is an object with a `Find()` function that knows how to find 65 // the first occurrence of itself in a given `absl::string_view`. 66 // 67 // The following `Delimiter` types are available for use within `StrSplit()`: 68 // 69 // - `ByString` (default for string arguments) 70 // - `ByChar` (default for a char argument) 71 // - `ByAnyChar` 72 // - `ByLength` 73 // - `MaxSplits` 74 // 75 // A Delimiter's `Find()` member function will be passed an input `text` that is 76 // to be split and a position (`pos`) to begin searching for the next delimiter 77 // in `text`. The returned absl::string_view should refer to the next occurrence 78 // (after `pos`) of the represented delimiter; this returned absl::string_view 79 // represents the next location where the input `text` should be broken. 80 // 81 // The returned absl::string_view may be zero-length if the Delimiter does not 82 // represent a part of the string (e.g., a fixed-length delimiter). 
If no 83 // delimiter is found in the input `text`, a zero-length absl::string_view 84 // referring to `text.end()` should be returned (e.g., 85 // `text.substr(text.size())`). It is important that the returned 86 // absl::string_view always be within the bounds of the input `text` given as an 87 // argument--it must not refer to a string that is physically located outside of 88 // the given string. 89 // 90 // The following example is a simple Delimiter object that is created with a 91 // single char and will look for that char in the text passed to the `Find()` 92 // function: 93 // 94 // struct SimpleDelimiter { 95 // const char c_; 96 // explicit SimpleDelimiter(char c) : c_(c) {} 97 // absl::string_view Find(absl::string_view text, size_t pos) { 98 // auto found = text.find(c_, pos); 99 // if (found == absl::string_view::npos) 100 // return text.substr(text.size()); 101 // 102 // return text.substr(found, 1); 103 // } 104 // }; 105 106 // ByString 107 // 108 // A sub-string delimiter. If `StrSplit()` is passed a string in place of a 109 // `Delimiter` object, the string will be implicitly converted into a 110 // `ByString` delimiter. 111 // 112 // Example: 113 // 114 // // Because a string literal is converted to an `absl::ByString`, 115 // // the following two splits are equivalent. 116 // 117 // std::vector<std::string> v1 = absl::StrSplit("a, b, c", ", "); 118 // 119 // using absl::ByString; 120 // std::vector<std::string> v2 = absl::StrSplit("a, b, c", 121 // ByString(", ")); 122 // // v[0] == "a", v[1] == "b", v[2] == "c" 123 class ByString { 124 public: 125 explicit ByString(absl::string_view sp); 126 absl::string_view Find(absl::string_view text, size_t pos) const; 127 128 private: 129 const std::string delimiter_; 130 }; 131 132 // ByChar 133 // 134 // A single character delimiter. `ByChar` is functionally equivalent to a 135 // 1-char string within a `ByString` delimiter, but slightly more efficient. 
136 // 137 // Example: 138 // 139 // // Because a char literal is converted to a absl::ByChar, 140 // // the following two splits are equivalent. 141 // std::vector<std::string> v1 = absl::StrSplit("a,b,c", ','); 142 // using absl::ByChar; 143 // std::vector<std::string> v2 = absl::StrSplit("a,b,c", ByChar(',')); 144 // // v[0] == "a", v[1] == "b", v[2] == "c" 145 // 146 // `ByChar` is also the default delimiter if a single character is given 147 // as the delimiter to `StrSplit()`. For example, the following calls are 148 // equivalent: 149 // 150 // std::vector<std::string> v = absl::StrSplit("a-b", '-'); 151 // 152 // using absl::ByChar; 153 // std::vector<std::string> v = absl::StrSplit("a-b", ByChar('-')); 154 // 155 class ByChar { 156 public: ByChar(char c)157 explicit ByChar(char c) : c_(c) {} 158 absl::string_view Find(absl::string_view text, size_t pos) const; 159 160 private: 161 char c_; 162 }; 163 164 // ByAnyChar 165 // 166 // A delimiter that will match any of the given byte-sized characters within 167 // its provided string. 168 // 169 // Note: this delimiter works with single-byte string data, but does not work 170 // with variable-width encodings, such as UTF-8. 171 // 172 // Example: 173 // 174 // using absl::ByAnyChar; 175 // std::vector<std::string> v = absl::StrSplit("a,b=c", ByAnyChar(",=")); 176 // // v[0] == "a", v[1] == "b", v[2] == "c" 177 // 178 // If `ByAnyChar` is given the empty string, it behaves exactly like 179 // `ByString` and matches each individual character in the input string. 180 // 181 class ByAnyChar { 182 public: 183 explicit ByAnyChar(absl::string_view sp); 184 absl::string_view Find(absl::string_view text, size_t pos) const; 185 186 private: 187 const std::string delimiters_; 188 }; 189 190 // ByLength 191 // 192 // A delimiter for splitting into equal-length strings. The length argument to 193 // the constructor must be greater than 0. 
194 // 195 // Note: this delimiter works with single-byte string data, but does not work 196 // with variable-width encodings, such as UTF-8. 197 // 198 // Example: 199 // 200 // using absl::ByLength; 201 // std::vector<std::string> v = absl::StrSplit("123456789", ByLength(3)); 202 203 // // v[0] == "123", v[1] == "456", v[2] == "789" 204 // 205 // Note that the string does not have to be a multiple of the fixed split 206 // length. In such a case, the last substring will be shorter. 207 // 208 // using absl::ByLength; 209 // std::vector<std::string> v = absl::StrSplit("12345", ByLength(2)); 210 // 211 // // v[0] == "12", v[1] == "34", v[2] == "5" 212 class ByLength { 213 public: 214 explicit ByLength(ptrdiff_t length); 215 absl::string_view Find(absl::string_view text, size_t pos) const; 216 217 private: 218 const ptrdiff_t length_; 219 }; 220 221 namespace strings_internal { 222 223 // A traits-like metafunction for selecting the default Delimiter object type 224 // for a particular Delimiter type. The base case simply exposes type Delimiter 225 // itself as the delimiter's Type. However, there are specializations for 226 // string-like objects that map them to the ByString delimiter object. 227 // This allows functions like absl::StrSplit() and absl::MaxSplits() to accept 228 // string-like objects (e.g., ',') as delimiter arguments but they will be 229 // treated as if a ByString delimiter was given. 
230 template <typename Delimiter> 231 struct SelectDelimiter { 232 using type = Delimiter; 233 }; 234 235 template <> 236 struct SelectDelimiter<char> { 237 using type = ByChar; 238 }; 239 template <> 240 struct SelectDelimiter<char*> { 241 using type = ByString; 242 }; 243 template <> 244 struct SelectDelimiter<const char*> { 245 using type = ByString; 246 }; 247 template <> 248 struct SelectDelimiter<absl::string_view> { 249 using type = ByString; 250 }; 251 template <> 252 struct SelectDelimiter<std::string> { 253 using type = ByString; 254 }; 255 256 // Wraps another delimiter and sets a max number of matches for that delimiter. 257 template <typename Delimiter> 258 class MaxSplitsImpl { 259 public: 260 MaxSplitsImpl(Delimiter delimiter, int limit) 261 : delimiter_(delimiter), limit_(limit), count_(0) {} 262 absl::string_view Find(absl::string_view text, size_t pos) { 263 if (count_++ == limit_) { 264 return absl::string_view(text.data() + text.size(), 265 0); // No more matches. 266 } 267 return delimiter_.Find(text, pos); 268 } 269 270 private: 271 Delimiter delimiter_; 272 const int limit_; 273 int count_; 274 }; 275 276 } // namespace strings_internal 277 278 // MaxSplits() 279 // 280 // A delimiter that limits the number of matches which can occur to the passed 281 // `limit`. The last element in the returned collection will contain all 282 // remaining unsplit pieces, which may contain instances of the delimiter. 283 // The collection will contain at most `limit` + 1 elements. 
284 // Example: 285 // 286 // using absl::MaxSplits; 287 // std::vector<std::string> v = absl::StrSplit("a,b,c", MaxSplits(',', 1)); 288 // 289 // // v[0] == "a", v[1] == "b,c" 290 template <typename Delimiter> 291 inline strings_internal::MaxSplitsImpl< 292 typename strings_internal::SelectDelimiter<Delimiter>::type> 293 MaxSplits(Delimiter delimiter, int limit) { 294 typedef 295 typename strings_internal::SelectDelimiter<Delimiter>::type DelimiterType; 296 return strings_internal::MaxSplitsImpl<DelimiterType>( 297 DelimiterType(delimiter), limit); 298 } 299 300 //------------------------------------------------------------------------------ 301 // Predicates 302 //------------------------------------------------------------------------------ 303 // 304 // Predicates filter the results of a `StrSplit()` by determining whether or not 305 // a resultant element is included in the result set. A predicate may be passed 306 // as an optional third argument to the `StrSplit()` function. 307 // 308 // Predicates are unary functions (or functors) that take a single 309 // `absl::string_view` argument and return a bool indicating whether the 310 // argument should be included (`true`) or excluded (`false`). 311 // 312 // Predicates are useful when filtering out empty substrings. By default, empty 313 // substrings may be returned by `StrSplit()`, which is similar to the way split 314 // functions work in other programming languages. 315 316 // AllowEmpty() 317 // 318 // Always returns `true`, indicating that all strings--including empty 319 // strings--should be included in the split output. This predicate is not 320 // strictly needed because this is the default behavior of `StrSplit()`; 321 // however, it might be useful at some call sites to make the intent explicit. 
322 // 323 // Example: 324 // 325 // std::vector<std::string> v = absl::StrSplit(" a , ,,b,", ',', AllowEmpty()); 326 // 327 // // v[0] == " a ", v[1] == " ", v[2] == "", v[3] = "b", v[4] == "" 328 struct AllowEmpty { 329 bool operator()(absl::string_view) const { return true; } 330 }; 331 332 // SkipEmpty() 333 // 334 // Returns `false` if the given `absl::string_view` is empty, indicating that 335 // `StrSplit()` should omit the empty string. 336 // 337 // Example: 338 // 339 // std::vector<std::string> v = absl::StrSplit(",a,,b,", ',', SkipEmpty()); 340 // 341 // // v[0] == "a", v[1] == "b" 342 // 343 // Note: `SkipEmpty()` does not consider a string containing only whitespace 344 // to be empty. To skip such whitespace as well, use the `SkipWhitespace()` 345 // predicate. 346 struct SkipEmpty { 347 bool operator()(absl::string_view sp) const { return !sp.empty(); } 348 }; 349 350 // SkipWhitespace() 351 // 352 // Returns `false` if the given `absl::string_view` is empty *or* contains only 353 // whitespace, indicating that `StrSplit()` should omit the string. 354 // 355 // Example: 356 // 357 // std::vector<std::string> v = absl::StrSplit(" a , ,,b,", 358 // ',', SkipWhitespace()); 359 // // v[0] == " a ", v[1] == "b" 360 // 361 // // SkipEmpty() would return whitespace elements 362 // std::vector<std::string> v = absl::StrSplit(" a , ,,b,", ',', SkipEmpty()); 363 // // v[0] == " a ", v[1] == " ", v[2] == "b" 364 struct SkipWhitespace { 365 bool operator()(absl::string_view sp) const { 366 sp = absl::StripAsciiWhitespace(sp); 367 return !sp.empty(); 368 } 369 }; 370 371 //------------------------------------------------------------------------------ 372 // StrSplit() 373 //------------------------------------------------------------------------------ 374 375 // StrSplit() 376 // 377 // Splits a given string based on the provided `Delimiter` object, returning the 378 // elements within the type specified by the caller. 
// Optionally, you may pass a `Predicate` to `StrSplit()` indicating whether to
// include or exclude the resulting element within the final result set. (See
// the overviews for Delimiters and Predicates above.)
//
// Example:
//
//   std::vector<std::string> v = absl::StrSplit("a,b,c,d", ',');
//   // v[0] == "a", v[1] == "b", v[2] == "c", v[3] == "d"
//
// You can also provide an explicit `Delimiter` object:
//
// Example:
//
//   using absl::ByAnyChar;
//   std::vector<std::string> v = absl::StrSplit("a,b=c", ByAnyChar(",="));
//   // v[0] == "a", v[1] == "b", v[2] == "c"
//
// See above for more information on delimiters.
//
// By default, empty strings are included in the result set. You can optionally
// include a third `Predicate` argument to apply a test for whether the
// resultant element should be included in the result set:
//
// Example:
//
//   std::vector<std::string> v = absl::StrSplit(" a , ,,b,",
//                                               ',', SkipWhitespace());
//   // v[0] == " a ", v[1] == "b"
//
// See above for more information on predicates.
//
//------------------------------------------------------------------------------
// StrSplit() Return Types
//------------------------------------------------------------------------------
//
// The `StrSplit()` function adapts the returned collection to the collection
// specified by the caller (e.g. `std::vector` above). The returned collections
// may contain `std::string`, `absl::string_view` (in which case the original
// string being split must ensure that it outlives the collection), or any
// object that can be explicitly created from an `absl::string_view`.
// This behavior works for:
//
// 1) All standard STL containers including `std::vector`, `std::list`,
//    `std::deque`, `std::set`, `std::multiset`, `std::map`, and
//    `std::multimap`
// 2) `std::pair` (which is not actually a container). See below.
//
// Example:
//
//   // The results are returned as `absl::string_view` objects. Note that we
//   // have to ensure that the input string outlives any results.
//   std::vector<absl::string_view> v = absl::StrSplit("a,b,c", ',');
//
//   // Stores results in a std::set<std::string>, which also performs
//   // de-duplication and orders the elements in ascending order.
//   std::set<std::string> a = absl::StrSplit("b,a,c,a,b", ',');
//   // a == {"a", "b", "c"}
//
//   // `StrSplit()` can be used within a range-based for loop, in which case
//   // each element will be of type `absl::string_view`.
//   std::vector<std::string> v;
//   for (const auto sv : absl::StrSplit("a,b,c", ',')) {
//     if (sv != "b") v.emplace_back(sv);
//   }
//   // v[0] == "a", v[1] == "c"
//
//   // Stores results in a map. The map implementation assumes that the input
//   // is provided as a series of key/value pairs. For example, the 0th element
//   // resulting from the split will be stored as a key to the 1st element. If
//   // an odd number of elements are resolved, the last element is paired with
//   // a default-constructed value (e.g., empty string).
//   std::map<std::string, std::string> m = absl::StrSplit("a,b,c", ',');
//   // m["a"] == "b", m["c"] == ""     // last component value equals ""
//
// Splitting to `std::pair` is an interesting case because it can hold only two
// elements and is not a collection type. When splitting to a `std::pair` the
// first two split strings become the `std::pair` `.first` and `.second`
// members, respectively. The remaining split substrings are discarded.
// If there are fewer than two split substrings, the empty string is used for
// the corresponding `std::pair` member.
//
// Example:
//
//   // Stores first two split strings as the members in a std::pair.
//   std::pair<std::string, std::string> p = absl::StrSplit("a,b,c", ',');
//   // p.first == "a", p.second == "b"       // "c" is omitted.
//
// The `StrSplit()` function can be used multiple times to perform more
// complicated splitting logic, such as intelligently parsing key-value pairs.
//
// Example:
//
//   // The input string "a=b=c,d=e,f=,g" becomes
//   // { "a" => "b=c", "d" => "e", "f" => "", "g" => "" }
//   std::map<std::string, std::string> m;
//   for (absl::string_view sp : absl::StrSplit("a=b=c,d=e,f=,g", ',')) {
//     m.insert(absl::StrSplit(sp, absl::MaxSplits('=', 1)));
//   }
//   EXPECT_EQ("b=c", m.find("a")->second);
//   EXPECT_EQ("e", m.find("d")->second);
//   EXPECT_EQ("", m.find("f")->second);
//   EXPECT_EQ("", m.find("g")->second);
//
// WARNING: Due to a legacy bug that is maintained for backward compatibility,
// splitting the following empty string_views produces different results:
//
//   absl::StrSplit(absl::string_view(""), '-');  // {""}
//   absl::StrSplit(absl::string_view(), '-');    // {}, but should be {""}
//
// Try not to depend on this distinction because the bug may one day be fixed.
489 template <typename Delimiter> 490 strings_internal::Splitter< 491 typename strings_internal::SelectDelimiter<Delimiter>::type, AllowEmpty> 492 StrSplit(strings_internal::ConvertibleToStringView text, Delimiter d) { 493 using DelimiterType = 494 typename strings_internal::SelectDelimiter<Delimiter>::type; 495 return strings_internal::Splitter<DelimiterType, AllowEmpty>( 496 std::move(text), DelimiterType(d), AllowEmpty()); 497 } 498 499 template <typename Delimiter, typename Predicate> 500 strings_internal::Splitter< 501 typename strings_internal::SelectDelimiter<Delimiter>::type, Predicate> 502 StrSplit(strings_internal::ConvertibleToStringView text, Delimiter d, 503 Predicate p) { 504 using DelimiterType = 505 typename strings_internal::SelectDelimiter<Delimiter>::type; 506 return strings_internal::Splitter<DelimiterType, Predicate>( 507 std::move(text), DelimiterType(d), std::move(p)); 508 } 509 510 ABSL_NAMESPACE_END 511 } // namespace absl 512 513 #endif // ABSL_STRINGS_STR_SPLIT_H_ 514