• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 //
2 // Copyright 2017 The Abseil Authors.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 //      https://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 // -----------------------------------------------------------------------------
17 // File: str_split.h
18 // -----------------------------------------------------------------------------
19 //
20 // This file contains functions for splitting strings. It defines the main
21 // `StrSplit()` function, several delimiters for determining the boundaries on
22 // which to split the string, and predicates for filtering delimited results.
23 // `StrSplit()` adapts the returned collection to the type specified by the
24 // caller.
25 //
26 // Example:
27 //
28 //   // Splits the given string on commas. Returns the results in a
29 //   // vector of strings.
30 //   std::vector<std::string> v = absl::StrSplit("a,b,c", ',');
31 //   // Can also use ","
32 //   // v[0] == "a", v[1] == "b", v[2] == "c"
33 //
34 // See StrSplit() below for more information.
35 #ifndef ABSL_STRINGS_STR_SPLIT_H_
36 #define ABSL_STRINGS_STR_SPLIT_H_
37 
38 #include <algorithm>
39 #include <cstddef>
40 #include <map>
41 #include <set>
42 #include <string>
43 #include <utility>
44 #include <vector>
45 
46 #include "absl/base/internal/raw_logging.h"
47 #include "absl/strings/internal/str_split_internal.h"
48 #include "absl/strings/string_view.h"
49 #include "absl/strings/strip.h"
50 
51 namespace absl {
52 ABSL_NAMESPACE_BEGIN
53 
54 //------------------------------------------------------------------------------
55 // Delimiters
56 //------------------------------------------------------------------------------
57 //
58 // `StrSplit()` uses delimiters to define the boundaries between elements in the
59 // provided input. Several `Delimiter` types are defined below. If a string
60 // (`const char*`, `std::string`, or `absl::string_view`) is passed in place of
61 // an explicit `Delimiter` object, `StrSplit()` treats it the same way as if it
62 // were passed a `ByString` delimiter.
63 //
64 // A `Delimiter` is an object with a `Find()` function that knows how to find
65 // the first occurrence of itself in a given `absl::string_view`.
66 //
67 // The following `Delimiter` types are available for use within `StrSplit()`:
68 //
69 //   - `ByString` (default for string arguments)
70 //   - `ByChar` (default for a char argument)
71 //   - `ByAnyChar`
72 //   - `ByLength`
73 //   - `MaxSplits`
74 //
75 // A Delimiter's `Find()` member function will be passed an input `text` that is
76 // to be split and a position (`pos`) to begin searching for the next delimiter
77 // in `text`. The returned absl::string_view should refer to the next occurrence
78 // (after `pos`) of the represented delimiter; this returned absl::string_view
79 // represents the next location where the input `text` should be broken.
80 //
81 // The returned absl::string_view may be zero-length if the Delimiter does not
82 // represent a part of the string (e.g., a fixed-length delimiter). If no
83 // delimiter is found in the input `text`, a zero-length absl::string_view
84 // referring to `text.end()` should be returned (e.g.,
85 // `text.substr(text.size())`). It is important that the returned
86 // absl::string_view always be within the bounds of the input `text` given as an
87 // argument--it must not refer to a string that is physically located outside of
88 // the given string.
89 //
90 // The following example is a simple Delimiter object that is created with a
91 // single char and will look for that char in the text passed to the `Find()`
92 // function:
93 //
94 //   struct SimpleDelimiter {
95 //     const char c_;
96 //     explicit SimpleDelimiter(char c) : c_(c) {}
97 //     absl::string_view Find(absl::string_view text, size_t pos) {
98 //       auto found = text.find(c_, pos);
99 //       if (found == absl::string_view::npos)
100 //         return text.substr(text.size());
101 //
102 //       return text.substr(found, 1);
103 //     }
104 //   };
105 
106 // ByString
107 //
108 // A sub-string delimiter. If `StrSplit()` is passed a string in place of a
109 // `Delimiter` object, the string will be implicitly converted into a
110 // `ByString` delimiter.
111 //
112 // Example:
113 //
114 //   // Because a string literal is converted to an `absl::ByString`,
115 //   // the following two splits are equivalent.
116 //
117 //   std::vector<std::string> v1 = absl::StrSplit("a, b, c", ", ");
118 //
119 //   using absl::ByString;
120 //   std::vector<std::string> v2 = absl::StrSplit("a, b, c",
121 //                                                ByString(", "));
//   // v1 == v2; v1[0] == "a", v1[1] == "b", v1[2] == "c"
123 class ByString {
124  public:
125   explicit ByString(absl::string_view sp);
126   absl::string_view Find(absl::string_view text, size_t pos) const;
127 
128  private:
129   const std::string delimiter_;
130 };
131 
132 // ByChar
133 //
134 // A single character delimiter. `ByChar` is functionally equivalent to a
135 // 1-char string within a `ByString` delimiter, but slightly more efficient.
136 //
137 // Example:
138 //
//   // Because a char literal is converted to an `absl::ByChar`,
//   // the following two splits are equivalent.
//   std::vector<std::string> v1 = absl::StrSplit("a,b,c", ',');
//   using absl::ByChar;
//   std::vector<std::string> v2 = absl::StrSplit("a,b,c", ByChar(','));
//   // v1 == v2; v1[0] == "a", v1[1] == "b", v1[2] == "c"
145 //
146 // `ByChar` is also the default delimiter if a single character is given
147 // as the delimiter to `StrSplit()`. For example, the following calls are
148 // equivalent:
149 //
150 //   std::vector<std::string> v = absl::StrSplit("a-b", '-');
151 //
152 //   using absl::ByChar;
153 //   std::vector<std::string> v = absl::StrSplit("a-b", ByChar('-'));
154 //
155 class ByChar {
156  public:
ByChar(char c)157   explicit ByChar(char c) : c_(c) {}
158   absl::string_view Find(absl::string_view text, size_t pos) const;
159 
160  private:
161   char c_;
162 };
163 
164 // ByAnyChar
165 //
166 // A delimiter that will match any of the given byte-sized characters within
167 // its provided string.
168 //
169 // Note: this delimiter works with single-byte string data, but does not work
170 // with variable-width encodings, such as UTF-8.
171 //
172 // Example:
173 //
174 //   using absl::ByAnyChar;
175 //   std::vector<std::string> v = absl::StrSplit("a,b=c", ByAnyChar(",="));
176 //   // v[0] == "a", v[1] == "b", v[2] == "c"
177 //
178 // If `ByAnyChar` is given the empty string, it behaves exactly like
179 // `ByString` and matches each individual character in the input string.
180 //
181 class ByAnyChar {
182  public:
183   explicit ByAnyChar(absl::string_view sp);
184   absl::string_view Find(absl::string_view text, size_t pos) const;
185 
186  private:
187   const std::string delimiters_;
188 };
189 
190 // ByLength
191 //
192 // A delimiter for splitting into equal-length strings. The length argument to
193 // the constructor must be greater than 0.
194 //
195 // Note: this delimiter works with single-byte string data, but does not work
196 // with variable-width encodings, such as UTF-8.
197 //
198 // Example:
199 //
200 //   using absl::ByLength;
201 //   std::vector<std::string> v = absl::StrSplit("123456789", ByLength(3));
//
203 //   // v[0] == "123", v[1] == "456", v[2] == "789"
204 //
205 // Note that the string does not have to be a multiple of the fixed split
206 // length. In such a case, the last substring will be shorter.
207 //
208 //   using absl::ByLength;
209 //   std::vector<std::string> v = absl::StrSplit("12345", ByLength(2));
210 //
211 //   // v[0] == "12", v[1] == "34", v[2] == "5"
212 class ByLength {
213  public:
214   explicit ByLength(ptrdiff_t length);
215   absl::string_view Find(absl::string_view text, size_t pos) const;
216 
217  private:
218   const ptrdiff_t length_;
219 };
220 
221 namespace strings_internal {
222 
// A traits-like metafunction for selecting the default Delimiter object type
// for a particular Delimiter type. The base case simply exposes type Delimiter
// itself as the delimiter's Type. However, there are specializations that map
// a `char` to the ByChar delimiter object and string-like objects to the
// ByString delimiter object. This allows functions like absl::StrSplit() and
// absl::MaxSplits() to accept a char (e.g., ',') or a string-like object
// (e.g., ",") as the delimiter argument and treat it as if the corresponding
// ByChar or ByString delimiter was given.
// Primary template: a type without a dedicated specialization is used as its
// own delimiter type.
template <class Delimiter>
struct SelectDelimiter {
  using type = Delimiter;
};
234 
235 template <>
236 struct SelectDelimiter<char> {
237   using type = ByChar;
238 };
239 template <>
240 struct SelectDelimiter<char*> {
241   using type = ByString;
242 };
243 template <>
244 struct SelectDelimiter<const char*> {
245   using type = ByString;
246 };
247 template <>
248 struct SelectDelimiter<absl::string_view> {
249   using type = ByString;
250 };
251 template <>
252 struct SelectDelimiter<std::string> {
253   using type = ByString;
254 };
255 
256 // Wraps another delimiter and sets a max number of matches for that delimiter.
257 template <typename Delimiter>
258 class MaxSplitsImpl {
259  public:
260   MaxSplitsImpl(Delimiter delimiter, int limit)
261       : delimiter_(delimiter), limit_(limit), count_(0) {}
262   absl::string_view Find(absl::string_view text, size_t pos) {
263     if (count_++ == limit_) {
264       return absl::string_view(text.data() + text.size(),
265                                0);  // No more matches.
266     }
267     return delimiter_.Find(text, pos);
268   }
269 
270  private:
271   Delimiter delimiter_;
272   const int limit_;
273   int count_;
274 };
275 
276 }  // namespace strings_internal
277 
278 // MaxSplits()
279 //
280 // A delimiter that limits the number of matches which can occur to the passed
281 // `limit`. The last element in the returned collection will contain all
282 // remaining unsplit pieces, which may contain instances of the delimiter.
// The collection will contain at most `limit` + 1 elements.
//
// Example:
285 //
286 //   using absl::MaxSplits;
287 //   std::vector<std::string> v = absl::StrSplit("a,b,c", MaxSplits(',', 1));
288 //
289 //   // v[0] == "a", v[1] == "b,c"
290 template <typename Delimiter>
291 inline strings_internal::MaxSplitsImpl<
292     typename strings_internal::SelectDelimiter<Delimiter>::type>
293 MaxSplits(Delimiter delimiter, int limit) {
294   typedef
295       typename strings_internal::SelectDelimiter<Delimiter>::type DelimiterType;
296   return strings_internal::MaxSplitsImpl<DelimiterType>(
297       DelimiterType(delimiter), limit);
298 }
299 
300 //------------------------------------------------------------------------------
301 // Predicates
302 //------------------------------------------------------------------------------
303 //
304 // Predicates filter the results of a `StrSplit()` by determining whether or not
305 // a resultant element is included in the result set. A predicate may be passed
306 // as an optional third argument to the `StrSplit()` function.
307 //
308 // Predicates are unary functions (or functors) that take a single
309 // `absl::string_view` argument and return a bool indicating whether the
310 // argument should be included (`true`) or excluded (`false`).
311 //
312 // Predicates are useful when filtering out empty substrings. By default, empty
313 // substrings may be returned by `StrSplit()`, which is similar to the way split
314 // functions work in other programming languages.
315 
316 // AllowEmpty()
317 //
318 // Always returns `true`, indicating that all strings--including empty
319 // strings--should be included in the split output. This predicate is not
320 // strictly needed because this is the default behavior of `StrSplit()`;
321 // however, it might be useful at some call sites to make the intent explicit.
322 //
323 // Example:
324 //
325 //  std::vector<std::string> v = absl::StrSplit(" a , ,,b,", ',', AllowEmpty());
326 //
//  // v[0] == " a ", v[1] == " ", v[2] == "", v[3] == "b", v[4] == ""
328 struct AllowEmpty {
329   bool operator()(absl::string_view) const { return true; }
330 };
331 
332 // SkipEmpty()
333 //
334 // Returns `false` if the given `absl::string_view` is empty, indicating that
335 // `StrSplit()` should omit the empty string.
336 //
337 // Example:
338 //
339 //   std::vector<std::string> v = absl::StrSplit(",a,,b,", ',', SkipEmpty());
340 //
341 //   // v[0] == "a", v[1] == "b"
342 //
343 // Note: `SkipEmpty()` does not consider a string containing only whitespace
344 // to be empty. To skip such whitespace as well, use the `SkipWhitespace()`
345 // predicate.
346 struct SkipEmpty {
347   bool operator()(absl::string_view sp) const { return !sp.empty(); }
348 };
349 
350 // SkipWhitespace()
351 //
352 // Returns `false` if the given `absl::string_view` is empty *or* contains only
353 // whitespace, indicating that `StrSplit()` should omit the string.
354 //
355 // Example:
356 //
357 //   std::vector<std::string> v = absl::StrSplit(" a , ,,b,",
358 //                                               ',', SkipWhitespace());
359 //   // v[0] == " a ", v[1] == "b"
360 //
361 //   // SkipEmpty() would return whitespace elements
362 //   std::vector<std::string> v = absl::StrSplit(" a , ,,b,", ',', SkipEmpty());
363 //   // v[0] == " a ", v[1] == " ", v[2] == "b"
364 struct SkipWhitespace {
365   bool operator()(absl::string_view sp) const {
366     sp = absl::StripAsciiWhitespace(sp);
367     return !sp.empty();
368   }
369 };
370 
371 //------------------------------------------------------------------------------
372 //                                  StrSplit()
373 //------------------------------------------------------------------------------
374 
375 // StrSplit()
376 //
377 // Splits a given string based on the provided `Delimiter` object, returning the
378 // elements within the type specified by the caller. Optionally, you may pass a
379 // `Predicate` to `StrSplit()` indicating whether to include or exclude the
380 // resulting element within the final result set. (See the overviews for
381 // Delimiters and Predicates above.)
382 //
383 // Example:
384 //
385 //   std::vector<std::string> v = absl::StrSplit("a,b,c,d", ',');
386 //   // v[0] == "a", v[1] == "b", v[2] == "c", v[3] == "d"
387 //
388 // You can also provide an explicit `Delimiter` object:
389 //
390 // Example:
391 //
392 //   using absl::ByAnyChar;
393 //   std::vector<std::string> v = absl::StrSplit("a,b=c", ByAnyChar(",="));
394 //   // v[0] == "a", v[1] == "b", v[2] == "c"
395 //
396 // See above for more information on delimiters.
397 //
398 // By default, empty strings are included in the result set. You can optionally
399 // include a third `Predicate` argument to apply a test for whether the
400 // resultant element should be included in the result set:
401 //
402 // Example:
403 //
404 //   std::vector<std::string> v = absl::StrSplit(" a , ,,b,",
405 //                                               ',', SkipWhitespace());
406 //   // v[0] == " a ", v[1] == "b"
407 //
408 // See above for more information on predicates.
409 //
410 //------------------------------------------------------------------------------
411 // StrSplit() Return Types
412 //------------------------------------------------------------------------------
413 //
414 // The `StrSplit()` function adapts the returned collection to the collection
415 // specified by the caller (e.g. `std::vector` above). The returned collections
416 // may contain `std::string`, `absl::string_view` (in which case the original
417 // string being split must ensure that it outlives the collection), or any
418 // object that can be explicitly created from an `absl::string_view`. This
419 // behavior works for:
420 //
// 1) All standard STL containers including `std::vector`, `std::list`,
//    `std::deque`, `std::set`, `std::multiset`, `std::map`, and
//    `std::multimap`
423 // 2) `std::pair` (which is not actually a container). See below.
424 //
425 // Example:
426 //
427 //   // The results are returned as `absl::string_view` objects. Note that we
428 //   // have to ensure that the input string outlives any results.
429 //   std::vector<absl::string_view> v = absl::StrSplit("a,b,c", ',');
430 //
//   // Stores results in a std::set<std::string>, which also performs
//   // de-duplication and orders the elements in ascending order.
//   std::set<std::string> a = absl::StrSplit("b,a,c,a,b", ',');
//   // a contains "a", "b", and "c", in that order.
435 //
436 //   // `StrSplit()` can be used within a range-based for loop, in which case
437 //   // each element will be of type `absl::string_view`.
438 //   std::vector<std::string> v;
439 //   for (const auto sv : absl::StrSplit("a,b,c", ',')) {
440 //     if (sv != "b") v.emplace_back(sv);
441 //   }
442 //   // v[0] == "a", v[1] == "c"
443 //
444 //   // Stores results in a map. The map implementation assumes that the input
445 //   // is provided as a series of key/value pairs. For example, the 0th element
446 //   // resulting from the split will be stored as a key to the 1st element. If
447 //   // an odd number of elements are resolved, the last element is paired with
448 //   // a default-constructed value (e.g., empty string).
449 //   std::map<std::string, std::string> m = absl::StrSplit("a,b,c", ',');
450 //   // m["a"] == "b", m["c"] == ""     // last component value equals ""
451 //
// Splitting to `std::pair` is an interesting case because it can hold only two
// elements and is not a collection type. When splitting to a `std::pair` the
// first two split strings become the `std::pair` `.first` and `.second`
// members, respectively. The remaining split substrings are discarded. If there
// are fewer than two split substrings, the empty string is used for the
// corresponding `std::pair` member.
459 //
460 // Example:
461 //
462 //   // Stores first two split strings as the members in a std::pair.
463 //   std::pair<std::string, std::string> p = absl::StrSplit("a,b,c", ',');
464 //   // p.first == "a", p.second == "b"       // "c" is omitted.
465 //
466 // The `StrSplit()` function can be used multiple times to perform more
467 // complicated splitting logic, such as intelligently parsing key-value pairs.
468 //
469 // Example:
470 //
471 //   // The input string "a=b=c,d=e,f=,g" becomes
472 //   // { "a" => "b=c", "d" => "e", "f" => "", "g" => "" }
473 //   std::map<std::string, std::string> m;
474 //   for (absl::string_view sp : absl::StrSplit("a=b=c,d=e,f=,g", ',')) {
475 //     m.insert(absl::StrSplit(sp, absl::MaxSplits('=', 1)));
476 //   }
477 //   EXPECT_EQ("b=c", m.find("a")->second);
478 //   EXPECT_EQ("e", m.find("d")->second);
479 //   EXPECT_EQ("", m.find("f")->second);
480 //   EXPECT_EQ("", m.find("g")->second);
481 //
482 // WARNING: Due to a legacy bug that is maintained for backward compatibility,
483 // splitting the following empty string_views produces different results:
484 //
485 //   absl::StrSplit(absl::string_view(""), '-');  // {""}
486 //   absl::StrSplit(absl::string_view(), '-');    // {}, but should be {""}
487 //
488 // Try not to depend on this distinction because the bug may one day be fixed.
489 template <typename Delimiter>
490 strings_internal::Splitter<
491     typename strings_internal::SelectDelimiter<Delimiter>::type, AllowEmpty>
492 StrSplit(strings_internal::ConvertibleToStringView text, Delimiter d) {
493   using DelimiterType =
494       typename strings_internal::SelectDelimiter<Delimiter>::type;
495   return strings_internal::Splitter<DelimiterType, AllowEmpty>(
496       std::move(text), DelimiterType(d), AllowEmpty());
497 }
498 
499 template <typename Delimiter, typename Predicate>
500 strings_internal::Splitter<
501     typename strings_internal::SelectDelimiter<Delimiter>::type, Predicate>
502 StrSplit(strings_internal::ConvertibleToStringView text, Delimiter d,
503          Predicate p) {
504   using DelimiterType =
505       typename strings_internal::SelectDelimiter<Delimiter>::type;
506   return strings_internal::Splitter<DelimiterType, Predicate>(
507       std::move(text), DelimiterType(d), std::move(p));
508 }
509 
510 ABSL_NAMESPACE_END
511 }  // namespace absl
512 
513 #endif  // ABSL_STRINGS_STR_SPLIT_H_
514