1 // Copyright 2006-2008 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "base/strings/string_tokenizer.h"
6
7 #include "testing/gmock/include/gmock/gmock.h"
8 #include "testing/gtest/include/gtest/gtest.h"
9
10 using std::string;
11 using testing::Eq;
12 using testing::Optional;
13
14 namespace base {
15
16 namespace {
17
// Splits a space-separated sentence with default options and checks that each
// word comes back, in order, as a non-delimiter token.
TEST(StringTokenizerTest, Simple) {
  const string sentence = "this is a test";
  StringTokenizer tokenizer(sentence, " ");

  // Before any token has been returned, the start of the string is considered
  // a delimiter.
  EXPECT_TRUE(tokenizer.token_is_delim());

  const char* const kExpectedWords[] = {"this", "is", "a", "test"};
  for (const char* word : kExpectedWords) {
    // The loop body is shared, so tag failures with the word being checked.
    SCOPED_TRACE(word);
    EXPECT_TRUE(tokenizer.GetNext());
    EXPECT_FALSE(tokenizer.token_is_delim());
    EXPECT_EQ(word, tokenizer.token());
  }

  EXPECT_FALSE(tokenizer.GetNext());
  // The end of the string, after the last token, is considered a delimiter.
  EXPECT_TRUE(tokenizer.token_is_delim());
}
45
// Same scenario as a plain space split, but driven through GetNextTokenView(),
// which yields an engaged optional per token and std::nullopt once exhausted.
TEST(StringTokenizerTest, SimpleUsingTokenView) {
  const string sentence = "this is a test";
  StringTokenizer tokenizer(sentence, " ");

  // Before any token has been returned, the start of the string is considered
  // a delimiter.
  EXPECT_TRUE(tokenizer.token_is_delim());

  const char* const kExpectedWords[] = {"this", "is", "a", "test"};
  for (const char* word : kExpectedWords) {
    // The loop body is shared, so tag failures with the word being checked.
    SCOPED_TRACE(word);
    EXPECT_THAT(tokenizer.GetNextTokenView(), Optional(Eq(word)));
    EXPECT_FALSE(tokenizer.token_is_delim());
  }

  EXPECT_THAT(tokenizer.GetNextTokenView(), Eq(std::nullopt));
  // The end of the string, after the last token, is considered a delimiter.
  EXPECT_TRUE(tokenizer.token_is_delim());
}
69
// Tokenizes the same input twice with one tokenizer, calling Reset() after
// each pass, and expects both passes to observe identical results.
TEST(StringTokenizerTest, Reset) {
  const string sentence = "this is a test";
  StringTokenizer tokenizer(sentence, " ");
  const char* const kExpectedWords[] = {"this", "is", "a", "test"};

  for (int pass = 0; pass < 2; ++pass) {
    SCOPED_TRACE(testing::Message() << "pass " << pass);

    // At the start of every pass the tokenizer reports a delimiter.
    EXPECT_TRUE(tokenizer.token_is_delim());

    for (const char* word : kExpectedWords) {
      SCOPED_TRACE(word);
      EXPECT_TRUE(tokenizer.GetNext());
      EXPECT_FALSE(tokenizer.token_is_delim());
      EXPECT_EQ(word, tokenizer.token());
    }

    // Exhausted: no further token, and the end counts as a delimiter.
    EXPECT_FALSE(tokenizer.GetNext());
    EXPECT_TRUE(tokenizer.token_is_delim());

    tokenizer.Reset();
  }
}
99
// With RETURN_DELIMS set, each separating space is returned as its own token
// (flagged by token_is_delim()) between the words.
TEST(StringTokenizerTest, RetDelims) {
  struct Step {
    const char* token;
    bool is_delim;
  };
  const Step kSteps[] = {
      {"this", false}, {" ", true}, {"is", false},   {" ", true},
      {"a", false},    {" ", true}, {"test", false},
  };

  const string sentence = "this is a test";
  StringTokenizer tokenizer(sentence, " ");
  tokenizer.set_options(StringTokenizer::RETURN_DELIMS);
  EXPECT_TRUE(tokenizer.token_is_delim());

  int step_index = 0;
  for (const Step& step : kSteps) {
    // Tokens repeat, so identify the failing step by position.
    SCOPED_TRACE(testing::Message() << "step " << step_index++);
    EXPECT_TRUE(tokenizer.GetNext());
    EXPECT_EQ(step.is_delim, tokenizer.token_is_delim());
    EXPECT_EQ(step.token, tokenizer.token());
  }

  EXPECT_FALSE(tokenizer.GetNext());
  EXPECT_TRUE(tokenizer.token_is_delim());
}
137
// With RETURN_EMPTY_TOKENS set, consecutive commas produce empty tokens, while
// the comma inside the single-quoted section does not split the first token.
TEST(StringTokenizerTest, RetEmptyTokens) {
  const string list = "foo='a, b',,bar,,baz,quux";
  StringTokenizer tokenizer(list, ",");
  tokenizer.set_options(StringTokenizer::RETURN_EMPTY_TOKENS);
  tokenizer.set_quote_chars("'");

  const char* const kExpectedTokens[] = {"foo='a, b'", "",    "bar",
                                         "",           "baz", "quux"};
  int token_index = 0;
  for (const char* expected : kExpectedTokens) {
    // Empty tokens repeat, so identify the failing step by position.
    SCOPED_TRACE(testing::Message() << "token " << token_index++);
    ASSERT_TRUE(tokenizer.GetNext());
    EXPECT_EQ(expected, tokenizer.token());
  }

  EXPECT_FALSE(tokenizer.GetNext());
}
164
// A leading delimiter yields an empty first token when RETURN_EMPTY_TOKENS is
// set.
TEST(StringTokenizerTest, RetEmptyTokens_AtStart) {
  const string list = ",bar";
  StringTokenizer tokenizer(list, ",");
  tokenizer.set_options(StringTokenizer::RETURN_EMPTY_TOKENS);
  tokenizer.set_quote_chars("'");

  const char* const kExpectedTokens[] = {"", "bar"};
  int token_index = 0;
  for (const char* expected : kExpectedTokens) {
    SCOPED_TRACE(testing::Message() << "token " << token_index++);
    ASSERT_TRUE(tokenizer.GetNext());
    EXPECT_EQ(expected, tokenizer.token());
  }

  EXPECT_FALSE(tokenizer.GetNext());
}
179
// A trailing delimiter yields an empty final token when RETURN_EMPTY_TOKENS is
// set.
TEST(StringTokenizerTest, RetEmptyTokens_AtEnd) {
  const string list = "bar,";
  StringTokenizer tokenizer(list, ",");
  tokenizer.set_options(StringTokenizer::RETURN_EMPTY_TOKENS);
  tokenizer.set_quote_chars("'");

  const char* const kExpectedTokens[] = {"bar", ""};
  int token_index = 0;
  for (const char* expected : kExpectedTokens) {
    SCOPED_TRACE(testing::Message() << "token " << token_index++);
    ASSERT_TRUE(tokenizer.GetNext());
    EXPECT_EQ(expected, tokenizer.token());
  }

  EXPECT_FALSE(tokenizer.GetNext());
}
194
// An input consisting of a single delimiter yields two empty tokens (one on
// each side of it) when RETURN_EMPTY_TOKENS is set.
TEST(StringTokenizerTest, RetEmptyTokens_Both) {
  const string list = ",";
  StringTokenizer tokenizer(list, ",");
  tokenizer.set_options(StringTokenizer::RETURN_EMPTY_TOKENS);
  tokenizer.set_quote_chars("'");

  const char* const kExpectedTokens[] = {"", ""};
  int token_index = 0;
  for (const char* expected : kExpectedTokens) {
    SCOPED_TRACE(testing::Message() << "token " << token_index++);
    ASSERT_TRUE(tokenizer.GetNext());
    EXPECT_EQ(expected, tokenizer.token());
  }

  EXPECT_FALSE(tokenizer.GetNext());
}
209
// An empty input yields exactly one empty token when RETURN_EMPTY_TOKENS is
// set.
TEST(StringTokenizerTest, RetEmptyTokens_Empty) {
  const string empty_input = "";
  StringTokenizer tokenizer(empty_input, ",");
  tokenizer.set_options(StringTokenizer::RETURN_EMPTY_TOKENS);

  // The single (empty) token.
  ASSERT_TRUE(tokenizer.GetNext());
  EXPECT_EQ("", tokenizer.token());

  // Nothing further after it.
  EXPECT_FALSE(tokenizer.GetNext());
}
220
// Combines RETURN_DELIMS and RETURN_EMPTY_TOKENS: every comma outside the
// quoted section is returned as a token, and adjacent commas additionally
// produce an empty token between them.
TEST(StringTokenizerTest, RetDelimsAndEmptyTokens) {
  const string list = "foo='a, b',,bar,,baz,quux";
  StringTokenizer tokenizer(list, ",");
  tokenizer.set_options(StringTokenizer::RETURN_DELIMS |
                        StringTokenizer::RETURN_EMPTY_TOKENS);
  tokenizer.set_quote_chars("'");

  const char* const kExpectedTokens[] = {
      "foo='a, b'", ",", "", ",", "bar", ",", "", ",", "baz", ",", "quux",
  };
  int token_index = 0;
  for (const char* expected : kExpectedTokens) {
    // Tokens repeat, so identify the failing step by position.
    SCOPED_TRACE(testing::Message() << "token " << token_index++);
    ASSERT_TRUE(tokenizer.GetNext());
    EXPECT_EQ(expected, tokenizer.token());
  }

  EXPECT_FALSE(tokenizer.GetNext());
}
263
// Uses a delimiter set of several characters (':', ' ', ',', '-'); runs of
// mixed delimiters separate the words without producing extra tokens.
TEST(StringTokenizerTest, ManyDelims) {
  const string text = "this: is, a-test";
  StringTokenizer tokenizer(text, ": ,-");

  const char* const kExpectedWords[] = {"this", "is", "a", "test"};
  for (const char* word : kExpectedWords) {
    SCOPED_TRACE(word);
    EXPECT_TRUE(tokenizer.GetNext());
    EXPECT_EQ(word, tokenizer.token());
  }

  EXPECT_FALSE(tokenizer.GetNext());
}
282
// Tokenizes an HTTP-style header line with RETURN_DELIMS, checking that every
// delimiter character is returned individually and flagged as a delimiter,
// while the values between them are flagged as regular tokens.
TEST(StringTokenizerTest, ParseHeader) {
  struct Step {
    const char* token;
    bool is_delim;
  };
  const Step kSteps[] = {
      {"Content-Type", false}, {":", true},        {" ", true},
      {"text/html", false},    {" ", true},        {";", true},
      {" ", true},             {"charset", false}, {"=", true},
      {"UTF-8", false},
  };

  const string header = "Content-Type: text/html ; charset=UTF-8";
  StringTokenizer tokenizer(header, ": ;=");
  tokenizer.set_options(StringTokenizer::RETURN_DELIMS);
  EXPECT_TRUE(tokenizer.token_is_delim());

  int step_index = 0;
  for (const Step& step : kSteps) {
    // Tokens repeat, so identify the failing step by position.
    SCOPED_TRACE(testing::Message() << "step " << step_index++);
    EXPECT_TRUE(tokenizer.GetNext());
    EXPECT_EQ(step.is_delim, tokenizer.token_is_delim());
    EXPECT_EQ(step.token, tokenizer.token());
  }

  EXPECT_FALSE(tokenizer.GetNext());
  EXPECT_TRUE(tokenizer.token_is_delim());
}
332
// With a quote character configured, a delimiter inside a quoted section does
// not split it; the token is returned with its quotes intact.
TEST(StringTokenizerTest, ParseQuotedString) {
  const string text = "foo bar 'hello world' baz";
  StringTokenizer tokenizer(text, " ");
  tokenizer.set_quote_chars("'");

  const char* const kExpectedTokens[] = {"foo", "bar", "'hello world'", "baz"};
  for (const char* expected : kExpectedTokens) {
    SCOPED_TRACE(expected);
    EXPECT_TRUE(tokenizer.GetNext());
    EXPECT_EQ(expected, tokenizer.token());
  }

  EXPECT_FALSE(tokenizer.GetNext());
}
352
// An unterminated quote swallows the rest of the input into a single token.
TEST(StringTokenizerTest, ParseQuotedString_Malformed) {
  const string text = "bar 'hello wo";
  StringTokenizer tokenizer(text, " ");
  tokenizer.set_quote_chars("'");

  const char* const kExpectedTokens[] = {"bar", "'hello wo"};
  for (const char* expected : kExpectedTokens) {
    SCOPED_TRACE(expected);
    EXPECT_TRUE(tokenizer.GetNext());
    EXPECT_EQ(expected, tokenizer.token());
  }

  EXPECT_FALSE(tokenizer.GetNext());
}
366
// Two quote characters are configured (' and "). Double quotes nested inside a
// single-quoted section do not end it, and the final token "baz\"" with an
// unmatched double quote is still returned.
TEST(StringTokenizerTest, ParseQuotedString_Multiple) {
  const string text = "bar 'hel\"lo\" wo' baz\"";
  StringTokenizer tokenizer(text, " ");
  tokenizer.set_quote_chars("'\"");

  const char* const kExpectedTokens[] = {"bar", "'hel\"lo\" wo'", "baz\""};
  for (const char* expected : kExpectedTokens) {
    SCOPED_TRACE(expected);
    EXPECT_TRUE(tokenizer.GetNext());
    EXPECT_EQ(expected, tokenizer.token());
  }

  EXPECT_FALSE(tokenizer.GetNext());
}
383
// A backslash-escaped quote inside a quoted section does not terminate it; the
// token is returned with the backslash still present.
TEST(StringTokenizerTest, ParseQuotedString_EscapedQuotes) {
  const string text = "foo 'don\\'t do that'";
  StringTokenizer tokenizer(text, " ");
  tokenizer.set_quote_chars("'");

  const char* const kExpectedTokens[] = {"foo", "'don\\'t do that'"};
  for (const char* expected : kExpectedTokens) {
    SCOPED_TRACE(expected);
    EXPECT_TRUE(tokenizer.GetNext());
    EXPECT_EQ(expected, tokenizer.token());
  }

  EXPECT_FALSE(tokenizer.GetNext());
}
397
// Note: despite the test name, this input contains no escaped quotes. It
// checks that, with two delimiter characters (',' and ' '), delimiters inside
// a quoted section embedded mid-token are not split on.
TEST(StringTokenizerTest, ParseQuotedString_EscapedQuotes2) {
  const string text = "foo='a, b', bar";
  StringTokenizer tokenizer(text, ", ");
  tokenizer.set_quote_chars("'");

  const char* const kExpectedTokens[] = {"foo='a, b'", "bar"};
  for (const char* expected : kExpectedTokens) {
    SCOPED_TRACE(expected);
    EXPECT_TRUE(tokenizer.GetNext());
    EXPECT_EQ(expected, tokenizer.token());
  }

  EXPECT_FALSE(tokenizer.GetNext());
}
411
// With WhitespacePolicy::kSkipOver, the tabs, spaces and CR/LF surrounding
// each comma-separated value are not part of the returned tokens.
TEST(StringTokenizerTest, ParseWithWhitespace_NoQuotes) {
  const string text = "\t\t\t foo=a,\r\n b,\r\n\t\t\t bar\t ";
  StringTokenizer tokenizer(text, ",",
                            StringTokenizer::WhitespacePolicy::kSkipOver);

  const char* const kExpectedTokens[] = {"foo=a", "b", "bar"};
  for (const char* expected : kExpectedTokens) {
    SCOPED_TRACE(expected);
    EXPECT_TRUE(tokenizer.GetNext());
    EXPECT_EQ(expected, tokenizer.token());
  }

  EXPECT_FALSE(tokenizer.GetNext());
}
427
// With WhitespacePolicy::kSkipOver and a quote character configured,
// whitespace around tokens is dropped while the space (and comma) inside the
// quoted section is kept.
TEST(StringTokenizerTest, ParseWithWhitespace_Quotes) {
  const string text = "\t\t\t foo='a, b',\t\t\t bar\t ";
  StringTokenizer tokenizer(text, ",",
                            StringTokenizer::WhitespacePolicy::kSkipOver);
  tokenizer.set_quote_chars("'");

  const char* const kExpectedTokens[] = {"foo='a, b'", "bar"};
  for (const char* expected : kExpectedTokens) {
    SCOPED_TRACE(expected);
    EXPECT_TRUE(tokenizer.GetNext());
    EXPECT_EQ(expected, tokenizer.token());
  }

  EXPECT_FALSE(tokenizer.GetNext());
}
441
442 } // namespace
443
444 } // namespace base
445