1 // Protocol Buffers - Google's data interchange format
2 // Copyright 2008 Google Inc. All rights reserved.
3 //
4 // Use of this source code is governed by a BSD-style
5 // license that can be found in the LICENSE file or at
6 // https://developers.google.com/open-source/licenses/bsd
7
8 // Author: kenton@google.com (Kenton Varda)
9 // Based on original Protocol Buffers design by
10 // Sanjay Ghemawat, Jeff Dean, and others.
11
12 #include "google/protobuf/io/tokenizer.h"
13
14 #include <limits.h>
15 #include <math.h>
16
17 #include <vector>
18
19 #include "google/protobuf/stubs/common.h"
20 #include "absl/strings/escaping.h"
21 #include "absl/strings/substitute.h"
22 #include "google/protobuf/io/zero_copy_stream_impl.h"
23 #include "google/protobuf/testing/googletest.h"
24 #include <gtest/gtest.h>
25
26 namespace google {
27 namespace protobuf {
28 namespace io {
29 namespace {
30
31 // ===================================================================
32 // Data-Driven Test Infrastructure
33
34 // TODO: This is copied from coded_stream_unittest. This is
35 // temporary until these features are integrated into gTest itself.
36
37 // TEST_1D and TEST_2D are macros I'd eventually like to see added to
38 // gTest. These macros can be used to declare tests which should be
39 // run multiple times, once for each item in some input array. TEST_1D
40 // tests all cases in a single input array. TEST_2D tests all
41 // combinations of cases from two arrays. The arrays must be statically
42 // defined such that the ABSL_ARRAYSIZE() macro works on them. Example:
43 //
44 // int kCases[] = {1, 2, 3, 4}
45 // TEST_1D(MyFixture, MyTest, kCases) {
46 // EXPECT_GT(kCases_case, 0);
47 // }
48 //
49 // This test iterates through the numbers 1, 2, 3, and 4 and tests that
50 // they are all grater than zero. In case of failure, the exact case
51 // which failed will be printed. The case type must be printable using
52 // ostream::operator<<.
53
// Declares a data-driven test: a subclass of FIXTURE plus a TEST_F that
// invokes DoSingleCase() once per entry of the statically-sized array CASES.
// Inside the test body the current case is available as CASES##_case
// (e.g. kMyCases_case).  SCOPED_TRACE prints the failing case on error.
// NOTE: comments cannot go inside the macro body — a // comment would
// swallow the line-continuation backslash.
#define TEST_1D(FIXTURE, NAME, CASES)                             \
  class FIXTURE##_##NAME##_DD : public FIXTURE {                  \
   protected:                                                     \
    template <typename CaseType>                                  \
    void DoSingleCase(const CaseType& CASES##_case);              \
  };                                                              \
                                                                  \
  TEST_F(FIXTURE##_##NAME##_DD, NAME) {                           \
    for (int i = 0; i < ABSL_ARRAYSIZE(CASES); i++) {             \
      SCOPED_TRACE(testing::Message()                             \
                   << #CASES " case #" << i << ": " << CASES[i]); \
      DoSingleCase(CASES[i]);                                     \
    }                                                             \
  }                                                               \
                                                                  \
  template <typename CaseType>                                    \
  void FIXTURE##_##NAME##_DD::DoSingleCase(const CaseType& CASES##_case)
71
// Two-dimensional variant of TEST_1D: runs DoSingleCase() once for every
// pair drawn from CASES1 x CASES2 (e.g. every token case at every block
// size).  Both arrays must be statically sized so ABSL_ARRAYSIZE works.
#define TEST_2D(FIXTURE, NAME, CASES1, CASES2)                              \
  class FIXTURE##_##NAME##_DD : public FIXTURE {                            \
   protected:                                                               \
    template <typename CaseType1, typename CaseType2>                       \
    void DoSingleCase(const CaseType1& CASES1##_case,                       \
                      const CaseType2& CASES2##_case);                      \
  };                                                                        \
                                                                            \
  TEST_F(FIXTURE##_##NAME##_DD, NAME) {                                     \
    for (int i = 0; i < ABSL_ARRAYSIZE(CASES1); i++) {                      \
      for (int j = 0; j < ABSL_ARRAYSIZE(CASES2); j++) {                    \
        SCOPED_TRACE(testing::Message()                                     \
                     << #CASES1 " case #" << i << ": " << CASES1[i] << ", " \
                     << #CASES2 " case #" << j << ": " << CASES2[j]);       \
        DoSingleCase(CASES1[i], CASES2[j]);                                 \
      }                                                                     \
    }                                                                       \
  }                                                                         \
                                                                            \
  template <typename CaseType1, typename CaseType2>                         \
  void FIXTURE##_##NAME##_DD::DoSingleCase(const CaseType1& CASES1##_case,  \
                                           const CaseType2& CASES2##_case)
94
95 // -------------------------------------------------------------------
96
97 // An input stream that is basically like an ArrayInputStream but sometimes
98 // returns empty buffers, just to throw us off.
99 class TestInputStream : public ZeroCopyInputStream {
100 public:
TestInputStream(const void * data,int size,int block_size)101 TestInputStream(const void* data, int size, int block_size)
102 : array_stream_(data, size, block_size), counter_(0) {}
~TestInputStream()103 ~TestInputStream() {}
104
105 // implements ZeroCopyInputStream ----------------------------------
Next(const void ** data,int * size)106 bool Next(const void** data, int* size) override {
107 // We'll return empty buffers starting with the first buffer, and every
108 // 3 and 5 buffers after that.
109 if (counter_ % 3 == 0 || counter_ % 5 == 0) {
110 *data = nullptr;
111 *size = 0;
112 ++counter_;
113 return true;
114 } else {
115 ++counter_;
116 return array_stream_.Next(data, size);
117 }
118 }
119
BackUp(int count)120 void BackUp(int count) override { return array_stream_.BackUp(count); }
Skip(int count)121 bool Skip(int count) override { return array_stream_.Skip(count); }
ByteCount() const122 int64_t ByteCount() const override { return array_stream_.ByteCount(); }
123
124 private:
125 ArrayInputStream array_stream_;
126 int counter_;
127 };
128
129 // -------------------------------------------------------------------
130
131 // An error collector which simply concatenates all its errors into a big
132 // block of text which can be checked.
133 class TestErrorCollector : public ErrorCollector {
134 public:
TestErrorCollector()135 TestErrorCollector() {}
~TestErrorCollector()136 ~TestErrorCollector() {}
137
138 std::string text_;
139
140 // implements ErrorCollector ---------------------------------------
RecordError(int line,int column,absl::string_view message)141 void RecordError(int line, int column, absl::string_view message) override {
142 absl::SubstituteAndAppend(&text_, "$0:$1: $2\n", line, column, message);
143 }
144 };
145
146 // -------------------------------------------------------------------
147
// We test each operation over a variety of block sizes to ensure that
// we test cases where reads cross buffer boundaries as well as cases
// where they don't. This is sort of a brute-force approach to this,
// but it's easy to write and easy to understand.
const int kBlockSizes[] = {1, 2, 3, 5, 7, 13, 32, 1024};
153
154 class TokenizerTest : public testing::Test {
155 protected:
156 // For easy testing.
ParseInteger(const std::string & text)157 uint64_t ParseInteger(const std::string& text) {
158 uint64_t result;
159 EXPECT_TRUE(Tokenizer::ParseInteger(text, kuint64max, &result))
160 << "'" << text << "'";
161 return result;
162 }
163 };
164
165 // ===================================================================
166
167 // These tests causes gcc 3.3.5 (and earlier?) to give the cryptic error:
168 // "sorry, unimplemented: `method_call_expr' not supported by dump_expr"
169 #if !defined(__GNUC__) || __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 3)
170
// In each test case, the entire input text should parse as a single token
// of the given type.
struct SimpleTokenCase {
  std::string input;          // text fed to the tokenizer
  Tokenizer::TokenType type;  // expected type of the one resulting token
};
177
operator <<(std::ostream & out,const SimpleTokenCase & test_case)178 inline std::ostream& operator<<(std::ostream& out,
179 const SimpleTokenCase& test_case) {
180 return out << absl::CEscape(test_case.input);
181 }
182
// Golden inputs: each string must lex as exactly one token of the paired
// type.  Covers identifiers, decimal/hex/octal integers, every float
// spelling, both quote styles with escapes, and single-char symbols.
SimpleTokenCase kSimpleTokenCases[] = {
    // Test identifiers.
    {"hello", Tokenizer::TYPE_IDENTIFIER},

    // Test integers.
    {"123", Tokenizer::TYPE_INTEGER},
    {"0xab6", Tokenizer::TYPE_INTEGER},
    {"0XAB6", Tokenizer::TYPE_INTEGER},
    {"0X1234567", Tokenizer::TYPE_INTEGER},
    {"0x89abcdef", Tokenizer::TYPE_INTEGER},
    {"0x89ABCDEF", Tokenizer::TYPE_INTEGER},
    {"01234567", Tokenizer::TYPE_INTEGER},

    // Test floats.
    {"123.45", Tokenizer::TYPE_FLOAT},
    {"1.", Tokenizer::TYPE_FLOAT},
    {"1e3", Tokenizer::TYPE_FLOAT},
    {"1E3", Tokenizer::TYPE_FLOAT},
    {"1e-3", Tokenizer::TYPE_FLOAT},
    {"1e+3", Tokenizer::TYPE_FLOAT},
    {"1.e3", Tokenizer::TYPE_FLOAT},
    {"1.2e3", Tokenizer::TYPE_FLOAT},
    {".1", Tokenizer::TYPE_FLOAT},
    {".1e3", Tokenizer::TYPE_FLOAT},
    {".1e-3", Tokenizer::TYPE_FLOAT},
    {".1e+3", Tokenizer::TYPE_FLOAT},

    // Test strings.
    {"'hello'", Tokenizer::TYPE_STRING},
    {"\"foo\"", Tokenizer::TYPE_STRING},
    {"'a\"b'", Tokenizer::TYPE_STRING},
    {"\"a'b\"", Tokenizer::TYPE_STRING},
    {"'a\\'b'", Tokenizer::TYPE_STRING},
    {"\"a\\\"b\"", Tokenizer::TYPE_STRING},
    {"'\\xf'", Tokenizer::TYPE_STRING},
    {"'\\0'", Tokenizer::TYPE_STRING},

    // Test symbols.
    {"+", Tokenizer::TYPE_SYMBOL},
    {".", Tokenizer::TYPE_SYMBOL},
};
224
TEST_2D(TokenizerTest,SimpleTokens,kSimpleTokenCases,kBlockSizes)225 TEST_2D(TokenizerTest, SimpleTokens, kSimpleTokenCases, kBlockSizes) {
226 // Set up the tokenizer.
227 TestInputStream input(kSimpleTokenCases_case.input.data(),
228 kSimpleTokenCases_case.input.size(), kBlockSizes_case);
229 TestErrorCollector error_collector;
230 Tokenizer tokenizer(&input, &error_collector);
231
232 // Before Next() is called, the initial token should always be TYPE_START.
233 EXPECT_EQ(Tokenizer::TYPE_START, tokenizer.current().type);
234 EXPECT_EQ("", tokenizer.current().text);
235 EXPECT_EQ(0, tokenizer.current().line);
236 EXPECT_EQ(0, tokenizer.current().column);
237 EXPECT_EQ(0, tokenizer.current().end_column);
238
239 // Parse the token.
240 ASSERT_TRUE(tokenizer.Next());
241
242 // Check that it has the right type.
243 EXPECT_EQ(kSimpleTokenCases_case.type, tokenizer.current().type);
244 // Check that it contains the complete input text.
245 EXPECT_EQ(kSimpleTokenCases_case.input, tokenizer.current().text);
246 // Check that it is located at the beginning of the input
247 EXPECT_EQ(0, tokenizer.current().line);
248 EXPECT_EQ(0, tokenizer.current().column);
249 EXPECT_EQ(kSimpleTokenCases_case.input.size(),
250 tokenizer.current().end_column);
251
252 // There should be no more input.
253 EXPECT_FALSE(tokenizer.Next());
254
255 // After Next() returns false, the token should have type TYPE_END.
256 EXPECT_EQ(Tokenizer::TYPE_END, tokenizer.current().type);
257 EXPECT_EQ("", tokenizer.current().text);
258 EXPECT_EQ(0, tokenizer.current().line);
259 EXPECT_EQ(kSimpleTokenCases_case.input.size(), tokenizer.current().column);
260 EXPECT_EQ(kSimpleTokenCases_case.input.size(),
261 tokenizer.current().end_column);
262
263 // There should be no errors.
264 EXPECT_TRUE(error_collector.text_.empty());
265 }
266
TEST_1D(TokenizerTest,FloatSuffix,kBlockSizes)267 TEST_1D(TokenizerTest, FloatSuffix, kBlockSizes) {
268 // Test the "allow_f_after_float" option.
269
270 // Set up the tokenizer.
271 const char* text = "1f 2.5f 6e3f 7F";
272 TestInputStream input(text, strlen(text), kBlockSizes_case);
273 TestErrorCollector error_collector;
274 Tokenizer tokenizer(&input, &error_collector);
275 tokenizer.set_allow_f_after_float(true);
276
277 // Advance through tokens and check that they are parsed as expected.
278 ASSERT_TRUE(tokenizer.Next());
279 EXPECT_EQ(tokenizer.current().text, "1f");
280 EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);
281 ASSERT_TRUE(tokenizer.Next());
282 EXPECT_EQ(tokenizer.current().text, "2.5f");
283 EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);
284 ASSERT_TRUE(tokenizer.Next());
285 EXPECT_EQ(tokenizer.current().text, "6e3f");
286 EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);
287 ASSERT_TRUE(tokenizer.Next());
288 EXPECT_EQ(tokenizer.current().text, "7F");
289 EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);
290
291 // There should be no more input.
292 EXPECT_FALSE(tokenizer.Next());
293 // There should be no errors.
294 EXPECT_TRUE(error_collector.text_.empty());
295 }
296
// Whitespace-only inputs: each runs lexes as a single TYPE_WHITESPACE (or
// TYPE_NEWLINE) token when whitespace reporting is on, and as nothing at
// all when it is off.
SimpleTokenCase kWhitespaceTokenCases[] = {
    {" ", Tokenizer::TYPE_WHITESPACE},
    {"    ", Tokenizer::TYPE_WHITESPACE},
    {"\t", Tokenizer::TYPE_WHITESPACE},
    {"\v", Tokenizer::TYPE_WHITESPACE},
    {"\t ", Tokenizer::TYPE_WHITESPACE},
    {"\v\t", Tokenizer::TYPE_WHITESPACE},
    {"   \t\r", Tokenizer::TYPE_WHITESPACE},
    // Newlines:
    {"\n", Tokenizer::TYPE_NEWLINE},
};
308
TEST_2D(TokenizerTest,Whitespace,kWhitespaceTokenCases,kBlockSizes)309 TEST_2D(TokenizerTest, Whitespace, kWhitespaceTokenCases, kBlockSizes) {
310 {
311 TestInputStream input(kWhitespaceTokenCases_case.input.data(),
312 kWhitespaceTokenCases_case.input.size(),
313 kBlockSizes_case);
314 TestErrorCollector error_collector;
315 Tokenizer tokenizer(&input, &error_collector);
316
317 EXPECT_FALSE(tokenizer.Next());
318 }
319 {
320 TestInputStream input(kWhitespaceTokenCases_case.input.data(),
321 kWhitespaceTokenCases_case.input.size(),
322 kBlockSizes_case);
323 TestErrorCollector error_collector;
324 Tokenizer tokenizer(&input, &error_collector);
325 tokenizer.set_report_whitespace(true);
326 tokenizer.set_report_newlines(true);
327
328 ASSERT_TRUE(tokenizer.Next());
329 EXPECT_EQ(tokenizer.current().text, kWhitespaceTokenCases_case.input);
330 EXPECT_EQ(tokenizer.current().type, kWhitespaceTokenCases_case.type);
331
332 EXPECT_FALSE(tokenizer.Next());
333 }
334 }
335
336 #endif
337
338 // -------------------------------------------------------------------
339
// In each case, the input is parsed to produce a list of tokens. The
// last token in "output" must have type TYPE_END.
struct MultiTokenCase {
  std::string input;                     // text fed to the tokenizer
  std::vector<Tokenizer::Token> output;  // expected tokens, ending in TYPE_END
};
346
operator <<(std::ostream & out,const MultiTokenCase & test_case)347 inline std::ostream& operator<<(std::ostream& out,
348 const MultiTokenCase& test_case) {
349 return out << absl::CEscape(test_case.input);
350 }
351
// Golden multi-token inputs.  Token fields are {type, text, line, column,
// end_column}; note that tabs advance the column counter to the next
// multiple of 8, which the expected columns below reflect.
MultiTokenCase kMultiTokenCases[] = {
    // Test empty input.
    {"",
     {
         {Tokenizer::TYPE_END, "", 0, 0, 0},
     }},

    // Test all token types at the same time.
    {"foo 1 1.2 + 'bar'",
     {
         {Tokenizer::TYPE_IDENTIFIER, "foo", 0, 0, 3},
         {Tokenizer::TYPE_INTEGER, "1", 0, 4, 5},
         {Tokenizer::TYPE_FLOAT, "1.2", 0, 6, 9},
         {Tokenizer::TYPE_SYMBOL, "+", 0, 10, 11},
         {Tokenizer::TYPE_STRING, "'bar'", 0, 12, 17},
         {Tokenizer::TYPE_END, "", 0, 17, 17},
     }},

    // Test that consecutive symbols are parsed as separate tokens.
    {"!@+%",
     {
         {Tokenizer::TYPE_SYMBOL, "!", 0, 0, 1},
         {Tokenizer::TYPE_SYMBOL, "@", 0, 1, 2},
         {Tokenizer::TYPE_SYMBOL, "+", 0, 2, 3},
         {Tokenizer::TYPE_SYMBOL, "%", 0, 3, 4},
         {Tokenizer::TYPE_END, "", 0, 4, 4},
     }},

    // Test that newlines affect line numbers correctly.
    {"foo bar\nrab oof",
     {
         {Tokenizer::TYPE_IDENTIFIER, "foo", 0, 0, 3},
         {Tokenizer::TYPE_IDENTIFIER, "bar", 0, 4, 7},
         {Tokenizer::TYPE_IDENTIFIER, "rab", 1, 0, 3},
         {Tokenizer::TYPE_IDENTIFIER, "oof", 1, 4, 7},
         {Tokenizer::TYPE_END, "", 1, 7, 7},
     }},

    // Test that tabs affect column numbers correctly.
    {"foo\tbar  \tbaz",
     {
         {Tokenizer::TYPE_IDENTIFIER, "foo", 0, 0, 3},
         {Tokenizer::TYPE_IDENTIFIER, "bar", 0, 8, 11},
         {Tokenizer::TYPE_IDENTIFIER, "baz", 0, 16, 19},
         {Tokenizer::TYPE_END, "", 0, 19, 19},
     }},

    // Test that tabs in string literals affect column numbers correctly.
    {"\"foo\tbar\" baz",
     {
         {Tokenizer::TYPE_STRING, "\"foo\tbar\"", 0, 0, 12},
         {Tokenizer::TYPE_IDENTIFIER, "baz", 0, 13, 16},
         {Tokenizer::TYPE_END, "", 0, 16, 16},
     }},

    // Test that line comments are ignored.
    {"foo // This is a comment\n"
     "bar // This is another comment",
     {
         {Tokenizer::TYPE_IDENTIFIER, "foo", 0, 0, 3},
         {Tokenizer::TYPE_IDENTIFIER, "bar", 1, 0, 3},
         {Tokenizer::TYPE_END, "", 1, 30, 30},
     }},

    // Test that block comments are ignored.
    {"foo /* This is a block comment */ bar",
     {
         {Tokenizer::TYPE_IDENTIFIER, "foo", 0, 0, 3},
         {Tokenizer::TYPE_IDENTIFIER, "bar", 0, 34, 37},
         {Tokenizer::TYPE_END, "", 0, 37, 37},
     }},

    // Test that sh-style comments are not ignored by default.
    {"foo # bar\n"
     "baz",
     {
         {Tokenizer::TYPE_IDENTIFIER, "foo", 0, 0, 3},
         {Tokenizer::TYPE_SYMBOL, "#", 0, 4, 5},
         {Tokenizer::TYPE_IDENTIFIER, "bar", 0, 6, 9},
         {Tokenizer::TYPE_IDENTIFIER, "baz", 1, 0, 3},
         {Tokenizer::TYPE_END, "", 1, 3, 3},
     }},

    // Test all whitespace chars
    {"foo\n\t\r\v\fbar",
     {
         {Tokenizer::TYPE_IDENTIFIER, "foo", 0, 0, 3},
         {Tokenizer::TYPE_IDENTIFIER, "bar", 1, 11, 14},
         {Tokenizer::TYPE_END, "", 1, 14, 14},
     }},
};
443
TEST_2D(TokenizerTest,MultipleTokens,kMultiTokenCases,kBlockSizes)444 TEST_2D(TokenizerTest, MultipleTokens, kMultiTokenCases, kBlockSizes) {
445 // Set up the tokenizer.
446 TestInputStream input(kMultiTokenCases_case.input.data(),
447 kMultiTokenCases_case.input.size(), kBlockSizes_case);
448 TestErrorCollector error_collector;
449 Tokenizer tokenizer(&input, &error_collector);
450
451 // Before Next() is called, the initial token should always be TYPE_START.
452 EXPECT_EQ(Tokenizer::TYPE_START, tokenizer.current().type);
453 EXPECT_EQ("", tokenizer.current().text);
454 EXPECT_EQ(0, tokenizer.current().line);
455 EXPECT_EQ(0, tokenizer.current().column);
456 EXPECT_EQ(0, tokenizer.current().end_column);
457
458 // Loop through all expected tokens.
459 int i = 0;
460 Tokenizer::Token token;
461 do {
462 token = kMultiTokenCases_case.output[i++];
463
464 SCOPED_TRACE(testing::Message() << "Token #" << i << ": " << token.text);
465
466 Tokenizer::Token previous = tokenizer.current();
467
468 // Next() should only return false when it hits the end token.
469 if (token.type != Tokenizer::TYPE_END) {
470 ASSERT_TRUE(tokenizer.Next());
471 } else {
472 ASSERT_FALSE(tokenizer.Next());
473 }
474
475 // Check that the previous token is set correctly.
476 EXPECT_EQ(previous.type, tokenizer.previous().type);
477 EXPECT_EQ(previous.text, tokenizer.previous().text);
478 EXPECT_EQ(previous.line, tokenizer.previous().line);
479 EXPECT_EQ(previous.column, tokenizer.previous().column);
480 EXPECT_EQ(previous.end_column, tokenizer.previous().end_column);
481
482 // Check that the token matches the expected one.
483 EXPECT_EQ(token.type, tokenizer.current().type);
484 EXPECT_EQ(token.text, tokenizer.current().text);
485 EXPECT_EQ(token.line, tokenizer.current().line);
486 EXPECT_EQ(token.column, tokenizer.current().column);
487 EXPECT_EQ(token.end_column, tokenizer.current().end_column);
488
489 } while (token.type != Tokenizer::TYPE_END);
490
491 // There should be no errors.
492 EXPECT_TRUE(error_collector.text_.empty());
493 }
494
// Like kMultiTokenCases, but with whitespace/newline reporting enabled, so
// every run of spaces/tabs and every newline appears as its own token.
// Note the NEWLINE token's end_column is 0: it resets to the next line.
MultiTokenCase kMultiWhitespaceTokenCases[] = {
    // Test all token types at the same time.
    {"foo 1  \t1.2  \n   +\v'bar'",
     {
         {Tokenizer::TYPE_IDENTIFIER, "foo", 0, 0, 3},
         {Tokenizer::TYPE_WHITESPACE, " ", 0, 3, 4},
         {Tokenizer::TYPE_INTEGER, "1", 0, 4, 5},
         {Tokenizer::TYPE_WHITESPACE, "  \t", 0, 5, 8},
         {Tokenizer::TYPE_FLOAT, "1.2", 0, 8, 11},
         {Tokenizer::TYPE_WHITESPACE, "  ", 0, 11, 13},
         {Tokenizer::TYPE_NEWLINE, "\n", 0, 13, 0},
         {Tokenizer::TYPE_WHITESPACE, "   ", 1, 0, 3},
         {Tokenizer::TYPE_SYMBOL, "+", 1, 3, 4},
         {Tokenizer::TYPE_WHITESPACE, "\v", 1, 4, 5},
         {Tokenizer::TYPE_STRING, "'bar'", 1, 5, 10},
         {Tokenizer::TYPE_END, "", 1, 10, 10},
     }},

};
514
TEST_2D(TokenizerTest,MultipleWhitespaceTokens,kMultiWhitespaceTokenCases,kBlockSizes)515 TEST_2D(TokenizerTest, MultipleWhitespaceTokens, kMultiWhitespaceTokenCases,
516 kBlockSizes) {
517 // Set up the tokenizer.
518 TestInputStream input(kMultiWhitespaceTokenCases_case.input.data(),
519 kMultiWhitespaceTokenCases_case.input.size(),
520 kBlockSizes_case);
521 TestErrorCollector error_collector;
522 Tokenizer tokenizer(&input, &error_collector);
523 tokenizer.set_report_whitespace(true);
524 tokenizer.set_report_newlines(true);
525
526 // Before Next() is called, the initial token should always be TYPE_START.
527 EXPECT_EQ(Tokenizer::TYPE_START, tokenizer.current().type);
528 EXPECT_EQ("", tokenizer.current().text);
529 EXPECT_EQ(0, tokenizer.current().line);
530 EXPECT_EQ(0, tokenizer.current().column);
531 EXPECT_EQ(0, tokenizer.current().end_column);
532
533 // Loop through all expected tokens.
534 int i = 0;
535 Tokenizer::Token token;
536 do {
537 token = kMultiWhitespaceTokenCases_case.output[i++];
538
539 SCOPED_TRACE(testing::Message() << "Token #" << i << ": " << token.text);
540
541 Tokenizer::Token previous = tokenizer.current();
542
543 // Next() should only return false when it hits the end token.
544 if (token.type != Tokenizer::TYPE_END) {
545 ASSERT_TRUE(tokenizer.Next());
546 } else {
547 ASSERT_FALSE(tokenizer.Next());
548 }
549
550 // Check that the previous token is set correctly.
551 EXPECT_EQ(previous.type, tokenizer.previous().type);
552 EXPECT_EQ(previous.text, tokenizer.previous().text);
553 EXPECT_EQ(previous.line, tokenizer.previous().line);
554 EXPECT_EQ(previous.column, tokenizer.previous().column);
555 EXPECT_EQ(previous.end_column, tokenizer.previous().end_column);
556
557 // Check that the token matches the expected one.
558 EXPECT_EQ(token.type, tokenizer.current().type);
559 EXPECT_EQ(token.text, tokenizer.current().text);
560 EXPECT_EQ(token.line, tokenizer.current().line);
561 EXPECT_EQ(token.column, tokenizer.current().column);
562 EXPECT_EQ(token.end_column, tokenizer.current().end_column);
563
564 } while (token.type != Tokenizer::TYPE_END);
565
566 // There should be no errors.
567 EXPECT_TRUE(error_collector.text_.empty());
568 }
569
570 // This test causes gcc 3.3.5 (and earlier?) to give the cryptic error:
571 // "sorry, unimplemented: `method_call_expr' not supported by dump_expr"
572 #if !defined(__GNUC__) || __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 3)
573
TEST_1D(TokenizerTest,ShCommentStyle,kBlockSizes)574 TEST_1D(TokenizerTest, ShCommentStyle, kBlockSizes) {
575 // Test the "comment_style" option.
576
577 const char* text =
578 "foo # bar\n"
579 "baz // qux\n"
580 "corge /* grault */\n"
581 "garply";
582 const char* const kTokens[] = {"foo", // "# bar" is ignored
583 "baz", "/", "/", "qux", "corge", "/",
584 "*", "grault", "*", "/", "garply"};
585
586 // Set up the tokenizer.
587 TestInputStream input(text, strlen(text), kBlockSizes_case);
588 TestErrorCollector error_collector;
589 Tokenizer tokenizer(&input, &error_collector);
590 tokenizer.set_comment_style(Tokenizer::SH_COMMENT_STYLE);
591
592 // Advance through tokens and check that they are parsed as expected.
593 for (int i = 0; i < ABSL_ARRAYSIZE(kTokens); i++) {
594 EXPECT_TRUE(tokenizer.Next());
595 EXPECT_EQ(tokenizer.current().text, kTokens[i]);
596 }
597
598 // There should be no more input.
599 EXPECT_FALSE(tokenizer.Next());
600 // There should be no errors.
601 EXPECT_TRUE(error_collector.text_.empty());
602 }
603
604 #endif
605
606 // -------------------------------------------------------------------
607
// In each case, the input is expected to have two tokens named "prev" and
// "next" with comments in between.
struct DocCommentCase {
  std::string input;

  // Expected comment text attached to "prev" as a trailing comment.
  const char* prev_trailing_comments;
  // Expected detached comments, in order; the entry after the last
  // expected comment must be null (at most 9 real entries fit).
  const char* detached_comments[10];
  // Expected comment text attached to "next" as a leading comment.
  const char* next_leading_comments;
};
617
operator <<(std::ostream & out,const DocCommentCase & test_case)618 inline std::ostream& operator<<(std::ostream& out,
619 const DocCommentCase& test_case) {
620 return out << absl::CEscape(test_case.input);
621 }
622
// Golden cases for NextWithComments(): each input contains a "prev" token,
// some mix of trailing/detached/leading comments, and (usually) a "next"
// token.  Fields after the input are {prev_trailing_comments,
// {detached_comments...}, next_leading_comments}.
// clang-format off
DocCommentCase kDocCommentCases[] = {
    {"prev next",

     "",
     {},
     ""},

    {"prev // no next token\n",

     " no next token\n",
     {},
     ""},

    {"prev // no next token and no trailing newline",

     " no next token and no trailing newline",
     {},
     ""},

    {"prev /* detached */ next",

     "",
     {" detached "},
     ""},

    {"prev // trailing comment\n"
     "next",

     " trailing comment\n",
     {},
     ""},

    {"prev\n"
     "/* leading comment */ next",

     "",
     {},
     " leading comment "},

    {"prev\n"
     "// leading comment\n"
     "// line 2\n"
     "next",

     "",
     {},
     " leading comment\n"
     " line 2\n"},

    {"prev\n"
     "// trailing comment\n"
     "// line 2\n"
     "\n"
     "next",

     " trailing comment\n"
     " line 2\n",
     {},
     ""},

    {"prev // trailing comment\n"
     "// leading comment\n"
     "// line 2\n"
     "next",

     " trailing comment\n",
     {},
     " leading comment\n"
     " line 2\n"},

    {"prev /* trailing block comment */\n"
     "/* leading block comment\n"
     " * line 2\n"
     " * line 3 */"
     "next",

     " trailing block comment ",
     {},
     " leading block comment\n"
     " line 2\n"
     " line 3 "},

    {"prev\n"
     "/* trailing block comment\n"
     " * line 2\n"
     " * line 3\n"
     " */\n"
     "/* leading block comment\n"
     " * line 2\n"
     " * line 3 */"
     "next",

     " trailing block comment\n"
     " line 2\n"
     " line 3\n",
     {},
     " leading block comment\n"
     " line 2\n"
     " line 3 "},

    {"prev\n"
     "// trailing comment\n"
     "\n"
     "// detached comment\n"
     "// line 2\n"
     "\n"
     "// second detached comment\n"
     "/* third detached comment\n"
     " * line 2 */\n"
     "// leading comment\n"
     "next",

     " trailing comment\n",
     {" detached comment\n"
      " line 2\n",
      " second detached comment\n",
      " third detached comment\n"
      " line 2 "},
     " leading comment\n"},

    {"prev /**/\n"
     "\n"
     "// detached comment\n"
     "\n"
     "// leading comment\n"
     "next",

     "",
     {" detached comment\n"},
     " leading comment\n"},

    {"prev /**/\n"
     "// leading comment\n"
     "next",

     "",
     {},
     " leading comment\n"},

    {"prev /* many comments*/ /* all inline */ /* will be handled */ next",

     " many comments",
     {" all inline "},
     " will be handled "},

    {R"pb(
       prev /* a single block comment
               that spans multiple lines
               is detached if it ends
               on the same line as next */ next
     )pb",

     "",
     {" a single block comment\n"
      "that spans multiple lines\n"
      "is detached if it ends\n"
      "on the same line as next "},
     ""},

    {R"pb(
       prev /* trailing */ /* leading */ next
     )pb",

     " trailing ",
     {},
     " leading "},

    {R"pb(
       prev /* multi-line
               trailing */ /* an oddly
                              placed detached */ /* an oddly
                                                    placed leading */ next
     )pb",

     " multi-line\ntrailing ",
     {" an oddly\nplaced detached "},
     " an oddly\nplaced leading "},

    {R"pb(
       prev  // trailing with newline
       // detached
       /* another detached */
       // leading but no next token to attach it to
     )pb",

     " trailing with newline\n",
     {" detached\n", " another detached ",
      " leading but no next token to attach it to\n"},
     ""},
};
// clang-format on
815
TEST_2D(TokenizerTest,DocComments,kDocCommentCases,kBlockSizes)816 TEST_2D(TokenizerTest, DocComments, kDocCommentCases, kBlockSizes) {
817 // Set up the tokenizer.
818 TestInputStream input(kDocCommentCases_case.input.data(),
819 kDocCommentCases_case.input.size(), kBlockSizes_case);
820 TestErrorCollector error_collector;
821 Tokenizer tokenizer(&input, &error_collector);
822
823 // Set up a second tokenizer where we'll pass all NULLs to NextWithComments().
824 TestInputStream input2(kDocCommentCases_case.input.data(),
825 kDocCommentCases_case.input.size(), kBlockSizes_case);
826 Tokenizer tokenizer2(&input2, &error_collector);
827
828 EXPECT_TRUE(tokenizer.Next());
829 EXPECT_TRUE(tokenizer2.Next());
830
831 EXPECT_EQ("prev", tokenizer.current().text);
832 EXPECT_EQ("prev", tokenizer2.current().text);
833
834 std::string prev_trailing_comments;
835 std::vector<std::string> detached_comments;
836 std::string next_leading_comments;
837 bool has_next = tokenizer.NextWithComments(
838 &prev_trailing_comments, &detached_comments, &next_leading_comments);
839 EXPECT_EQ(has_next, tokenizer2.NextWithComments(nullptr, nullptr, nullptr));
840 if (has_next) {
841 EXPECT_EQ("next", tokenizer.current().text);
842 EXPECT_EQ("next", tokenizer2.current().text);
843 }
844
845 EXPECT_EQ(kDocCommentCases_case.prev_trailing_comments,
846 prev_trailing_comments);
847
848 for (int i = 0; i < detached_comments.size(); i++) {
849 ASSERT_LT(i, ABSL_ARRAYSIZE(kDocCommentCases));
850 ASSERT_TRUE(kDocCommentCases_case.detached_comments[i] != nullptr);
851 EXPECT_EQ(kDocCommentCases_case.detached_comments[i], detached_comments[i]);
852 }
853
854 // Verify that we matched all the detached comments.
855 EXPECT_EQ(nullptr,
856 kDocCommentCases_case.detached_comments[detached_comments.size()]);
857
858 EXPECT_EQ(kDocCommentCases_case.next_leading_comments, next_leading_comments);
859 }
860
861 // -------------------------------------------------------------------
862
// Test parse helpers.
// TODO: Add a fuzz test for this.
// Exercises Tokenizer::ParseInteger: decimal/hex/octal parsing, rejection
// of malformed input, overflow against a caller-supplied max, and exhaustive
// sweeps of +/-1600 around both the int64 and uint64 limits in all three
// bases.
TEST_F(TokenizerTest, ParseInteger) {
  EXPECT_EQ(0, ParseInteger("0"));
  EXPECT_EQ(123, ParseInteger("123"));
  EXPECT_EQ(0xabcdef12u, ParseInteger("0xabcdef12"));
  EXPECT_EQ(0xabcdef12u, ParseInteger("0xABCDEF12"));
  EXPECT_EQ(kuint64max, ParseInteger("0xFFFFFFFFFFFFFFFF"));
  EXPECT_EQ(01234567, ParseInteger("01234567"));
  EXPECT_EQ(0X123, ParseInteger("0X123"));

  // Test invalid integers that may still be tokenized as integers.
  EXPECT_EQ(0, ParseInteger("0x"));

  uint64_t i;

  // Test invalid integers that will never be tokenized as integers.
  EXPECT_FALSE(Tokenizer::ParseInteger("zxy", kuint64max, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("1.2", kuint64max, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("08", kuint64max, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("0xg", kuint64max, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("-1", kuint64max, &i));

  // Test overflows.
  EXPECT_TRUE(Tokenizer::ParseInteger("0", 0, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("1", 0, &i));
  EXPECT_TRUE(Tokenizer::ParseInteger("1", 1, &i));
  EXPECT_TRUE(Tokenizer::ParseInteger("12345", 12345, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("12346", 12345, &i));
  EXPECT_TRUE(Tokenizer::ParseInteger("0xFFFFFFFFFFFFFFFF", kuint64max, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("0x10000000000000000", kuint64max, &i));

  // Test near the limits of signed parsing (values in kint64max +/- 1600)
  for (int64_t offset = -1600; offset <= 1600; ++offset) {
    // We make sure to perform an unsigned addition so that we avoid signed
    // overflow, which would be undefined behavior.
    // NOTE(review): this `i` shadows the uint64_t `i` declared above.
    uint64_t i = 0x7FFFFFFFFFFFFFFFu + static_cast<uint64_t>(offset);
    char decimal[32];
    snprintf(decimal, 32, "%llu", static_cast<unsigned long long>(i));
    if (offset > 0) {
      // Above kint64max: must be rejected against a kint64max bound.
      uint64_t parsed = -1;
      EXPECT_FALSE(Tokenizer::ParseInteger(decimal, kint64max, &parsed))
          << decimal << "=>" << parsed;
    } else {
      uint64_t parsed = -1;
      EXPECT_TRUE(Tokenizer::ParseInteger(decimal, kint64max, &parsed))
          << decimal << "=>" << parsed;
      EXPECT_EQ(parsed, i);
    }
    char octal[32];
    snprintf(octal, 32, "0%llo", static_cast<unsigned long long>(i));
    if (offset > 0) {
      uint64_t parsed = -1;
      EXPECT_FALSE(Tokenizer::ParseInteger(octal, kint64max, &parsed))
          << octal << "=>" << parsed;
    } else {
      uint64_t parsed = -1;
      EXPECT_TRUE(Tokenizer::ParseInteger(octal, kint64max, &parsed))
          << octal << "=>" << parsed;
      EXPECT_EQ(parsed, i);
    }
    char hex[32];
    snprintf(hex, 32, "0x%llx", static_cast<unsigned long long>(i));
    if (offset > 0) {
      uint64_t parsed = -1;
      EXPECT_FALSE(Tokenizer::ParseInteger(hex, kint64max, &parsed))
          << hex << "=>" << parsed;
    } else {
      uint64_t parsed = -1;
      EXPECT_TRUE(Tokenizer::ParseInteger(hex, kint64max, &parsed)) << hex;
      EXPECT_EQ(parsed, i);
    }
  }

  // Test near the limits of unsigned parsing (values in kuint64max +/- 1600)
  // By definition, values greater than kuint64max cannot be held in a uint64_t
  // variable, so printing them is a little tricky; fortunately all but the
  // last four digits are known, so we can hard-code them in the printf string,
  // and we only need to format the last 4.
  for (int64_t offset = -1600; offset <= 1600; ++offset) {
    {
      // Decimal: 18446744073709551615 is kuint64max; wraps for offset > 0,
      // which is fine since only the formatted string matters then.
      uint64_t i = 18446744073709551615u + offset;
      char decimal[32];
      snprintf(decimal, 32, "1844674407370955%04llu",
               static_cast<unsigned long long>(1615 + offset));
      if (offset > 0) {
        uint64_t parsed = -1;
        EXPECT_FALSE(Tokenizer::ParseInteger(decimal, kuint64max, &parsed))
            << decimal << "=>" << parsed;
      } else {
        uint64_t parsed = -1;
        EXPECT_TRUE(Tokenizer::ParseInteger(decimal, kuint64max, &parsed))
            << decimal;
        EXPECT_EQ(parsed, i);
      }
    }
    {
      // Octal: 01777777777777777777777 is kuint64max.
      uint64_t i = 01777777777777777777777u + offset;
      if (offset > 0) {
        char octal[32];
        snprintf(octal, 32, "0200000000000000000%04llo",
                 static_cast<unsigned long long>(offset - 1));
        uint64_t parsed = -1;
        EXPECT_FALSE(Tokenizer::ParseInteger(octal, kuint64max, &parsed))
            << octal << "=>" << parsed;
      } else {
        char octal[32];
        snprintf(octal, 32, "0%llo", static_cast<unsigned long long>(i));
        uint64_t parsed = -1;
        EXPECT_TRUE(Tokenizer::ParseInteger(octal, kuint64max, &parsed))
            << octal;
        EXPECT_EQ(parsed, i);
      }
    }
    {
      // Hex: 0xffffffffffffffff is kuint64max.
      uint64_t ui = 0xffffffffffffffffu + offset;
      char hex[32];
      if (offset > 0) {
        snprintf(hex, 32, "0x1000000000000%04llx",
                 static_cast<unsigned long long>(offset - 1));
        uint64_t parsed = -1;
        EXPECT_FALSE(Tokenizer::ParseInteger(hex, kuint64max, &parsed))
            << hex << "=>" << parsed;
      } else {
        snprintf(hex, 32, "0x%llx", static_cast<unsigned long long>(ui));
        uint64_t parsed = -1;
        EXPECT_TRUE(Tokenizer::ParseInteger(hex, kuint64max, &parsed)) << hex;
        EXPECT_EQ(parsed, ui);
      }
    }
  }
}
996
TEST_F(TokenizerTest, ParseFloat) {
  // Inputs that must parse to an exact double value.  The last groups cover
  // text the tokenizer can emit as a FLOAT token even though it is not a
  // complete float literal (trailing "e"/"e-") and the ignored 'f' suffix.
  const struct {
    const char* text;
    double expected;
  } kValidCases[] = {
      {"1.", 1},
      {"1e3", 1e3},
      {"1E3", 1e3},
      {"1.5e3", 1.5e3},
      {".1", .1},
      {".25", .25},
      {".1e3", .1e3},
      {".25e3", .25e3},
      {".1e+3", .1e+3},
      {".1e-3", .1e-3},
      {"5", 5},
      {"6e-12", 6e-12},
      {"1.2", 1.2},
      {"1.e2", 1.e2},
      // Invalid numbers that may still be tokenized as floats.
      {"1e", 1},
      {"1e-", 1},
      {"1.e", 1},
      // The 'f' suffix (either case) is accepted and ignored.
      {"1f", 1},
      {"1.0f", 1},
      {"1F", 1},
  };
  for (const auto& test_case : kValidCases) {
    EXPECT_DOUBLE_EQ(test_case.expected, Tokenizer::ParseFloat(test_case.text))
        << test_case.text;
  }

  // Out-of-range values still parse successfully: overflows saturate to
  // infinity and underflows flush to zero.
  EXPECT_EQ(0.0, Tokenizer::ParseFloat("1e-9999999999999999999999999999"));
  EXPECT_EQ(HUGE_VAL, Tokenizer::ParseFloat("1e+9999999999999999999999999999"));

#if GTEST_HAS_DEATH_TEST  // death tests do not work on Windows yet
  // Text that could never have been produced as a FLOAT token must trip a
  // debug-mode assertion inside ParseFloat.
  EXPECT_DEBUG_DEATH(
      Tokenizer::ParseFloat("zxy"),
      "passed text that could not have been tokenized as a float");
  EXPECT_DEBUG_DEATH(
      Tokenizer::ParseFloat("1-e0"),
      "passed text that could not have been tokenized as a float");
  EXPECT_DEBUG_DEATH(
      Tokenizer::ParseFloat("-1.0"),
      "passed text that could not have been tokenized as a float");
#endif  // GTEST_HAS_DEATH_TEST
}
1041
TEST_F(TokenizerTest, ParseString) {
  // Helper: run ParseString on `text` and return the decoded bytes.
  // ParseString replaces the destination's contents, so a fresh string per
  // call is equivalent to reusing one.
  const auto Parse = [](const char* text) {
    std::string result;
    Tokenizer::ParseString(text, &result);
    return result;
  };

  EXPECT_EQ("hello", Parse("'hello'"));
  EXPECT_EQ("blah\nblah2", Parse("\"blah\\nblah2\""));
  EXPECT_EQ("\1x\1\123\739\52\334n\3",
            Parse("'\\1x\\1\\123\\739\\52\\334n\\3'"));
  EXPECT_EQ("\x20\x4", Parse("'\\x20\\x4'"));
  EXPECT_EQ("\x20\x4", Parse("'\\X20\\X4'"));

  // Invalid strings that may still be tokenized as strings.
  EXPECT_EQ("\a?\v\t", Parse("\"\\a\\l\\v\\t"));  // \l is invalid
  EXPECT_EQ("", Parse("'"));
  EXPECT_EQ("\\", Parse("'\\"));

  // Unicode escapes: one-, two- and three-byte UTF-8 encodings, plus a
  // \U escape for a supplementary-plane character.
  EXPECT_EQ("$¢€XX", Parse("'\\u0024\\u00a2\\u20ac\\U00024b62XX'"));
  // The same code points, with the supplementary character written as a
  // UTF-16 surrogate pair.
  EXPECT_EQ("$¢€XX", Parse("'\\u0024\\u00a2\\u20ac\\ud852\\udf62XX'"));
  // Broken UTF-16: a head surrogate with no tail surrogate.  It is output
  // as if it were UTF-8; not a defined code point, but a defined encoding.
  EXPECT_EQ("\xed\xa1\x92XX", Parse("'\\ud852XX'"));
  // Malformed escape: Demons may fly out of the nose.
  EXPECT_EQ("u0", Parse("'\\u0'"));
  // Beyond the range of valid UTF-32 code units: passed through verbatim
  // (with the hex digits lowercased).
  EXPECT_EQ("\\U00110000\\U00200000\\Uffffffff",
            Parse("'\\U00110000\\U00200000\\UFFFFFFFF'"));

  // Test invalid strings that will never be tokenized as strings.
#if GTEST_HAS_DEATH_TEST  // death tests do not work on Windows yet
  std::string output;
  EXPECT_DEBUG_DEATH(
      Tokenizer::ParseString("", &output),
      "passed text that could not have been tokenized as a string");
#endif  // GTEST_HAS_DEATH_TEST
}
1089
TEST_F(TokenizerTest, ParseStringAppend) {
  // ParseStringAppend keeps the destination's existing contents, while
  // ParseString replaces them.
  std::string buffer("stuff+");
  Tokenizer::ParseStringAppend("'hello'", &buffer);
  EXPECT_EQ("stuff+hello", buffer);
  Tokenizer::ParseString("'hello'", &buffer);
  EXPECT_EQ("hello", buffer);
}
1098
1099 // -------------------------------------------------------------------
1100
1101 // Each case parses some input text, ignoring the tokens produced, and
1102 // checks that the error output matches what is expected.
// One data-driven error test: `input` is tokenized and the collected
// diagnostics must exactly equal `errors`.
struct ErrorCase {
  std::string input;  // Raw text fed to the tokenizer (may contain '\0').
  bool recoverable;   // True if the tokenizer should be able to recover and
                      // parse more tokens after seeing this error. Cases
                      // for which this is true must end with "foo" as
                      // the last token, which the test will check for.
  const char* errors;  // Expected diagnostics, "line:column: message\n" each.
};
1111
operator <<(std::ostream & out,const ErrorCase & test_case)1112 inline std::ostream& operator<<(std::ostream& out, const ErrorCase& test_case) {
1113 return out << absl::CEscape(test_case.input);
1114 }
1115
1116 ErrorCase kErrorCases[] = {
1117 // String errors.
1118 {"'\\l' foo", true, "0:2: Invalid escape sequence in string literal.\n"},
1119 {"'\\X' foo", true, "0:3: Expected hex digits for escape sequence.\n"},
1120 {"'\\x' foo", true, "0:3: Expected hex digits for escape sequence.\n"},
1121 {"'foo", false, "0:4: Unexpected end of string.\n"},
1122 {"'bar\nfoo", true,
1123 "0:4: Multiline strings are not allowed. Did you miss a \"?.\n"},
1124 {"'\\u01' foo", true,
1125 "0:5: Expected four hex digits for \\u escape sequence.\n"},
1126 {"'\\u01' foo", true,
1127 "0:5: Expected four hex digits for \\u escape sequence.\n"},
1128 {"'\\uXYZ' foo", true,
1129 "0:3: Expected four hex digits for \\u escape sequence.\n"},
1130
1131 // Integer errors.
1132 {"123foo", true, "0:3: Need space between number and identifier.\n"},
1133
1134 // Hex/octal errors.
1135 {"0x foo", true, "0:2: \"0x\" must be followed by hex digits.\n"},
1136 {"0541823 foo", true,
1137 "0:4: Numbers starting with leading zero must be in octal.\n"},
1138 {"0x123z foo", true, "0:5: Need space between number and identifier.\n"},
1139 {"0x123.4 foo", true, "0:5: Hex and octal numbers must be integers.\n"},
1140 {"0123.4 foo", true, "0:4: Hex and octal numbers must be integers.\n"},
1141
1142 // Float errors.
1143 {"1e foo", true, "0:2: \"e\" must be followed by exponent.\n"},
1144 {"1e- foo", true, "0:3: \"e\" must be followed by exponent.\n"},
1145 {"1.2.3 foo", true,
1146 "0:3: Already saw decimal point or exponent; can't have another one.\n"},
1147 {"1e2.3 foo", true,
1148 "0:3: Already saw decimal point or exponent; can't have another one.\n"},
1149 {"a.1 foo", true,
1150 "0:1: Need space between identifier and decimal point.\n"},
1151 // allow_f_after_float not enabled, so this should be an error.
1152 {"1.0f foo", true, "0:3: Need space between number and identifier.\n"},
1153
1154 // Block comment errors.
1155 {"/*", false,
1156 "0:2: End-of-file inside block comment.\n"
1157 "0:0: Comment started here.\n"},
1158 {"/*/*/ foo", true,
1159 "0:3: \"/*\" inside block comment. Block comments cannot be nested.\n"},
1160
1161 // Control characters. Multiple consecutive control characters should only
1162 // produce one error.
1163 {"\b foo", true, "0:0: Invalid control characters encountered in text.\n"},
1164 {"\b\b foo", true,
1165 "0:0: Invalid control characters encountered in text.\n"},
1166
1167 // Check that control characters at end of input don't result in an
1168 // infinite loop.
1169 {"\b", false, "0:0: Invalid control characters encountered in text.\n"},
1170
1171 // Check recovery from '\0'. We have to explicitly specify the length of
1172 // these strings because otherwise the string constructor will just call
1173 // strlen() which will see the first '\0' and think that is the end of the
1174 // string.
1175 {std::string("\0foo", 4), true,
1176 "0:0: Invalid control characters encountered in text.\n"},
1177 {std::string("\0\0foo", 5), true,
1178 "0:0: Invalid control characters encountered in text.\n"},
1179
1180 // Check error from high order bits set
1181 {"\300foo", true, "0:0: Interpreting non ascii codepoint 192.\n"},
1182 };
1183
TEST_2D(TokenizerTest, Errors, kErrorCases, kBlockSizes) {
  // Feed this case's input through the tokenizer using the given block size.
  TestInputStream input(kErrorCases_case.input.data(),
                        kErrorCases_case.input.size(), kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);

  // Consume every token, remembering only whether the final one was "foo".
  bool saw_foo_last = false;
  while (tokenizer.Next()) {
    saw_foo_last = (tokenizer.current().text == "foo");
  }

  // The collected diagnostics must match the expectation exactly.
  EXPECT_EQ(kErrorCases_case.errors, error_collector.text_);

  // A recoverable error must not prevent the trailing "foo" token from
  // being tokenized afterwards.
  if (kErrorCases_case.recoverable) {
    EXPECT_TRUE(saw_foo_last);
  }
}
1205
1206 // -------------------------------------------------------------------
1207
TEST_1D(TokenizerTest, BackUpOnDestruction, kBlockSizes) {
  const std::string text = "foo bar";
  TestInputStream input(text.data(), text.size(), kBlockSizes_case);

  // Read a single token, then let the tokenizer go out of scope.
  {
    TestErrorCollector error_collector;
    Tokenizer tokenizer(&input, &error_collector);
    tokenizer.Next();
  }

  // The tokenizer's destructor must back the stream up to just past "foo",
  // so only those bytes count as consumed.
  EXPECT_EQ(strlen("foo"), input.ByteCount());
}
1223
1224
1225 } // namespace
1226 } // namespace io
1227 } // namespace protobuf
1228 } // namespace google
1229