// Protocol Buffers - Google's data interchange format
// Copyright 2008 Google Inc. All rights reserved.
// https://developers.google.com/protocol-buffers/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// Author: kenton@google.com (Kenton Varda)
// Based on original Protocol Buffers design by
// Sanjay Ghemawat, Jeff Dean, and others.

#include <limits.h>
#include <math.h>

#include <vector>

#include <google/protobuf/io/tokenizer.h>
#include <google/protobuf/io/zero_copy_stream_impl.h>

#include <google/protobuf/stubs/common.h>
#include <google/protobuf/stubs/strutil.h>
#include <google/protobuf/stubs/substitute.h>
#include <google/protobuf/testing/googletest.h>
#include <gtest/gtest.h>

namespace google {
namespace protobuf {
namespace io {
namespace {

// ===================================================================
// Data-Driven Test Infrastructure
// TODO(kenton): This is copied from coded_stream_unittest. This is
// temporary until these features are integrated into gTest itself.

// TEST_1D and TEST_2D are macros I'd eventually like to see added to
// gTest. These macros can be used to declare tests which should be
// run multiple times, once for each item in some input array. TEST_1D
// tests all cases in a single input array. TEST_2D tests all
// combinations of cases from two arrays. The arrays must be statically
// defined such that the GOOGLE_ARRAYSIZE() macro works on them. Example:
//
//   int kCases[] = {1, 2, 3, 4};
//   TEST_1D(MyFixture, MyTest, kCases) {
//     EXPECT_GT(kCases_case, 0);
//   }
//
// This test iterates through the numbers 1, 2, 3, and 4 and tests that
// they are all greater than zero. In case of failure, the exact case
// which failed will be printed. The case type must be printable using
// ostream::operator<<.
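//
// TEST_2D works the same way, but over the cross product of two arrays.
// As a purely illustrative sketch (MyFixture, MyTest, and kNames are
// hypothetical and not defined in this file), usage would look like:
//
//   int kCases[] = {1, 2, 3, 4};
//   const char* kNames[] = {"foo", "bar"};
//   TEST_2D(MyFixture, MyTest, kCases, kNames) {
//     EXPECT_GT(kCases_case, 0);
//     EXPECT_FALSE(string(kNames_case).empty());
//   }
//
// Inside the body each case is available as <array-name>_case, exactly as
// with TEST_1D, and a failure reports the index and value of both cases.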

#define TEST_1D(FIXTURE, NAME, CASES)                                      \
  class FIXTURE##_##NAME##_DD : public FIXTURE {                           \
   protected:                                                              \
    template <typename CaseType>                                           \
    void DoSingleCase(const CaseType& CASES##_case);                       \
  };                                                                       \
                                                                           \
  TEST_F(FIXTURE##_##NAME##_DD, NAME) {                                    \
    for (int i = 0; i < GOOGLE_ARRAYSIZE(CASES); i++) {                    \
      SCOPED_TRACE(testing::Message()                                      \
                   << #CASES " case #" << i << ": " << CASES[i]);          \
      DoSingleCase(CASES[i]);                                              \
    }                                                                      \
  }                                                                        \
                                                                           \
  template <typename CaseType>                                             \
  void FIXTURE##_##NAME##_DD::DoSingleCase(const CaseType& CASES##_case)

#define TEST_2D(FIXTURE, NAME, CASES1, CASES2)                             \
  class FIXTURE##_##NAME##_DD : public FIXTURE {                           \
   protected:                                                              \
    template <typename CaseType1, typename CaseType2>                      \
    void DoSingleCase(const CaseType1& CASES1##_case,                      \
                      const CaseType2& CASES2##_case);                     \
  };                                                                       \
                                                                           \
  TEST_F(FIXTURE##_##NAME##_DD, NAME) {                                    \
    for (int i = 0; i < GOOGLE_ARRAYSIZE(CASES1); i++) {                   \
      for (int j = 0; j < GOOGLE_ARRAYSIZE(CASES2); j++) {                 \
        SCOPED_TRACE(testing::Message()                                    \
                     << #CASES1 " case #" << i << ": "                     \
                     << CASES1[i] << ", "                                  \
                     << #CASES2 " case #" << j << ": " << CASES2[j]);      \
        DoSingleCase(CASES1[i], CASES2[j]);                                \
      }                                                                    \
    }                                                                      \
  }                                                                        \
                                                                           \
  template <typename CaseType1, typename CaseType2>                        \
  void FIXTURE##_##NAME##_DD::DoSingleCase(const CaseType1& CASES1##_case, \
                                           const CaseType2& CASES2##_case)

// -------------------------------------------------------------------

// An input stream that is basically like an ArrayInputStream but sometimes
// returns empty buffers, just to throw us off.
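// Note: the ZeroCopyInputStream contract explicitly permits Next() to hand
// back a zero-length buffer, as long as repeated calls eventually make
// progress, so the tokenizer is expected to simply call Next() again.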
class TestInputStream : public ZeroCopyInputStream {
 public:
  TestInputStream(const void* data, int size, int block_size)
      : array_stream_(data, size, block_size), counter_(0) {}
  ~TestInputStream() {}

  // implements ZeroCopyInputStream ----------------------------------
  bool Next(const void** data, int* size) {
    // Return an empty buffer on the first call and on every later call whose
    // count is divisible by 3 or 5, just to throw the caller off.
    if (counter_ % 3 == 0 || counter_ % 5 == 0) {
      *data = NULL;
      *size = 0;
      ++counter_;
      return true;
    } else {
      ++counter_;
      return array_stream_.Next(data, size);
    }
  }

  void BackUp(int count) { return array_stream_.BackUp(count); }
  bool Skip(int count) { return array_stream_.Skip(count); }
  int64 ByteCount() const { return array_stream_.ByteCount(); }

 private:
  ArrayInputStream array_stream_;
  int counter_;
};

// -------------------------------------------------------------------

// An error collector which simply concatenates all its errors into a big
// block of text which can be checked.
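// Errors are recorded one per line in the form "line:column: message\n", so
// an expected error string looks like, for example,
// "0:4: Unexpected end of string.\n".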
class TestErrorCollector : public ErrorCollector {
 public:
  TestErrorCollector() {}
  ~TestErrorCollector() {}

  string text_;

  // implements ErrorCollector ---------------------------------------
  void AddError(int line, int column, const string& message) {
    strings::SubstituteAndAppend(&text_, "$0:$1: $2\n",
                                 line, column, message);
  }
};

// -------------------------------------------------------------------

// We test each operation over a variety of block sizes to ensure that
// we test cases where reads cross buffer boundaries as well as cases
// where they don't. This is a brute-force approach, but it's easy to
// write and easy to understand.
const int kBlockSizes[] = {1, 2, 3, 5, 7, 13, 32, 1024};

class TokenizerTest : public testing::Test {
 protected:
  // For easy testing.
  uint64 ParseInteger(const string& text) {
    uint64 result;
    EXPECT_TRUE(Tokenizer::ParseInteger(text, kuint64max, &result));
    return result;
  }
};

// ===================================================================

// These tests cause gcc 3.3.5 (and earlier?) to give the cryptic error:
//   "sorry, unimplemented: `method_call_expr' not supported by dump_expr"
#if !defined(__GNUC__) || __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 3)

// In each test case, the entire input text should parse as a single token
// of the given type.
struct SimpleTokenCase {
  string input;
  Tokenizer::TokenType type;
};

inline ostream& operator<<(ostream& out,
                           const SimpleTokenCase& test_case) {
  return out << CEscape(test_case.input);
}

SimpleTokenCase kSimpleTokenCases[] = {
  // Test identifiers.
  { "hello",       Tokenizer::TYPE_IDENTIFIER },

  // Test integers.
  { "123",         Tokenizer::TYPE_INTEGER },
  { "0xab6",       Tokenizer::TYPE_INTEGER },
  { "0XAB6",       Tokenizer::TYPE_INTEGER },
  { "0X1234567",   Tokenizer::TYPE_INTEGER },
  { "0x89abcdef",  Tokenizer::TYPE_INTEGER },
  { "0x89ABCDEF",  Tokenizer::TYPE_INTEGER },
  { "01234567",    Tokenizer::TYPE_INTEGER },

  // Test floats.
  { "123.45",      Tokenizer::TYPE_FLOAT },
  { "1.",          Tokenizer::TYPE_FLOAT },
  { "1e3",         Tokenizer::TYPE_FLOAT },
  { "1E3",         Tokenizer::TYPE_FLOAT },
  { "1e-3",        Tokenizer::TYPE_FLOAT },
  { "1e+3",        Tokenizer::TYPE_FLOAT },
  { "1.e3",        Tokenizer::TYPE_FLOAT },
  { "1.2e3",       Tokenizer::TYPE_FLOAT },
  { ".1",          Tokenizer::TYPE_FLOAT },
  { ".1e3",        Tokenizer::TYPE_FLOAT },
  { ".1e-3",       Tokenizer::TYPE_FLOAT },
  { ".1e+3",       Tokenizer::TYPE_FLOAT },

  // Test strings.
  { "'hello'",     Tokenizer::TYPE_STRING },
  { "\"foo\"",     Tokenizer::TYPE_STRING },
  { "'a\"b'",      Tokenizer::TYPE_STRING },
  { "\"a'b\"",     Tokenizer::TYPE_STRING },
  { "'a\\'b'",     Tokenizer::TYPE_STRING },
  { "\"a\\\"b\"",  Tokenizer::TYPE_STRING },
  { "'\\xf'",      Tokenizer::TYPE_STRING },
  { "'\\0'",       Tokenizer::TYPE_STRING },

  // Test symbols.
  { "+",           Tokenizer::TYPE_SYMBOL },
  { ".",           Tokenizer::TYPE_SYMBOL },
};

TEST_2D(TokenizerTest, SimpleTokens, kSimpleTokenCases, kBlockSizes) {
  // Set up the tokenizer.
  TestInputStream input(kSimpleTokenCases_case.input.data(),
                        kSimpleTokenCases_case.input.size(),
                        kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);

  // Before Next() is called, the initial token should always be TYPE_START.
  EXPECT_EQ(Tokenizer::TYPE_START, tokenizer.current().type);
  EXPECT_EQ("", tokenizer.current().text);
  EXPECT_EQ(0, tokenizer.current().line);
  EXPECT_EQ(0, tokenizer.current().column);
  EXPECT_EQ(0, tokenizer.current().end_column);

  // Parse the token.
  ASSERT_TRUE(tokenizer.Next());

  // Check that it has the right type.
  EXPECT_EQ(kSimpleTokenCases_case.type, tokenizer.current().type);
  // Check that it contains the complete input text.
  EXPECT_EQ(kSimpleTokenCases_case.input, tokenizer.current().text);
  // Check that it is located at the beginning of the input.
  EXPECT_EQ(0, tokenizer.current().line);
  EXPECT_EQ(0, tokenizer.current().column);
  EXPECT_EQ(kSimpleTokenCases_case.input.size(),
            tokenizer.current().end_column);

  // There should be no more input.
  EXPECT_FALSE(tokenizer.Next());

  // After Next() returns false, the token should have type TYPE_END.
  EXPECT_EQ(Tokenizer::TYPE_END, tokenizer.current().type);
  EXPECT_EQ("", tokenizer.current().text);
  EXPECT_EQ(0, tokenizer.current().line);
  EXPECT_EQ(kSimpleTokenCases_case.input.size(), tokenizer.current().column);
  EXPECT_EQ(kSimpleTokenCases_case.input.size(),
            tokenizer.current().end_column);

  // There should be no errors.
  EXPECT_TRUE(error_collector.text_.empty());
}

TEST_1D(TokenizerTest, FloatSuffix, kBlockSizes) {
  // Test the "allow_f_after_float" option.

  // Set up the tokenizer.
  const char* text = "1f 2.5f 6e3f 7F";
  TestInputStream input(text, strlen(text), kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);
  tokenizer.set_allow_f_after_float(true);

  // Advance through tokens and check that they are parsed as expected.
  ASSERT_TRUE(tokenizer.Next());
  EXPECT_EQ(tokenizer.current().text, "1f");
  EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);
  ASSERT_TRUE(tokenizer.Next());
  EXPECT_EQ(tokenizer.current().text, "2.5f");
  EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);
  ASSERT_TRUE(tokenizer.Next());
  EXPECT_EQ(tokenizer.current().text, "6e3f");
  EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);
  ASSERT_TRUE(tokenizer.Next());
  EXPECT_EQ(tokenizer.current().text, "7F");
  EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);

  // There should be no more input.
  EXPECT_FALSE(tokenizer.Next());
  // There should be no errors.
  EXPECT_TRUE(error_collector.text_.empty());
}

#endif

// -------------------------------------------------------------------

// In each case, the input is parsed to produce a list of tokens. The
// last token in "output" must have type TYPE_END.
struct MultiTokenCase {
  string input;
  Tokenizer::Token output[10];  // The compiler wants a constant array
                                // size for initialization to work. There
                                // is no reason this can't be increased if
                                // needed.
};

inline ostream& operator<<(ostream& out,
                           const MultiTokenCase& test_case) {
  return out << CEscape(test_case.input);
}

MultiTokenCase kMultiTokenCases[] = {
  // Test empty input.
  { "", {
    { Tokenizer::TYPE_END       , ""     , 0,  0 },
  }},

  // Test all token types at the same time.
  { "foo 1 1.2 + 'bar'", {
    { Tokenizer::TYPE_IDENTIFIER, "foo"  , 0,  0,  3 },
    { Tokenizer::TYPE_INTEGER   , "1"    , 0,  4,  5 },
    { Tokenizer::TYPE_FLOAT     , "1.2"  , 0,  6,  9 },
    { Tokenizer::TYPE_SYMBOL    , "+"    , 0, 10, 11 },
    { Tokenizer::TYPE_STRING    , "'bar'", 0, 12, 17 },
    { Tokenizer::TYPE_END       , ""     , 0, 17, 17 },
  }},

  // Test that consecutive symbols are parsed as separate tokens.
  { "!@+%", {
    { Tokenizer::TYPE_SYMBOL    , "!"    , 0, 0, 1 },
    { Tokenizer::TYPE_SYMBOL    , "@"    , 0, 1, 2 },
    { Tokenizer::TYPE_SYMBOL    , "+"    , 0, 2, 3 },
    { Tokenizer::TYPE_SYMBOL    , "%"    , 0, 3, 4 },
    { Tokenizer::TYPE_END       , ""     , 0, 4, 4 },
  }},

  // Test that newlines affect line numbers correctly.
  { "foo bar\nrab oof", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0, 0, 3 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 0, 4, 7 },
    { Tokenizer::TYPE_IDENTIFIER, "rab", 1, 0, 3 },
    { Tokenizer::TYPE_IDENTIFIER, "oof", 1, 4, 7 },
    { Tokenizer::TYPE_END       , ""   , 1, 7, 7 },
  }},

  // Test that tabs affect column numbers correctly.
  { "foo\tbar \tbaz", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0,  3 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 0,  8, 11 },
    { Tokenizer::TYPE_IDENTIFIER, "baz", 0, 16, 19 },
    { Tokenizer::TYPE_END       , ""   , 0, 19, 19 },
  }},

  // Test that tabs in string literals affect column numbers correctly.
  { "\"foo\tbar\" baz", {
    { Tokenizer::TYPE_STRING    , "\"foo\tbar\"", 0,  0, 12 },
    { Tokenizer::TYPE_IDENTIFIER, "baz"         , 0, 13, 16 },
    { Tokenizer::TYPE_END       , ""            , 0, 16, 16 },
  }},

  // Test that line comments are ignored.
  { "foo // This is a comment\n"
    "bar // This is another comment", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0,  3 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 1,  0,  3 },
    { Tokenizer::TYPE_END       , ""   , 1, 30, 30 },
  }},

  // Test that block comments are ignored.
  { "foo /* This is a block comment */ bar", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0,  3 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 0, 34, 37 },
    { Tokenizer::TYPE_END       , ""   , 0, 37, 37 },
  }},

  // Test that sh-style comments are not ignored by default.
  { "foo # bar\n"
    "baz", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0, 0, 3 },
    { Tokenizer::TYPE_SYMBOL    , "#"  , 0, 4, 5 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 0, 6, 9 },
    { Tokenizer::TYPE_IDENTIFIER, "baz", 1, 0, 3 },
    { Tokenizer::TYPE_END       , ""   , 1, 3, 3 },
  }},

  // Test all whitespace chars
  { "foo\n\t\r\v\fbar", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0,  3 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 1, 11, 14 },
    { Tokenizer::TYPE_END       , ""   , 1, 14, 14 },
  }},
};

TEST_2D(TokenizerTest, MultipleTokens, kMultiTokenCases, kBlockSizes) {
  // Set up the tokenizer.
  TestInputStream input(kMultiTokenCases_case.input.data(),
                        kMultiTokenCases_case.input.size(),
                        kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);

  // Before Next() is called, the initial token should always be TYPE_START.
  EXPECT_EQ(Tokenizer::TYPE_START, tokenizer.current().type);
  EXPECT_EQ("", tokenizer.current().text);
  EXPECT_EQ(0, tokenizer.current().line);
  EXPECT_EQ(0, tokenizer.current().column);
  EXPECT_EQ(0, tokenizer.current().end_column);

  // Loop through all expected tokens.
  int i = 0;
  Tokenizer::Token token;
  do {
    token = kMultiTokenCases_case.output[i++];

    SCOPED_TRACE(testing::Message() << "Token #" << i << ": " << token.text);

    Tokenizer::Token previous = tokenizer.current();

    // Next() should only return false when it hits the end token.
    if (token.type != Tokenizer::TYPE_END) {
      ASSERT_TRUE(tokenizer.Next());
    } else {
      ASSERT_FALSE(tokenizer.Next());
    }

    // Check that the previous token is set correctly.
    EXPECT_EQ(previous.type, tokenizer.previous().type);
    EXPECT_EQ(previous.text, tokenizer.previous().text);
    EXPECT_EQ(previous.line, tokenizer.previous().line);
    EXPECT_EQ(previous.column, tokenizer.previous().column);
    EXPECT_EQ(previous.end_column, tokenizer.previous().end_column);

    // Check that the token matches the expected one.
    EXPECT_EQ(token.type, tokenizer.current().type);
    EXPECT_EQ(token.text, tokenizer.current().text);
    EXPECT_EQ(token.line, tokenizer.current().line);
    EXPECT_EQ(token.column, tokenizer.current().column);
    EXPECT_EQ(token.end_column, tokenizer.current().end_column);

  } while (token.type != Tokenizer::TYPE_END);

  // There should be no errors.
  EXPECT_TRUE(error_collector.text_.empty());
}

// This test causes gcc 3.3.5 (and earlier?) to give the cryptic error:
//   "sorry, unimplemented: `method_call_expr' not supported by dump_expr"
#if !defined(__GNUC__) || __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 3)

TEST_1D(TokenizerTest, ShCommentStyle, kBlockSizes) {
  // Test the "comment_style" option.

  const char* text = "foo # bar\n"
                     "baz // qux\n"
                     "corge /* grault */\n"
                     "garply";
  const char* const kTokens[] = {"foo",  // "# bar" is ignored
                                 "baz", "/", "/", "qux",
                                 "corge", "/", "*", "grault", "*", "/",
                                 "garply"};

  // Set up the tokenizer.
  TestInputStream input(text, strlen(text), kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);
  tokenizer.set_comment_style(Tokenizer::SH_COMMENT_STYLE);

  // Advance through tokens and check that they are parsed as expected.
  for (int i = 0; i < GOOGLE_ARRAYSIZE(kTokens); i++) {
    EXPECT_TRUE(tokenizer.Next());
    EXPECT_EQ(tokenizer.current().text, kTokens[i]);
  }

  // There should be no more input.
  EXPECT_FALSE(tokenizer.Next());
  // There should be no errors.
  EXPECT_TRUE(error_collector.text_.empty());
}

#endif

// -------------------------------------------------------------------

// In each case, the input is expected to have two tokens named "prev" and
// "next" with comments in between.
struct DocCommentCase {
  string input;

  const char* prev_trailing_comments;
  const char* detached_comments[10];
  const char* next_leading_comments;
};

inline ostream& operator<<(ostream& out,
                           const DocCommentCase& test_case) {
  return out << CEscape(test_case.input);
}

DocCommentCase kDocCommentCases[] = {
  {
    "prev next",

    "",
    {},
    ""
  },

  {
    "prev /* ignored */ next",

    "",
    {},
    ""
  },

  {
    "prev // trailing comment\n"
    "next",

    " trailing comment\n",
    {},
    ""
  },

  {
    "prev\n"
    "// leading comment\n"
    "// line 2\n"
    "next",

    "",
    {},
    " leading comment\n"
    " line 2\n"
  },

  {
    "prev\n"
    "// trailing comment\n"
    "// line 2\n"
    "\n"
    "next",

    " trailing comment\n"
    " line 2\n",
    {},
    ""
  },

  {
    "prev // trailing comment\n"
    "// leading comment\n"
    "// line 2\n"
    "next",

    " trailing comment\n",
    {},
    " leading comment\n"
    " line 2\n"
  },

  {
    "prev /* trailing block comment */\n"
    "/* leading block comment\n"
    " * line 2\n"
    " * line 3 */"
    "next",

    " trailing block comment ",
    {},
    " leading block comment\n"
    " line 2\n"
    " line 3 "
  },

  {
    "prev\n"
    "/* trailing block comment\n"
    " * line 2\n"
    " * line 3\n"
    " */\n"
    "/* leading block comment\n"
    " * line 2\n"
    " * line 3 */"
    "next",

    " trailing block comment\n"
    " line 2\n"
    " line 3\n",
    {},
    " leading block comment\n"
    " line 2\n"
    " line 3 "
  },

  {
    "prev\n"
    "// trailing comment\n"
    "\n"
    "// detached comment\n"
    "// line 2\n"
    "\n"
    "// second detached comment\n"
    "/* third detached comment\n"
    " * line 2 */\n"
    "// leading comment\n"
    "next",

    " trailing comment\n",
    {
      " detached comment\n"
      " line 2\n",
      " second detached comment\n",
      " third detached comment\n"
      " line 2 "
    },
    " leading comment\n"
  },

  {
    "prev /**/\n"
    "\n"
    "// detached comment\n"
    "\n"
    "// leading comment\n"
    "next",

    "",
    {
      " detached comment\n"
    },
    " leading comment\n"
  },

  {
    "prev /**/\n"
    "// leading comment\n"
    "next",

    "",
    {},
    " leading comment\n"
  },
};

TEST_2D(TokenizerTest, DocComments, kDocCommentCases, kBlockSizes) {
  // Set up the tokenizer.
  TestInputStream input(kDocCommentCases_case.input.data(),
                        kDocCommentCases_case.input.size(),
                        kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);

  // Set up a second tokenizer where we'll pass all NULLs to
  // NextWithComments().
  TestInputStream input2(kDocCommentCases_case.input.data(),
                         kDocCommentCases_case.input.size(),
                         kBlockSizes_case);
  Tokenizer tokenizer2(&input2, &error_collector);

  tokenizer.Next();
  tokenizer2.Next();

  EXPECT_EQ("prev", tokenizer.current().text);
  EXPECT_EQ("prev", tokenizer2.current().text);

  string prev_trailing_comments;
  vector<string> detached_comments;
  string next_leading_comments;
  tokenizer.NextWithComments(&prev_trailing_comments, &detached_comments,
                             &next_leading_comments);
  tokenizer2.NextWithComments(NULL, NULL, NULL);
  EXPECT_EQ("next", tokenizer.current().text);
  EXPECT_EQ("next", tokenizer2.current().text);

  EXPECT_EQ(kDocCommentCases_case.prev_trailing_comments,
            prev_trailing_comments);

  for (int i = 0; i < detached_comments.size(); i++) {
    ASSERT_LT(i, GOOGLE_ARRAYSIZE(kDocCommentCases));
    ASSERT_TRUE(kDocCommentCases_case.detached_comments[i] != NULL);
    EXPECT_EQ(kDocCommentCases_case.detached_comments[i],
              detached_comments[i]);
  }

  // Verify that we matched all the detached comments.
  EXPECT_EQ(NULL,
      kDocCommentCases_case.detached_comments[detached_comments.size()]);

  EXPECT_EQ(kDocCommentCases_case.next_leading_comments,
            next_leading_comments);
}

// -------------------------------------------------------------------

// Test parse helpers. It's not really worth setting up a full data-driven
// test here.
TEST_F(TokenizerTest, ParseInteger) {
  EXPECT_EQ(0, ParseInteger("0"));
  EXPECT_EQ(123, ParseInteger("123"));
  EXPECT_EQ(0xabcdef12u, ParseInteger("0xabcdef12"));
  EXPECT_EQ(0xabcdef12u, ParseInteger("0xABCDEF12"));
  EXPECT_EQ(kuint64max, ParseInteger("0xFFFFFFFFFFFFFFFF"));
  EXPECT_EQ(01234567, ParseInteger("01234567"));
  EXPECT_EQ(0X123, ParseInteger("0X123"));

  // Test invalid integers that may still be tokenized as integers.
  EXPECT_EQ(0, ParseInteger("0x"));

  uint64 i;
#ifdef PROTOBUF_HAS_DEATH_TEST  // death tests do not work on Windows yet
  // Test invalid integers that will never be tokenized as integers.
  EXPECT_DEBUG_DEATH(Tokenizer::ParseInteger("zxy", kuint64max, &i),
    "passed text that could not have been tokenized as an integer");
  EXPECT_DEBUG_DEATH(Tokenizer::ParseInteger("1.2", kuint64max, &i),
    "passed text that could not have been tokenized as an integer");
  EXPECT_DEBUG_DEATH(Tokenizer::ParseInteger("08", kuint64max, &i),
    "passed text that could not have been tokenized as an integer");
  EXPECT_DEBUG_DEATH(Tokenizer::ParseInteger("0xg", kuint64max, &i),
    "passed text that could not have been tokenized as an integer");
  EXPECT_DEBUG_DEATH(Tokenizer::ParseInteger("-1", kuint64max, &i),
    "passed text that could not have been tokenized as an integer");
#endif  // PROTOBUF_HAS_DEATH_TEST

  // Test overflows.
  EXPECT_TRUE (Tokenizer::ParseInteger("0", 0, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("1", 0, &i));
  EXPECT_TRUE (Tokenizer::ParseInteger("1", 1, &i));
  EXPECT_TRUE (Tokenizer::ParseInteger("12345", 12345, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("12346", 12345, &i));
  EXPECT_TRUE (Tokenizer::ParseInteger("0xFFFFFFFFFFFFFFFF" , kuint64max, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("0x10000000000000000", kuint64max, &i));
}

TEST_F(TokenizerTest, ParseFloat) {
  EXPECT_DOUBLE_EQ(1    , Tokenizer::ParseFloat("1."));
  EXPECT_DOUBLE_EQ(1e3  , Tokenizer::ParseFloat("1e3"));
  EXPECT_DOUBLE_EQ(1e3  , Tokenizer::ParseFloat("1E3"));
  EXPECT_DOUBLE_EQ(1.5e3, Tokenizer::ParseFloat("1.5e3"));
  EXPECT_DOUBLE_EQ(.1   , Tokenizer::ParseFloat(".1"));
  EXPECT_DOUBLE_EQ(.25  , Tokenizer::ParseFloat(".25"));
  EXPECT_DOUBLE_EQ(.1e3 , Tokenizer::ParseFloat(".1e3"));
  EXPECT_DOUBLE_EQ(.25e3, Tokenizer::ParseFloat(".25e3"));
  EXPECT_DOUBLE_EQ(.1e+3, Tokenizer::ParseFloat(".1e+3"));
  EXPECT_DOUBLE_EQ(.1e-3, Tokenizer::ParseFloat(".1e-3"));
  EXPECT_DOUBLE_EQ(5    , Tokenizer::ParseFloat("5"));
  EXPECT_DOUBLE_EQ(6e-12, Tokenizer::ParseFloat("6e-12"));
  EXPECT_DOUBLE_EQ(1.2  , Tokenizer::ParseFloat("1.2"));
  EXPECT_DOUBLE_EQ(1.e2 , Tokenizer::ParseFloat("1.e2"));

  // Test invalid floats that may still be tokenized as floats.
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1e"));
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1e-"));
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1.e"));

  // Test 'f' suffix.
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1f"));
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1.0f"));
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1F"));

  // These should parse successfully even though they are out of range.
  // Overflows become infinity and underflows become zero.
  EXPECT_EQ(     0.0, Tokenizer::ParseFloat("1e-9999999999999999999999999999"));
  EXPECT_EQ(HUGE_VAL, Tokenizer::ParseFloat("1e+9999999999999999999999999999"));

#ifdef PROTOBUF_HAS_DEATH_TEST  // death tests do not work on Windows yet
  // Test invalid floats that will never be tokenized as floats.
  EXPECT_DEBUG_DEATH(Tokenizer::ParseFloat("zxy"),
    "passed text that could not have been tokenized as a float");
  EXPECT_DEBUG_DEATH(Tokenizer::ParseFloat("1-e0"),
    "passed text that could not have been tokenized as a float");
  EXPECT_DEBUG_DEATH(Tokenizer::ParseFloat("-1.0"),
    "passed text that could not have been tokenized as a float");
#endif  // PROTOBUF_HAS_DEATH_TEST
}

TEST_F(TokenizerTest, ParseString) {
  string output;
  Tokenizer::ParseString("'hello'", &output);
  EXPECT_EQ("hello", output);
  Tokenizer::ParseString("\"blah\\nblah2\"", &output);
  EXPECT_EQ("blah\nblah2", output);
  Tokenizer::ParseString("'\\1x\\1\\123\\739\\52\\334n\\3'", &output);
  EXPECT_EQ("\1x\1\123\739\52\334n\3", output);
  Tokenizer::ParseString("'\\x20\\x4'", &output);
  EXPECT_EQ("\x20\x4", output);

  // Test invalid strings that may still be tokenized as strings.
  Tokenizer::ParseString("\"\\a\\l\\v\\t", &output);  // \l is invalid
  EXPECT_EQ("\a?\v\t", output);
  Tokenizer::ParseString("'", &output);
  EXPECT_EQ("", output);
  Tokenizer::ParseString("'\\", &output);
  EXPECT_EQ("\\", output);

  // Experiment with Unicode escapes. Here are one-, two-, three- and
  // four-byte Unicode characters.
  Tokenizer::ParseString("'\\u0024\\u00a2\\u20ac\\U00024b62XX'", &output);
  EXPECT_EQ("$¢€𤭢XX", output);
  // Same thing encoded using UTF16.
  Tokenizer::ParseString("'\\u0024\\u00a2\\u20ac\\ud852\\udf62XX'", &output);
  EXPECT_EQ("$¢€𤭢XX", output);
  // Here's some broken UTF16; there's a head surrogate with no tail surrogate.
  // We just output this as if it were UTF8; it's not a defined code point, but
  // it has a defined encoding.
  Tokenizer::ParseString("'\\ud852XX'", &output);
  EXPECT_EQ("\xed\xa1\x92XX", output);
  // Malformed escape: Demons may fly out of the nose.
  Tokenizer::ParseString("\\u0", &output);
  EXPECT_EQ("u0", output);

  // Test invalid strings that will never be tokenized as strings.
#ifdef PROTOBUF_HAS_DEATH_TEST  // death tests do not work on Windows yet
  EXPECT_DEBUG_DEATH(Tokenizer::ParseString("", &output),
    "passed text that could not have been tokenized as a string");
#endif  // PROTOBUF_HAS_DEATH_TEST
}

TEST_F(TokenizerTest, ParseStringAppend) {
  // Check that ParseString and ParseStringAppend differ.
  string output("stuff+");
  Tokenizer::ParseStringAppend("'hello'", &output);
  EXPECT_EQ("stuff+hello", output);
  Tokenizer::ParseString("'hello'", &output);
  EXPECT_EQ("hello", output);
}

// -------------------------------------------------------------------

// Each case parses some input text, ignoring the tokens produced, and
// checks that the error output matches what is expected.
struct ErrorCase {
  string input;
  bool recoverable;   // True if the tokenizer should be able to recover and
                      // parse more tokens after seeing this error. Cases
                      // for which this is true must end with "foo" as
                      // the last token, which the test will check for.
  const char* errors;
};

inline ostream& operator<<(ostream& out,
                           const ErrorCase& test_case) {
  return out << CEscape(test_case.input);
}

ErrorCase kErrorCases[] = {
  // String errors.
  { "'\\l' foo", true,
    "0:2: Invalid escape sequence in string literal.\n" },
  { "'\\x' foo", true,
    "0:3: Expected hex digits for escape sequence.\n" },
  { "'foo", false,
    "0:4: Unexpected end of string.\n" },
  { "'bar\nfoo", true,
    "0:4: String literals cannot cross line boundaries.\n" },
  { "'\\u01' foo", true,
    "0:5: Expected four hex digits for \\u escape sequence.\n" },
  { "'\\u01' foo", true,
    "0:5: Expected four hex digits for \\u escape sequence.\n" },
  { "'\\uXYZ' foo", true,
    "0:3: Expected four hex digits for \\u escape sequence.\n" },

  // Integer errors.
  { "123foo", true,
    "0:3: Need space between number and identifier.\n" },

  // Hex/octal errors.
  { "0x foo", true,
    "0:2: \"0x\" must be followed by hex digits.\n" },
  { "0541823 foo", true,
    "0:4: Numbers starting with leading zero must be in octal.\n" },
  { "0x123z foo", true,
    "0:5: Need space between number and identifier.\n" },
  { "0x123.4 foo", true,
    "0:5: Hex and octal numbers must be integers.\n" },
  { "0123.4 foo", true,
    "0:4: Hex and octal numbers must be integers.\n" },

  // Float errors.
  { "1e foo", true,
    "0:2: \"e\" must be followed by exponent.\n" },
  { "1e- foo", true,
    "0:3: \"e\" must be followed by exponent.\n" },
  { "1.2.3 foo", true,
    "0:3: Already saw decimal point or exponent; can't have another one.\n" },
  { "1e2.3 foo", true,
    "0:3: Already saw decimal point or exponent; can't have another one.\n" },
  { "a.1 foo", true,
    "0:1: Need space between identifier and decimal point.\n" },
  // allow_f_after_float not enabled, so this should be an error.
  { "1.0f foo", true,
    "0:3: Need space between number and identifier.\n" },

  // Block comment errors.
  { "/*", false,
    "0:2: End-of-file inside block comment.\n"
    "0:0: Comment started here.\n" },
  { "/*/*/ foo", true,
    "0:3: \"/*\" inside block comment. Block comments cannot be nested.\n" },

  // Control characters. Multiple consecutive control characters should only
  // produce one error.
  { "\b foo", true,
    "0:0: Invalid control characters encountered in text.\n" },
  { "\b\b foo", true,
    "0:0: Invalid control characters encountered in text.\n" },

  // Check that control characters at end of input don't result in an
  // infinite loop.
  { "\b", false,
    "0:0: Invalid control characters encountered in text.\n" },

  // Check recovery from '\0'. We have to explicitly specify the length of
  // these strings because otherwise the string constructor will just call
  // strlen() which will see the first '\0' and think that is the end of the
  // string.
  { string("\0foo", 4), true,
    "0:0: Invalid control characters encountered in text.\n" },
  { string("\0\0foo", 5), true,
    "0:0: Invalid control characters encountered in text.\n" },

  // Check error from high order bits set
  { "\300foo", true,
    "0:0: Interpreting non ascii codepoint 192.\n" },
};

TEST_2D(TokenizerTest, Errors, kErrorCases, kBlockSizes) {
  // Set up the tokenizer.
  TestInputStream input(kErrorCases_case.input.data(),
                        kErrorCases_case.input.size(),
                        kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);

  // Ignore all input, except remember if the last token was "foo".
  bool last_was_foo = false;
  while (tokenizer.Next()) {
    last_was_foo = tokenizer.current().text == "foo";
  }

  // Check that the errors match what was expected.
  EXPECT_EQ(kErrorCases_case.errors, error_collector.text_);

  // If the error was recoverable, make sure we saw "foo" after it.
  if (kErrorCases_case.recoverable) {
    EXPECT_TRUE(last_was_foo);
  }
}

// -------------------------------------------------------------------

TEST_1D(TokenizerTest, BackUpOnDestruction, kBlockSizes) {
  string text = "foo bar";
  TestInputStream input(text.data(), text.size(), kBlockSizes_case);

  // Create a tokenizer, read one token, then destroy it.
  {
    TestErrorCollector error_collector;
    Tokenizer tokenizer(&input, &error_collector);

    tokenizer.Next();
  }

  // Only "foo" should have been read.
  EXPECT_EQ(strlen("foo"), input.ByteCount());
}


}  // namespace
}  // namespace io
}  // namespace protobuf
}  // namespace google