// Protocol Buffers - Google's data interchange format
// Copyright 2008 Google Inc.  All rights reserved.
// https://developers.google.com/protocol-buffers/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// Author: kenton@google.com (Kenton Varda)
//  Based on original Protocol Buffers design by
//  Sanjay Ghemawat, Jeff Dean, and others.

#include <limits.h>
#include <math.h>

#include <vector>

#include <google/protobuf/io/tokenizer.h>
#include <google/protobuf/io/zero_copy_stream_impl.h>

#include <google/protobuf/stubs/common.h>
#include <google/protobuf/stubs/strutil.h>
#include <google/protobuf/stubs/substitute.h>
#include <google/protobuf/testing/googletest.h>
#include <gtest/gtest.h>

namespace google {
namespace protobuf {
namespace io {
namespace {

// ===================================================================
// Data-Driven Test Infrastructure

// TODO(kenton):  This is copied from coded_stream_unittest.  This is
//   temporary until these features are integrated into gTest itself.

// TEST_1D and TEST_2D are macros I'd eventually like to see added to
// gTest.  These macros can be used to declare tests which should be
// run multiple times, once for each item in some input array.  TEST_1D
// tests all cases in a single input array.  TEST_2D tests all
// combinations of cases from two arrays.  The arrays must be statically
// defined such that the GOOGLE_ARRAYSIZE() macro works on them.  Example:
//
// int kCases[] = {1, 2, 3, 4};
// TEST_1D(MyFixture, MyTest, kCases) {
//   EXPECT_GT(kCases_case, 0);
// }
//
// This test iterates through the numbers 1, 2, 3, and 4 and tests that
// they are all greater than zero.  In case of failure, the exact case
// which failed will be printed.  The case type must be printable using
// ostream::operator<<.
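//
// TEST_2D works the same way over two arrays, running the body once for
// each combination of elements; inside the body the current elements are
// available as <CASES1>_case and <CASES2>_case.  A minimal sketch for
// illustration only (the names kCases1 and kCases2 are hypothetical):
//
// int kCases1[] = {1, 2};
// int kCases2[] = {3, 4};
// TEST_2D(MyFixture, MyTest, kCases1, kCases2) {
//   EXPECT_LT(kCases1_case, kCases2_case);
// }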

#define TEST_1D(FIXTURE, NAME, CASES)                                      \
  class FIXTURE##_##NAME##_DD : public FIXTURE {                           \
   protected:                                                              \
    template <typename CaseType>                                           \
    void DoSingleCase(const CaseType& CASES##_case);                       \
  };                                                                       \
                                                                           \
  TEST_F(FIXTURE##_##NAME##_DD, NAME) {                                    \
    for (int i = 0; i < GOOGLE_ARRAYSIZE(CASES); i++) {                    \
      SCOPED_TRACE(testing::Message()                                      \
        << #CASES " case #" << i << ": " << CASES[i]);                     \
      DoSingleCase(CASES[i]);                                              \
    }                                                                      \
  }                                                                        \
                                                                           \
  template <typename CaseType>                                             \
  void FIXTURE##_##NAME##_DD::DoSingleCase(const CaseType& CASES##_case)

#define TEST_2D(FIXTURE, NAME, CASES1, CASES2)                             \
  class FIXTURE##_##NAME##_DD : public FIXTURE {                           \
   protected:                                                              \
    template <typename CaseType1, typename CaseType2>                      \
    void DoSingleCase(const CaseType1& CASES1##_case,                      \
                      const CaseType2& CASES2##_case);                     \
  };                                                                       \
                                                                           \
  TEST_F(FIXTURE##_##NAME##_DD, NAME) {                                    \
    for (int i = 0; i < GOOGLE_ARRAYSIZE(CASES1); i++) {                   \
      for (int j = 0; j < GOOGLE_ARRAYSIZE(CASES2); j++) {                 \
        SCOPED_TRACE(testing::Message()                                    \
          << #CASES1 " case #" << i << ": " << CASES1[i] << ", "           \
          << #CASES2 " case #" << j << ": " << CASES2[j]);                 \
        DoSingleCase(CASES1[i], CASES2[j]);                                \
      }                                                                    \
    }                                                                      \
  }                                                                        \
                                                                           \
  template <typename CaseType1, typename CaseType2>                        \
  void FIXTURE##_##NAME##_DD::DoSingleCase(const CaseType1& CASES1##_case, \
                                           const CaseType2& CASES2##_case)

// -------------------------------------------------------------------

// An input stream that is basically like an ArrayInputStream but sometimes
// returns empty buffers, just to throw us off.
class TestInputStream : public ZeroCopyInputStream {
 public:
  TestInputStream(const void* data, int size, int block_size)
    : array_stream_(data, size, block_size), counter_(0) {}
  ~TestInputStream() {}

  // implements ZeroCopyInputStream ----------------------------------
  bool Next(const void** data, int* size) {
    // We'll return an empty buffer on the first call and on every subsequent
    // call whose index is a multiple of 3 or 5.
    if (counter_ % 3 == 0 || counter_ % 5 == 0) {
      *data = NULL;
      *size = 0;
      ++counter_;
      return true;
    } else {
      ++counter_;
      return array_stream_.Next(data, size);
    }
  }

  void BackUp(int count)  { return array_stream_.BackUp(count); }
  bool Skip(int count)    { return array_stream_.Skip(count);   }
  int64 ByteCount() const { return array_stream_.ByteCount();   }

 private:
  ArrayInputStream array_stream_;
  int counter_;
};

// -------------------------------------------------------------------

// An error collector which simply concatenates all its errors into a big
// block of text which can be checked.
class TestErrorCollector : public ErrorCollector {
 public:
  TestErrorCollector() {}
  ~TestErrorCollector() {}

  string text_;

  // implements ErrorCollector ---------------------------------------
  void AddError(int line, int column, const string& message) {
    strings::SubstituteAndAppend(&text_, "$0:$1: $2\n",
                                 line, column, message);
  }
};

// -------------------------------------------------------------------

// We test each operation over a variety of block sizes to ensure that
// we test cases where reads cross buffer boundaries as well as cases
// where they don't.  This is sort of a brute-force approach, but it's
// easy to write and easy to understand.
const int kBlockSizes[] = {1, 2, 3, 5, 7, 13, 32, 1024};

class TokenizerTest : public testing::Test {
 protected:
  // For easy testing.
  uint64 ParseInteger(const string& text) {
    uint64 result;
    EXPECT_TRUE(Tokenizer::ParseInteger(text, kuint64max, &result));
    return result;
  }
};

// ===================================================================

// These tests cause gcc 3.3.5 (and earlier?) to give the cryptic error:
//   "sorry, unimplemented: `method_call_expr' not supported by dump_expr"
#if !defined(__GNUC__) || __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 3)

// In each test case, the entire input text should parse as a single token
// of the given type.
struct SimpleTokenCase {
  string input;
  Tokenizer::TokenType type;
};

inline ostream& operator<<(ostream& out,
                           const SimpleTokenCase& test_case) {
  return out << CEscape(test_case.input);
}

SimpleTokenCase kSimpleTokenCases[] = {
  // Test identifiers.
  { "hello",       Tokenizer::TYPE_IDENTIFIER },

  // Test integers.
  { "123",         Tokenizer::TYPE_INTEGER },
  { "0xab6",       Tokenizer::TYPE_INTEGER },
  { "0XAB6",       Tokenizer::TYPE_INTEGER },
  { "0X1234567",   Tokenizer::TYPE_INTEGER },
  { "0x89abcdef",  Tokenizer::TYPE_INTEGER },
  { "0x89ABCDEF",  Tokenizer::TYPE_INTEGER },
  { "01234567",    Tokenizer::TYPE_INTEGER },

  // Test floats.
  { "123.45",      Tokenizer::TYPE_FLOAT },
  { "1.",          Tokenizer::TYPE_FLOAT },
  { "1e3",         Tokenizer::TYPE_FLOAT },
  { "1E3",         Tokenizer::TYPE_FLOAT },
  { "1e-3",        Tokenizer::TYPE_FLOAT },
  { "1e+3",        Tokenizer::TYPE_FLOAT },
  { "1.e3",        Tokenizer::TYPE_FLOAT },
  { "1.2e3",       Tokenizer::TYPE_FLOAT },
  { ".1",          Tokenizer::TYPE_FLOAT },
  { ".1e3",        Tokenizer::TYPE_FLOAT },
  { ".1e-3",       Tokenizer::TYPE_FLOAT },
  { ".1e+3",       Tokenizer::TYPE_FLOAT },

  // Test strings.
  { "'hello'",     Tokenizer::TYPE_STRING },
  { "\"foo\"",     Tokenizer::TYPE_STRING },
  { "'a\"b'",      Tokenizer::TYPE_STRING },
  { "\"a'b\"",     Tokenizer::TYPE_STRING },
  { "'a\\'b'",     Tokenizer::TYPE_STRING },
  { "\"a\\\"b\"",  Tokenizer::TYPE_STRING },
  { "'\\xf'",      Tokenizer::TYPE_STRING },
  { "'\\0'",       Tokenizer::TYPE_STRING },

  // Test symbols.
  { "+",           Tokenizer::TYPE_SYMBOL },
  { ".",           Tokenizer::TYPE_SYMBOL },
};

TEST_2D(TokenizerTest, SimpleTokens, kSimpleTokenCases, kBlockSizes) {
  // Set up the tokenizer.
  TestInputStream input(kSimpleTokenCases_case.input.data(),
                        kSimpleTokenCases_case.input.size(),
                        kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);

  // Before Next() is called, the initial token should always be TYPE_START.
  EXPECT_EQ(Tokenizer::TYPE_START, tokenizer.current().type);
  EXPECT_EQ("", tokenizer.current().text);
  EXPECT_EQ(0, tokenizer.current().line);
  EXPECT_EQ(0, tokenizer.current().column);
  EXPECT_EQ(0, tokenizer.current().end_column);

  // Parse the token.
  ASSERT_TRUE(tokenizer.Next());

  // Check that it has the right type.
  EXPECT_EQ(kSimpleTokenCases_case.type, tokenizer.current().type);
  // Check that it contains the complete input text.
  EXPECT_EQ(kSimpleTokenCases_case.input, tokenizer.current().text);
  // Check that it is located at the beginning of the input.
  EXPECT_EQ(0, tokenizer.current().line);
  EXPECT_EQ(0, tokenizer.current().column);
  EXPECT_EQ(kSimpleTokenCases_case.input.size(),
            tokenizer.current().end_column);

  // There should be no more input.
  EXPECT_FALSE(tokenizer.Next());

  // After Next() returns false, the token should have type TYPE_END.
  EXPECT_EQ(Tokenizer::TYPE_END, tokenizer.current().type);
  EXPECT_EQ("", tokenizer.current().text);
  EXPECT_EQ(0, tokenizer.current().line);
  EXPECT_EQ(kSimpleTokenCases_case.input.size(), tokenizer.current().column);
  EXPECT_EQ(kSimpleTokenCases_case.input.size(),
            tokenizer.current().end_column);

  // There should be no errors.
  EXPECT_TRUE(error_collector.text_.empty());
}

TEST_1D(TokenizerTest, FloatSuffix, kBlockSizes) {
  // Test the "allow_f_after_float" option.

  // Set up the tokenizer.
  const char* text = "1f 2.5f 6e3f 7F";
  TestInputStream input(text, strlen(text), kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);
  tokenizer.set_allow_f_after_float(true);

  // Advance through tokens and check that they are parsed as expected.
  ASSERT_TRUE(tokenizer.Next());
  EXPECT_EQ(tokenizer.current().text, "1f");
  EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);
  ASSERT_TRUE(tokenizer.Next());
  EXPECT_EQ(tokenizer.current().text, "2.5f");
  EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);
  ASSERT_TRUE(tokenizer.Next());
  EXPECT_EQ(tokenizer.current().text, "6e3f");
  EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);
  ASSERT_TRUE(tokenizer.Next());
  EXPECT_EQ(tokenizer.current().text, "7F");
  EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);

  // There should be no more input.
  EXPECT_FALSE(tokenizer.Next());
  // There should be no errors.
  EXPECT_TRUE(error_collector.text_.empty());
}

#endif

// -------------------------------------------------------------------

// In each case, the input is parsed to produce a list of tokens.  The
// last token in "output" must have type TYPE_END.
struct MultiTokenCase {
  string input;
  Tokenizer::Token output[10];  // The compiler wants a constant array
                                // size for initialization to work.  There
                                // is no reason this can't be increased if
                                // needed.
};

inline ostream& operator<<(ostream& out,
                           const MultiTokenCase& test_case) {
  return out << CEscape(test_case.input);
}

MultiTokenCase kMultiTokenCases[] = {
  // Test empty input.
  { "", {
    { Tokenizer::TYPE_END       , ""     , 0,  0 },
  }},

  // Test all token types at the same time.
  { "foo 1 1.2 + 'bar'", {
    { Tokenizer::TYPE_IDENTIFIER, "foo"  , 0,  0,  3 },
    { Tokenizer::TYPE_INTEGER   , "1"    , 0,  4,  5 },
    { Tokenizer::TYPE_FLOAT     , "1.2"  , 0,  6,  9 },
    { Tokenizer::TYPE_SYMBOL    , "+"    , 0, 10, 11 },
    { Tokenizer::TYPE_STRING    , "'bar'", 0, 12, 17 },
    { Tokenizer::TYPE_END       , ""     , 0, 17, 17 },
  }},

  // Test that consecutive symbols are parsed as separate tokens.
  { "!@+%", {
    { Tokenizer::TYPE_SYMBOL    , "!"    , 0, 0, 1 },
    { Tokenizer::TYPE_SYMBOL    , "@"    , 0, 1, 2 },
    { Tokenizer::TYPE_SYMBOL    , "+"    , 0, 2, 3 },
    { Tokenizer::TYPE_SYMBOL    , "%"    , 0, 3, 4 },
    { Tokenizer::TYPE_END       , ""     , 0, 4, 4 },
  }},

  // Test that newlines affect line numbers correctly.
  { "foo bar\nrab oof", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0, 3 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 0,  4, 7 },
    { Tokenizer::TYPE_IDENTIFIER, "rab", 1,  0, 3 },
    { Tokenizer::TYPE_IDENTIFIER, "oof", 1,  4, 7 },
    { Tokenizer::TYPE_END       , ""   , 1,  7, 7 },
  }},

  // Test that tabs affect column numbers correctly.
  { "foo\tbar  \tbaz", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0,  3 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 0,  8, 11 },
    { Tokenizer::TYPE_IDENTIFIER, "baz", 0, 16, 19 },
    { Tokenizer::TYPE_END       , ""   , 0, 19, 19 },
  }},

  // Test that tabs in string literals affect column numbers correctly.
  { "\"foo\tbar\" baz", {
    { Tokenizer::TYPE_STRING    , "\"foo\tbar\"", 0,  0, 12 },
    { Tokenizer::TYPE_IDENTIFIER, "baz"         , 0, 13, 16 },
    { Tokenizer::TYPE_END       , ""            , 0, 16, 16 },
  }},

  // Test that line comments are ignored.
  { "foo // This is a comment\n"
    "bar // This is another comment", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0,  3 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 1,  0,  3 },
    { Tokenizer::TYPE_END       , ""   , 1, 30, 30 },
  }},

  // Test that block comments are ignored.
  { "foo /* This is a block comment */ bar", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0,  3 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 0, 34, 37 },
    { Tokenizer::TYPE_END       , ""   , 0, 37, 37 },
  }},

  // Test that sh-style comments are not ignored by default.
  { "foo # bar\n"
    "baz", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0, 0, 3 },
    { Tokenizer::TYPE_SYMBOL    , "#"  , 0, 4, 5 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 0, 6, 9 },
    { Tokenizer::TYPE_IDENTIFIER, "baz", 1, 0, 3 },
    { Tokenizer::TYPE_END       , ""   , 1, 3, 3 },
  }},

  // Test all whitespace chars
  { "foo\n\t\r\v\fbar", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0,  3 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 1, 11, 14 },
    { Tokenizer::TYPE_END       , ""   , 1, 14, 14 },
  }},
};

TEST_2D(TokenizerTest, MultipleTokens, kMultiTokenCases, kBlockSizes) {
  // Set up the tokenizer.
  TestInputStream input(kMultiTokenCases_case.input.data(),
                        kMultiTokenCases_case.input.size(),
                        kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);

  // Before Next() is called, the initial token should always be TYPE_START.
  EXPECT_EQ(Tokenizer::TYPE_START, tokenizer.current().type);
  EXPECT_EQ("", tokenizer.current().text);
  EXPECT_EQ(0, tokenizer.current().line);
  EXPECT_EQ(0, tokenizer.current().column);
  EXPECT_EQ(0, tokenizer.current().end_column);

  // Loop through all expected tokens.
  int i = 0;
  Tokenizer::Token token;
  do {
    token = kMultiTokenCases_case.output[i++];

    SCOPED_TRACE(testing::Message() << "Token #" << i << ": " << token.text);

    Tokenizer::Token previous = tokenizer.current();

    // Next() should only return false when it hits the end token.
    if (token.type != Tokenizer::TYPE_END) {
      ASSERT_TRUE(tokenizer.Next());
    } else {
      ASSERT_FALSE(tokenizer.Next());
    }

    // Check that the previous token is set correctly.
    EXPECT_EQ(previous.type, tokenizer.previous().type);
    EXPECT_EQ(previous.text, tokenizer.previous().text);
    EXPECT_EQ(previous.line, tokenizer.previous().line);
    EXPECT_EQ(previous.column, tokenizer.previous().column);
    EXPECT_EQ(previous.end_column, tokenizer.previous().end_column);

    // Check that the token matches the expected one.
    EXPECT_EQ(token.type, tokenizer.current().type);
    EXPECT_EQ(token.text, tokenizer.current().text);
    EXPECT_EQ(token.line, tokenizer.current().line);
    EXPECT_EQ(token.column, tokenizer.current().column);
    EXPECT_EQ(token.end_column, tokenizer.current().end_column);

  } while (token.type != Tokenizer::TYPE_END);

  // There should be no errors.
  EXPECT_TRUE(error_collector.text_.empty());
}

// This test causes gcc 3.3.5 (and earlier?) to give the cryptic error:
//   "sorry, unimplemented: `method_call_expr' not supported by dump_expr"
#if !defined(__GNUC__) || __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 3)

TEST_1D(TokenizerTest, ShCommentStyle, kBlockSizes) {
  // Test the "comment_style" option.

  const char* text = "foo # bar\n"
                     "baz // qux\n"
                     "corge /* grault */\n"
                     "garply";
  const char* const kTokens[] = {"foo",  // "# bar" is ignored
                                 "baz", "/", "/", "qux",
                                 "corge", "/", "*", "grault", "*", "/",
                                 "garply"};

  // Set up the tokenizer.
  TestInputStream input(text, strlen(text), kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);
  tokenizer.set_comment_style(Tokenizer::SH_COMMENT_STYLE);

  // Advance through tokens and check that they are parsed as expected.
  for (int i = 0; i < GOOGLE_ARRAYSIZE(kTokens); i++) {
    EXPECT_TRUE(tokenizer.Next());
    EXPECT_EQ(tokenizer.current().text, kTokens[i]);
  }

  // There should be no more input.
  EXPECT_FALSE(tokenizer.Next());
  // There should be no errors.
  EXPECT_TRUE(error_collector.text_.empty());
}

#endif

// -------------------------------------------------------------------

// In each case, the input is expected to have two tokens named "prev" and
// "next" with comments in between.
struct DocCommentCase {
  string input;

  const char* prev_trailing_comments;
  const char* detached_comments[10];
  const char* next_leading_comments;
};

inline ostream& operator<<(ostream& out,
                           const DocCommentCase& test_case) {
  return out << CEscape(test_case.input);
}

DocCommentCase kDocCommentCases[] = {
  {
    "prev next",

    "",
    {},
    ""
  },

  {
    "prev /* ignored */ next",

    "",
    {},
    ""
  },

  {
    "prev // trailing comment\n"
    "next",

    " trailing comment\n",
    {},
    ""
  },

  {
    "prev\n"
    "// leading comment\n"
    "// line 2\n"
    "next",

    "",
    {},
    " leading comment\n"
    " line 2\n"
  },

  {
    "prev\n"
    "// trailing comment\n"
    "// line 2\n"
    "\n"
    "next",

    " trailing comment\n"
    " line 2\n",
    {},
    ""
  },

  {
    "prev // trailing comment\n"
    "// leading comment\n"
    "// line 2\n"
    "next",

    " trailing comment\n",
    {},
    " leading comment\n"
    " line 2\n"
  },

  {
    "prev /* trailing block comment */\n"
    "/* leading block comment\n"
    " * line 2\n"
    " * line 3 */"
    "next",

    " trailing block comment ",
    {},
    " leading block comment\n"
    " line 2\n"
    " line 3 "
  },

  {
    "prev\n"
    "/* trailing block comment\n"
    " * line 2\n"
    " * line 3\n"
    " */\n"
    "/* leading block comment\n"
    " * line 2\n"
    " * line 3 */"
    "next",

    " trailing block comment\n"
    " line 2\n"
    " line 3\n",
    {},
    " leading block comment\n"
    " line 2\n"
    " line 3 "
  },

  {
    "prev\n"
    "// trailing comment\n"
    "\n"
    "// detached comment\n"
    "// line 2\n"
    "\n"
    "// second detached comment\n"
    "/* third detached comment\n"
    " * line 2 */\n"
    "// leading comment\n"
    "next",

    " trailing comment\n",
    {
      " detached comment\n"
      " line 2\n",
      " second detached comment\n",
      " third detached comment\n"
      " line 2 "
    },
    " leading comment\n"
  },

  {
    "prev /**/\n"
    "\n"
    "// detached comment\n"
    "\n"
    "// leading comment\n"
    "next",

    "",
    {
      " detached comment\n"
    },
    " leading comment\n"
  },

  {
    "prev /**/\n"
    "// leading comment\n"
    "next",

    "",
    {},
    " leading comment\n"
  },
};

TEST_2D(TokenizerTest, DocComments, kDocCommentCases, kBlockSizes) {
  // Set up the tokenizer.
  TestInputStream input(kDocCommentCases_case.input.data(),
                        kDocCommentCases_case.input.size(),
                        kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);

  // Set up a second tokenizer where we'll pass all NULLs to NextWithComments().
  TestInputStream input2(kDocCommentCases_case.input.data(),
                         kDocCommentCases_case.input.size(),
                         kBlockSizes_case);
  Tokenizer tokenizer2(&input2, &error_collector);

  tokenizer.Next();
  tokenizer2.Next();

  EXPECT_EQ("prev", tokenizer.current().text);
  EXPECT_EQ("prev", tokenizer2.current().text);

  string prev_trailing_comments;
  vector<string> detached_comments;
  string next_leading_comments;
  tokenizer.NextWithComments(&prev_trailing_comments, &detached_comments,
                             &next_leading_comments);
  tokenizer2.NextWithComments(NULL, NULL, NULL);
  EXPECT_EQ("next", tokenizer.current().text);
  EXPECT_EQ("next", tokenizer2.current().text);

  EXPECT_EQ(kDocCommentCases_case.prev_trailing_comments,
            prev_trailing_comments);

  for (int i = 0; i < detached_comments.size(); i++) {
    ASSERT_LT(i, GOOGLE_ARRAYSIZE(kDocCommentCases_case.detached_comments));
    ASSERT_TRUE(kDocCommentCases_case.detached_comments[i] != NULL);
    EXPECT_EQ(kDocCommentCases_case.detached_comments[i],
              detached_comments[i]);
  }

  // Verify that we matched all the detached comments.
  EXPECT_EQ(NULL,
      kDocCommentCases_case.detached_comments[detached_comments.size()]);

  EXPECT_EQ(kDocCommentCases_case.next_leading_comments,
            next_leading_comments);
}

// -------------------------------------------------------------------

// Test parse helpers.  It's not really worth setting up a full data-driven
// test here.
TEST_F(TokenizerTest, ParseInteger) {
  EXPECT_EQ(0, ParseInteger("0"));
  EXPECT_EQ(123, ParseInteger("123"));
  EXPECT_EQ(0xabcdef12u, ParseInteger("0xabcdef12"));
  EXPECT_EQ(0xabcdef12u, ParseInteger("0xABCDEF12"));
  EXPECT_EQ(kuint64max, ParseInteger("0xFFFFFFFFFFFFFFFF"));
  EXPECT_EQ(01234567, ParseInteger("01234567"));
  EXPECT_EQ(0X123, ParseInteger("0X123"));

  // Test invalid integers that may still be tokenized as integers.
  EXPECT_EQ(0, ParseInteger("0x"));

  uint64 i;
#ifdef PROTOBUF_HAS_DEATH_TEST  // death tests do not work on Windows yet
  // Test invalid integers that will never be tokenized as integers.
  EXPECT_DEBUG_DEATH(Tokenizer::ParseInteger("zxy", kuint64max, &i),
    "passed text that could not have been tokenized as an integer");
  EXPECT_DEBUG_DEATH(Tokenizer::ParseInteger("1.2", kuint64max, &i),
    "passed text that could not have been tokenized as an integer");
  EXPECT_DEBUG_DEATH(Tokenizer::ParseInteger("08", kuint64max, &i),
    "passed text that could not have been tokenized as an integer");
  EXPECT_DEBUG_DEATH(Tokenizer::ParseInteger("0xg", kuint64max, &i),
    "passed text that could not have been tokenized as an integer");
  EXPECT_DEBUG_DEATH(Tokenizer::ParseInteger("-1", kuint64max, &i),
    "passed text that could not have been tokenized as an integer");
#endif  // PROTOBUF_HAS_DEATH_TEST

  // Test overflows.
  EXPECT_TRUE (Tokenizer::ParseInteger("0", 0, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("1", 0, &i));
  EXPECT_TRUE (Tokenizer::ParseInteger("1", 1, &i));
  EXPECT_TRUE (Tokenizer::ParseInteger("12345", 12345, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("12346", 12345, &i));
  EXPECT_TRUE (Tokenizer::ParseInteger("0xFFFFFFFFFFFFFFFF" , kuint64max, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("0x10000000000000000", kuint64max, &i));
}

TEST_F(TokenizerTest, ParseFloat) {
  EXPECT_DOUBLE_EQ(1    , Tokenizer::ParseFloat("1."));
  EXPECT_DOUBLE_EQ(1e3  , Tokenizer::ParseFloat("1e3"));
  EXPECT_DOUBLE_EQ(1e3  , Tokenizer::ParseFloat("1E3"));
  EXPECT_DOUBLE_EQ(1.5e3, Tokenizer::ParseFloat("1.5e3"));
  EXPECT_DOUBLE_EQ(.1   , Tokenizer::ParseFloat(".1"));
  EXPECT_DOUBLE_EQ(.25  , Tokenizer::ParseFloat(".25"));
  EXPECT_DOUBLE_EQ(.1e3 , Tokenizer::ParseFloat(".1e3"));
  EXPECT_DOUBLE_EQ(.25e3, Tokenizer::ParseFloat(".25e3"));
  EXPECT_DOUBLE_EQ(.1e+3, Tokenizer::ParseFloat(".1e+3"));
  EXPECT_DOUBLE_EQ(.1e-3, Tokenizer::ParseFloat(".1e-3"));
  EXPECT_DOUBLE_EQ(5    , Tokenizer::ParseFloat("5"));
  EXPECT_DOUBLE_EQ(6e-12, Tokenizer::ParseFloat("6e-12"));
  EXPECT_DOUBLE_EQ(1.2  , Tokenizer::ParseFloat("1.2"));
  EXPECT_DOUBLE_EQ(1.e2 , Tokenizer::ParseFloat("1.e2"));

  // Test invalid floats that may still be tokenized as floats.
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1e"));
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1e-"));
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1.e"));

  // Test 'f' suffix.
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1f"));
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1.0f"));
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1F"));

  // These should parse successfully even though they are out of range.
  // Overflows become infinity and underflows become zero.
  EXPECT_EQ(     0.0, Tokenizer::ParseFloat("1e-9999999999999999999999999999"));
  EXPECT_EQ(HUGE_VAL, Tokenizer::ParseFloat("1e+9999999999999999999999999999"));

#ifdef PROTOBUF_HAS_DEATH_TEST  // death tests do not work on Windows yet
  // Test invalid floats that will never be tokenized as floats.
  EXPECT_DEBUG_DEATH(Tokenizer::ParseFloat("zxy"),
    "passed text that could not have been tokenized as a float");
  EXPECT_DEBUG_DEATH(Tokenizer::ParseFloat("1-e0"),
    "passed text that could not have been tokenized as a float");
  EXPECT_DEBUG_DEATH(Tokenizer::ParseFloat("-1.0"),
    "passed text that could not have been tokenized as a float");
#endif  // PROTOBUF_HAS_DEATH_TEST
}

TEST_F(TokenizerTest, ParseString) {
  string output;
  Tokenizer::ParseString("'hello'", &output);
  EXPECT_EQ("hello", output);
  Tokenizer::ParseString("\"blah\\nblah2\"", &output);
  EXPECT_EQ("blah\nblah2", output);
  Tokenizer::ParseString("'\\1x\\1\\123\\739\\52\\334n\\3'", &output);
  EXPECT_EQ("\1x\1\123\739\52\334n\3", output);
  Tokenizer::ParseString("'\\x20\\x4'", &output);
  EXPECT_EQ("\x20\x4", output);

  // Test invalid strings that may still be tokenized as strings.
  Tokenizer::ParseString("\"\\a\\l\\v\\t", &output);  // \l is invalid
  EXPECT_EQ("\a?\v\t", output);
  Tokenizer::ParseString("'", &output);
  EXPECT_EQ("", output);
  Tokenizer::ParseString("'\\", &output);
  EXPECT_EQ("\\", output);

  // Experiment with Unicode escapes.  Here are one-, two-, three- and
  // four-byte Unicode characters.
  Tokenizer::ParseString("'\\u0024\\u00a2\\u20ac\\U00024b62XX'", &output);
  EXPECT_EQ("$¢€𤭢XX", output);
  // Same thing encoded using UTF16.
  Tokenizer::ParseString("'\\u0024\\u00a2\\u20ac\\ud852\\udf62XX'", &output);
  EXPECT_EQ("$¢€𤭢XX", output);
  // Here's some broken UTF16; there's a head surrogate with no tail surrogate.
  // We just output this as if it were UTF8; it's not a defined code point, but
  // it has a defined encoding.
  Tokenizer::ParseString("'\\ud852XX'", &output);
  EXPECT_EQ("\xed\xa1\x92XX", output);
  // Malformed escape: Demons may fly out of the nose.
  Tokenizer::ParseString("\\u0", &output);
  EXPECT_EQ("u0", output);

  // Test invalid strings that will never be tokenized as strings.
#ifdef PROTOBUF_HAS_DEATH_TEST  // death tests do not work on Windows yet
  EXPECT_DEBUG_DEATH(Tokenizer::ParseString("", &output),
    "passed text that could not have been tokenized as a string");
#endif  // PROTOBUF_HAS_DEATH_TEST
}

TEST_F(TokenizerTest, ParseStringAppend) {
  // Check that ParseString and ParseStringAppend differ.
  string output("stuff+");
  Tokenizer::ParseStringAppend("'hello'", &output);
  EXPECT_EQ("stuff+hello", output);
  Tokenizer::ParseString("'hello'", &output);
  EXPECT_EQ("hello", output);
}

// -------------------------------------------------------------------

// Each case parses some input text, ignoring the tokens produced, and
// checks that the error output matches what is expected.
struct ErrorCase {
  string input;
  bool recoverable;  // True if the tokenizer should be able to recover and
                     // parse more tokens after seeing this error.  Cases
                     // for which this is true must end with "foo" as
                     // the last token, which the test will check for.
  const char* errors;
};

inline ostream& operator<<(ostream& out,
                           const ErrorCase& test_case) {
  return out << CEscape(test_case.input);
}

ErrorCase kErrorCases[] = {
  // String errors.
  { "'\\l' foo", true,
    "0:2: Invalid escape sequence in string literal.\n" },
  { "'\\x' foo", true,
    "0:3: Expected hex digits for escape sequence.\n" },
  { "'foo", false,
    "0:4: Unexpected end of string.\n" },
  { "'bar\nfoo", true,
    "0:4: String literals cannot cross line boundaries.\n" },
  { "'\\u01' foo", true,
    "0:5: Expected four hex digits for \\u escape sequence.\n" },
  { "'\\u01' foo", true,
    "0:5: Expected four hex digits for \\u escape sequence.\n" },
  { "'\\uXYZ' foo", true,
    "0:3: Expected four hex digits for \\u escape sequence.\n" },

  // Integer errors.
  { "123foo", true,
    "0:3: Need space between number and identifier.\n" },

  // Hex/octal errors.
  { "0x foo", true,
    "0:2: \"0x\" must be followed by hex digits.\n" },
  { "0541823 foo", true,
    "0:4: Numbers starting with leading zero must be in octal.\n" },
  { "0x123z foo", true,
    "0:5: Need space between number and identifier.\n" },
  { "0x123.4 foo", true,
    "0:5: Hex and octal numbers must be integers.\n" },
  { "0123.4 foo", true,
    "0:4: Hex and octal numbers must be integers.\n" },

  // Float errors.
  { "1e foo", true,
    "0:2: \"e\" must be followed by exponent.\n" },
  { "1e- foo", true,
    "0:3: \"e\" must be followed by exponent.\n" },
  { "1.2.3 foo", true,
    "0:3: Already saw decimal point or exponent; can't have another one.\n" },
  { "1e2.3 foo", true,
    "0:3: Already saw decimal point or exponent; can't have another one.\n" },
  { "a.1 foo", true,
    "0:1: Need space between identifier and decimal point.\n" },
  // allow_f_after_float not enabled, so this should be an error.
  { "1.0f foo", true,
    "0:3: Need space between number and identifier.\n" },

  // Block comment errors.
  { "/*", false,
    "0:2: End-of-file inside block comment.\n"
    "0:0:   Comment started here.\n"},
  { "/*/*/ foo", true,
    "0:3: \"/*\" inside block comment.  Block comments cannot be nested.\n"},

  // Control characters.  Multiple consecutive control characters should only
  // produce one error.
  { "\b foo", true,
    "0:0: Invalid control characters encountered in text.\n" },
  { "\b\b foo", true,
    "0:0: Invalid control characters encountered in text.\n" },

  // Check that control characters at end of input don't result in an
  // infinite loop.
  { "\b", false,
    "0:0: Invalid control characters encountered in text.\n" },

  // Check recovery from '\0'.  We have to explicitly specify the length of
  // these strings because otherwise the string constructor will just call
  // strlen() which will see the first '\0' and think that is the end of the
  // string.
  { string("\0foo", 4), true,
    "0:0: Invalid control characters encountered in text.\n" },
  { string("\0\0foo", 5), true,
    "0:0: Invalid control characters encountered in text.\n" },

  // Check error from high order bits set
  { "\300foo", true,
    "0:0: Interpreting non ascii codepoint 192.\n" },
};

TEST_2D(TokenizerTest, Errors, kErrorCases, kBlockSizes) {
  // Set up the tokenizer.
  TestInputStream input(kErrorCases_case.input.data(),
                        kErrorCases_case.input.size(),
                        kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);

  // Ignore all input, except remember if the last token was "foo".
  bool last_was_foo = false;
  while (tokenizer.Next()) {
    last_was_foo = tokenizer.current().text == "foo";
  }

  // Check that the errors match what was expected.
  EXPECT_EQ(kErrorCases_case.errors, error_collector.text_);

  // If the error was recoverable, make sure we saw "foo" after it.
  if (kErrorCases_case.recoverable) {
    EXPECT_TRUE(last_was_foo);
  }
}

// -------------------------------------------------------------------

TEST_1D(TokenizerTest, BackUpOnDestruction, kBlockSizes) {
  string text = "foo bar";
  TestInputStream input(text.data(), text.size(), kBlockSizes_case);

  // Create a tokenizer, read one token, then destroy it.
  {
    TestErrorCollector error_collector;
    Tokenizer tokenizer(&input, &error_collector);

    tokenizer.Next();
  }

  // Only "foo" should have been read.
  EXPECT_EQ(strlen("foo"), input.ByteCount());
}


}  // namespace
}  // namespace io
}  // namespace protobuf
}  // namespace google