// Protocol Buffers - Google's data interchange format
// Copyright 2008 Google Inc.  All rights reserved.
// https://developers.google.com/protocol-buffers/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// Author: kenton@google.com (Kenton Varda)
//  Based on original Protocol Buffers design by
//  Sanjay Ghemawat, Jeff Dean, and others.

#include <limits.h>
#include <math.h>

#include <vector>

#include <google/protobuf/io/tokenizer.h>
#include <google/protobuf/io/zero_copy_stream_impl.h>

#include <google/protobuf/stubs/common.h>
#include <google/protobuf/stubs/logging.h>
#include <google/protobuf/stubs/strutil.h>
#include <google/protobuf/stubs/substitute.h>
#include <google/protobuf/testing/googletest.h>
#include <gtest/gtest.h>

namespace google {
namespace protobuf {
namespace io {
namespace {

// ===================================================================
// Data-Driven Test Infrastructure

// TODO(kenton):  This is copied from coded_stream_unittest.  This is
//   temporary until these features are integrated into gTest itself.

// TEST_1D and TEST_2D are macros I'd eventually like to see added to
// gTest.  These macros can be used to declare tests which should be
// run multiple times, once for each item in some input array.  TEST_1D
// tests all cases in a single input array.  TEST_2D tests all
// combinations of cases from two arrays.  The arrays must be statically
// defined such that the GOOGLE_ARRAYSIZE() macro works on them.  Example:
//
// int kCases[] = {1, 2, 3, 4};
// TEST_1D(MyFixture, MyTest, kCases) {
//   EXPECT_GT(kCases_case, 0);
// }
//
// This test iterates through the numbers 1, 2, 3, and 4 and tests that
// they are all greater than zero.  In case of failure, the exact case
// which failed will be printed.  The case type must be printable using
// ostream::operator<<.
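//
// TEST_2D works the same way over two arrays; a hypothetical sketch
// (kSizes, kNames, and MyFixture are illustrative names only):
//
// int kSizes[] = {1, 2};
// const char* kNames[] = {"a", "b"};
// TEST_2D(MyFixture, MyTest, kSizes, kNames) {
//   EXPECT_GT(kSizes_case, 0);
//   EXPECT_FALSE(std::string(kNames_case).empty());
// }
//
// Here DoSingleCase() runs once per combination -- 2 x 2 = 4 times in this
// example -- with the current elements available as kSizes_case and
// kNames_case.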

#define TEST_1D(FIXTURE, NAME, CASES)                             \
  class FIXTURE##_##NAME##_DD : public FIXTURE {                  \
   protected:                                                     \
    template <typename CaseType>                                  \
    void DoSingleCase(const CaseType& CASES##_case);              \
  };                                                              \
                                                                  \
  TEST_F(FIXTURE##_##NAME##_DD, NAME) {                           \
    for (int i = 0; i < GOOGLE_ARRAYSIZE(CASES); i++) {           \
      SCOPED_TRACE(testing::Message()                             \
                   << #CASES " case #" << i << ": " << CASES[i]); \
      DoSingleCase(CASES[i]);                                     \
    }                                                             \
  }                                                               \
                                                                  \
  template <typename CaseType>                                    \
  void FIXTURE##_##NAME##_DD::DoSingleCase(const CaseType& CASES##_case)

#define TEST_2D(FIXTURE, NAME, CASES1, CASES2)                              \
  class FIXTURE##_##NAME##_DD : public FIXTURE {                            \
   protected:                                                               \
    template <typename CaseType1, typename CaseType2>                       \
    void DoSingleCase(const CaseType1& CASES1##_case,                       \
                      const CaseType2& CASES2##_case);                      \
  };                                                                        \
                                                                            \
  TEST_F(FIXTURE##_##NAME##_DD, NAME) {                                     \
    for (int i = 0; i < GOOGLE_ARRAYSIZE(CASES1); i++) {                    \
      for (int j = 0; j < GOOGLE_ARRAYSIZE(CASES2); j++) {                  \
        SCOPED_TRACE(testing::Message()                                     \
                     << #CASES1 " case #" << i << ": " << CASES1[i] << ", " \
                     << #CASES2 " case #" << j << ": " << CASES2[j]);       \
        DoSingleCase(CASES1[i], CASES2[j]);                                 \
      }                                                                     \
    }                                                                       \
  }                                                                         \
                                                                            \
  template <typename CaseType1, typename CaseType2>                         \
  void FIXTURE##_##NAME##_DD::DoSingleCase(const CaseType1& CASES1##_case,  \
                                           const CaseType2& CASES2##_case)

// -------------------------------------------------------------------

// An input stream that is basically like an ArrayInputStream but sometimes
// returns empty buffers, just to throw us off.
class TestInputStream : public ZeroCopyInputStream {
 public:
  TestInputStream(const void* data, int size, int block_size)
      : array_stream_(data, size, block_size), counter_(0) {}
  ~TestInputStream() {}

  // implements ZeroCopyInputStream ----------------------------------
  bool Next(const void** data, int* size) {
    // Return an empty buffer for the first read and for every read whose
    // index is a multiple of 3 or 5, just to throw the caller off.
    if (counter_ % 3 == 0 || counter_ % 5 == 0) {
      *data = NULL;
      *size = 0;
      ++counter_;
      return true;
    } else {
      ++counter_;
      return array_stream_.Next(data, size);
    }
  }

  void BackUp(int count) { return array_stream_.BackUp(count); }
  bool Skip(int count) { return array_stream_.Skip(count); }
  int64 ByteCount() const { return array_stream_.ByteCount(); }

 private:
  ArrayInputStream array_stream_;
  int counter_;
};

// -------------------------------------------------------------------

// An error collector which simply concatenates all its errors into a big
// block of text which can be checked.
class TestErrorCollector : public ErrorCollector {
 public:
  TestErrorCollector() {}
  ~TestErrorCollector() {}

  std::string text_;

  // implements ErrorCollector ---------------------------------------
  void AddError(int line, int column, const std::string& message) {
    strings::SubstituteAndAppend(&text_, "$0:$1: $2\n", line, column, message);
  }
};

// -------------------------------------------------------------------

// We test each operation over a variety of block sizes to ensure that
// we test cases where reads cross buffer boundaries as well as cases
// where they don't.  This is sort of a brute-force approach, but it's
// easy to write and easy to understand.
const int kBlockSizes[] = {1, 2, 3, 5, 7, 13, 32, 1024};

class TokenizerTest : public testing::Test {
 protected:
  // For easy testing.
  uint64 ParseInteger(const std::string& text) {
    uint64 result;
    EXPECT_TRUE(Tokenizer::ParseInteger(text, kuint64max, &result));
    return result;
  }
};

// ===================================================================

// These tests cause gcc 3.3.5 (and earlier?) to give the cryptic error:
//   "sorry, unimplemented: `method_call_expr' not supported by dump_expr"
#if !defined(__GNUC__) || __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 3)

// In each test case, the entire input text should parse as a single token
// of the given type.
struct SimpleTokenCase {
  std::string input;
  Tokenizer::TokenType type;
};

inline std::ostream& operator<<(std::ostream& out,
                                const SimpleTokenCase& test_case) {
  return out << CEscape(test_case.input);
}

SimpleTokenCase kSimpleTokenCases[] = {
    // Test identifiers.
    {"hello", Tokenizer::TYPE_IDENTIFIER},

    // Test integers.
    {"123", Tokenizer::TYPE_INTEGER},
    {"0xab6", Tokenizer::TYPE_INTEGER},
    {"0XAB6", Tokenizer::TYPE_INTEGER},
    {"0X1234567", Tokenizer::TYPE_INTEGER},
    {"0x89abcdef", Tokenizer::TYPE_INTEGER},
    {"0x89ABCDEF", Tokenizer::TYPE_INTEGER},
    {"01234567", Tokenizer::TYPE_INTEGER},

    // Test floats.
    {"123.45", Tokenizer::TYPE_FLOAT},
    {"1.", Tokenizer::TYPE_FLOAT},
    {"1e3", Tokenizer::TYPE_FLOAT},
    {"1E3", Tokenizer::TYPE_FLOAT},
    {"1e-3", Tokenizer::TYPE_FLOAT},
    {"1e+3", Tokenizer::TYPE_FLOAT},
    {"1.e3", Tokenizer::TYPE_FLOAT},
    {"1.2e3", Tokenizer::TYPE_FLOAT},
    {".1", Tokenizer::TYPE_FLOAT},
    {".1e3", Tokenizer::TYPE_FLOAT},
    {".1e-3", Tokenizer::TYPE_FLOAT},
    {".1e+3", Tokenizer::TYPE_FLOAT},

    // Test strings.
    {"'hello'", Tokenizer::TYPE_STRING},
    {"\"foo\"", Tokenizer::TYPE_STRING},
    {"'a\"b'", Tokenizer::TYPE_STRING},
    {"\"a'b\"", Tokenizer::TYPE_STRING},
    {"'a\\'b'", Tokenizer::TYPE_STRING},
    {"\"a\\\"b\"", Tokenizer::TYPE_STRING},
    {"'\\xf'", Tokenizer::TYPE_STRING},
    {"'\\0'", Tokenizer::TYPE_STRING},

    // Test symbols.
    {"+", Tokenizer::TYPE_SYMBOL},
    {".", Tokenizer::TYPE_SYMBOL},
};

TEST_2D(TokenizerTest, SimpleTokens, kSimpleTokenCases, kBlockSizes) {
  // Set up the tokenizer.
  TestInputStream input(kSimpleTokenCases_case.input.data(),
                        kSimpleTokenCases_case.input.size(), kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);

  // Before Next() is called, the initial token should always be TYPE_START.
  EXPECT_EQ(Tokenizer::TYPE_START, tokenizer.current().type);
  EXPECT_EQ("", tokenizer.current().text);
  EXPECT_EQ(0, tokenizer.current().line);
  EXPECT_EQ(0, tokenizer.current().column);
  EXPECT_EQ(0, tokenizer.current().end_column);

  // Parse the token.
  ASSERT_TRUE(tokenizer.Next());

  // Check that it has the right type.
  EXPECT_EQ(kSimpleTokenCases_case.type, tokenizer.current().type);
  // Check that it contains the complete input text.
  EXPECT_EQ(kSimpleTokenCases_case.input, tokenizer.current().text);
  // Check that it is located at the beginning of the input.
  EXPECT_EQ(0, tokenizer.current().line);
  EXPECT_EQ(0, tokenizer.current().column);
  EXPECT_EQ(kSimpleTokenCases_case.input.size(),
            tokenizer.current().end_column);

  // There should be no more input.
  EXPECT_FALSE(tokenizer.Next());

  // After Next() returns false, the token should have type TYPE_END.
  EXPECT_EQ(Tokenizer::TYPE_END, tokenizer.current().type);
  EXPECT_EQ("", tokenizer.current().text);
  EXPECT_EQ(0, tokenizer.current().line);
  EXPECT_EQ(kSimpleTokenCases_case.input.size(), tokenizer.current().column);
  EXPECT_EQ(kSimpleTokenCases_case.input.size(),
            tokenizer.current().end_column);

  // There should be no errors.
  EXPECT_TRUE(error_collector.text_.empty());
}

TEST_1D(TokenizerTest, FloatSuffix, kBlockSizes) {
  // Test the "allow_f_after_float" option.

  // Set up the tokenizer.
  const char* text = "1f 2.5f 6e3f 7F";
  TestInputStream input(text, strlen(text), kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);
  tokenizer.set_allow_f_after_float(true);

  // Advance through tokens and check that they are parsed as expected.
  ASSERT_TRUE(tokenizer.Next());
  EXPECT_EQ(tokenizer.current().text, "1f");
  EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);
  ASSERT_TRUE(tokenizer.Next());
  EXPECT_EQ(tokenizer.current().text, "2.5f");
  EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);
  ASSERT_TRUE(tokenizer.Next());
  EXPECT_EQ(tokenizer.current().text, "6e3f");
  EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);
  ASSERT_TRUE(tokenizer.Next());
  EXPECT_EQ(tokenizer.current().text, "7F");
  EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);

  // There should be no more input.
  EXPECT_FALSE(tokenizer.Next());
  // There should be no errors.
  EXPECT_TRUE(error_collector.text_.empty());
}

#endif

// -------------------------------------------------------------------

// In each case, the input is parsed to produce a list of tokens.  The
// last token in "output" must have type TYPE_END.
struct MultiTokenCase {
  std::string input;
  Tokenizer::Token output[10];  // The compiler wants a constant array
                                // size for initialization to work.  There
                                // is no reason this can't be increased if
                                // needed.
};

inline std::ostream& operator<<(std::ostream& out,
                                const MultiTokenCase& test_case) {
  return out << CEscape(test_case.input);
}

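// Each expected token below is written as
// {type, text, line, column, end_column}, matching the fields that the
// MultipleTokens test compares.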
MultiTokenCase kMultiTokenCases[] = {
    // Test empty input.
    {"",
     {
         {Tokenizer::TYPE_END, "", 0, 0, 0},
     }},

    // Test all token types at the same time.
    {"foo 1 1.2 + 'bar'",
     {
         {Tokenizer::TYPE_IDENTIFIER, "foo", 0, 0, 3},
         {Tokenizer::TYPE_INTEGER, "1", 0, 4, 5},
         {Tokenizer::TYPE_FLOAT, "1.2", 0, 6, 9},
         {Tokenizer::TYPE_SYMBOL, "+", 0, 10, 11},
         {Tokenizer::TYPE_STRING, "'bar'", 0, 12, 17},
         {Tokenizer::TYPE_END, "", 0, 17, 17},
     }},

    // Test that consecutive symbols are parsed as separate tokens.
    {"!@+%",
     {
         {Tokenizer::TYPE_SYMBOL, "!", 0, 0, 1},
         {Tokenizer::TYPE_SYMBOL, "@", 0, 1, 2},
         {Tokenizer::TYPE_SYMBOL, "+", 0, 2, 3},
         {Tokenizer::TYPE_SYMBOL, "%", 0, 3, 4},
         {Tokenizer::TYPE_END, "", 0, 4, 4},
     }},

    // Test that newlines affect line numbers correctly.
    {"foo bar\nrab oof",
     {
         {Tokenizer::TYPE_IDENTIFIER, "foo", 0, 0, 3},
         {Tokenizer::TYPE_IDENTIFIER, "bar", 0, 4, 7},
         {Tokenizer::TYPE_IDENTIFIER, "rab", 1, 0, 3},
         {Tokenizer::TYPE_IDENTIFIER, "oof", 1, 4, 7},
         {Tokenizer::TYPE_END, "", 1, 7, 7},
     }},

    // Test that tabs affect column numbers correctly.
    {"foo\tbar  \tbaz",
     {
         {Tokenizer::TYPE_IDENTIFIER, "foo", 0, 0, 3},
         {Tokenizer::TYPE_IDENTIFIER, "bar", 0, 8, 11},
         {Tokenizer::TYPE_IDENTIFIER, "baz", 0, 16, 19},
         {Tokenizer::TYPE_END, "", 0, 19, 19},
     }},

    // Test that tabs in string literals affect column numbers correctly.
    {"\"foo\tbar\" baz",
     {
         {Tokenizer::TYPE_STRING, "\"foo\tbar\"", 0, 0, 12},
         {Tokenizer::TYPE_IDENTIFIER, "baz", 0, 13, 16},
         {Tokenizer::TYPE_END, "", 0, 16, 16},
     }},

    // Test that line comments are ignored.
    {"foo // This is a comment\n"
     "bar // This is another comment",
     {
         {Tokenizer::TYPE_IDENTIFIER, "foo", 0, 0, 3},
         {Tokenizer::TYPE_IDENTIFIER, "bar", 1, 0, 3},
         {Tokenizer::TYPE_END, "", 1, 30, 30},
     }},

    // Test that block comments are ignored.
    {"foo /* This is a block comment */ bar",
     {
         {Tokenizer::TYPE_IDENTIFIER, "foo", 0, 0, 3},
         {Tokenizer::TYPE_IDENTIFIER, "bar", 0, 34, 37},
         {Tokenizer::TYPE_END, "", 0, 37, 37},
     }},

    // Test that sh-style comments are not ignored by default.
    {"foo # bar\n"
     "baz",
     {
         {Tokenizer::TYPE_IDENTIFIER, "foo", 0, 0, 3},
         {Tokenizer::TYPE_SYMBOL, "#", 0, 4, 5},
         {Tokenizer::TYPE_IDENTIFIER, "bar", 0, 6, 9},
         {Tokenizer::TYPE_IDENTIFIER, "baz", 1, 0, 3},
         {Tokenizer::TYPE_END, "", 1, 3, 3},
     }},

    // Test all whitespace chars.
    {"foo\n\t\r\v\fbar",
     {
         {Tokenizer::TYPE_IDENTIFIER, "foo", 0, 0, 3},
         {Tokenizer::TYPE_IDENTIFIER, "bar", 1, 11, 14},
         {Tokenizer::TYPE_END, "", 1, 14, 14},
     }},
};

TEST_2D(TokenizerTest, MultipleTokens, kMultiTokenCases, kBlockSizes) {
  // Set up the tokenizer.
  TestInputStream input(kMultiTokenCases_case.input.data(),
                        kMultiTokenCases_case.input.size(), kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);

  // Before Next() is called, the initial token should always be TYPE_START.
  EXPECT_EQ(Tokenizer::TYPE_START, tokenizer.current().type);
  EXPECT_EQ("", tokenizer.current().text);
  EXPECT_EQ(0, tokenizer.current().line);
  EXPECT_EQ(0, tokenizer.current().column);
  EXPECT_EQ(0, tokenizer.current().end_column);

  // Loop through all expected tokens.
  int i = 0;
  Tokenizer::Token token;
  do {
    token = kMultiTokenCases_case.output[i++];

    SCOPED_TRACE(testing::Message() << "Token #" << i << ": " << token.text);

    Tokenizer::Token previous = tokenizer.current();

    // Next() should only return false when it hits the end token.
    if (token.type != Tokenizer::TYPE_END) {
      ASSERT_TRUE(tokenizer.Next());
    } else {
      ASSERT_FALSE(tokenizer.Next());
    }

    // Check that the previous token is set correctly.
    EXPECT_EQ(previous.type, tokenizer.previous().type);
    EXPECT_EQ(previous.text, tokenizer.previous().text);
    EXPECT_EQ(previous.line, tokenizer.previous().line);
    EXPECT_EQ(previous.column, tokenizer.previous().column);
    EXPECT_EQ(previous.end_column, tokenizer.previous().end_column);

    // Check that the token matches the expected one.
    EXPECT_EQ(token.type, tokenizer.current().type);
    EXPECT_EQ(token.text, tokenizer.current().text);
    EXPECT_EQ(token.line, tokenizer.current().line);
    EXPECT_EQ(token.column, tokenizer.current().column);
    EXPECT_EQ(token.end_column, tokenizer.current().end_column);

  } while (token.type != Tokenizer::TYPE_END);

  // There should be no errors.
  EXPECT_TRUE(error_collector.text_.empty());
}

// This test causes gcc 3.3.5 (and earlier?) to give the cryptic error:
//   "sorry, unimplemented: `method_call_expr' not supported by dump_expr"
#if !defined(__GNUC__) || __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 3)

TEST_1D(TokenizerTest, ShCommentStyle, kBlockSizes) {
  // Test the "comment_style" option.

  const char* text =
      "foo # bar\n"
      "baz // qux\n"
      "corge /* grault */\n"
      "garply";
  const char* const kTokens[] = {"foo",  // "# bar" is ignored
                                 "baz", "/",      "/", "qux", "corge", "/",
                                 "*",   "grault", "*", "/",   "garply"};

  // Set up the tokenizer.
  TestInputStream input(text, strlen(text), kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);
  tokenizer.set_comment_style(Tokenizer::SH_COMMENT_STYLE);

  // Advance through tokens and check that they are parsed as expected.
  for (int i = 0; i < GOOGLE_ARRAYSIZE(kTokens); i++) {
    EXPECT_TRUE(tokenizer.Next());
    EXPECT_EQ(tokenizer.current().text, kTokens[i]);
  }

  // There should be no more input.
  EXPECT_FALSE(tokenizer.Next());
  // There should be no errors.
  EXPECT_TRUE(error_collector.text_.empty());
}

#endif

// -------------------------------------------------------------------

// In each case, the input is expected to have two tokens named "prev" and
// "next" with comments in between.
struct DocCommentCase {
  std::string input;

  const char* prev_trailing_comments;
  const char* detached_comments[10];
  const char* next_leading_comments;
};

inline std::ostream& operator<<(std::ostream& out,
                                const DocCommentCase& test_case) {
  return out << CEscape(test_case.input);
}

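// Each case below is written as {input, prev_trailing_comments,
// detached_comments, next_leading_comments}, in the order the struct
// above declares the fields.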
DocCommentCase kDocCommentCases[] = {
    {"prev next",

     "",
     {},
     ""},

    {"prev /* ignored */ next",

     "",
     {},
     ""},

    {"prev // trailing comment\n"
     "next",

     " trailing comment\n",
     {},
     ""},

    {"prev\n"
     "// leading comment\n"
     "// line 2\n"
     "next",

     "",
     {},
     " leading comment\n"
     " line 2\n"},

    {"prev\n"
     "// trailing comment\n"
     "// line 2\n"
     "\n"
     "next",

     " trailing comment\n"
     " line 2\n",
     {},
     ""},

    {"prev // trailing comment\n"
     "// leading comment\n"
     "// line 2\n"
     "next",

     " trailing comment\n",
     {},
     " leading comment\n"
     " line 2\n"},

    {"prev /* trailing block comment */\n"
     "/* leading block comment\n"
     " * line 2\n"
     " * line 3 */"
     "next",

     " trailing block comment ",
     {},
     " leading block comment\n"
     " line 2\n"
     " line 3 "},

    {"prev\n"
     "/* trailing block comment\n"
     " * line 2\n"
     " * line 3\n"
     " */\n"
     "/* leading block comment\n"
     " * line 2\n"
     " * line 3 */"
     "next",

     " trailing block comment\n"
     " line 2\n"
     " line 3\n",
     {},
     " leading block comment\n"
     " line 2\n"
     " line 3 "},

    {"prev\n"
     "// trailing comment\n"
     "\n"
     "// detached comment\n"
     "// line 2\n"
     "\n"
     "// second detached comment\n"
     "/* third detached comment\n"
     " * line 2 */\n"
     "// leading comment\n"
     "next",

     " trailing comment\n",
     {" detached comment\n"
      " line 2\n",
      " second detached comment\n",
      " third detached comment\n"
      " line 2 "},
     " leading comment\n"},

    {"prev /**/\n"
     "\n"
     "// detached comment\n"
     "\n"
     "// leading comment\n"
     "next",

     "",
     {" detached comment\n"},
     " leading comment\n"},

    {"prev /**/\n"
     "// leading comment\n"
     "next",

     "",
     {},
     " leading comment\n"},
};

TEST_2D(TokenizerTest, DocComments, kDocCommentCases, kBlockSizes) {
  // Set up the tokenizer.
  TestInputStream input(kDocCommentCases_case.input.data(),
                        kDocCommentCases_case.input.size(), kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);

  // Set up a second tokenizer where we'll pass all NULLs to NextWithComments().
  TestInputStream input2(kDocCommentCases_case.input.data(),
                         kDocCommentCases_case.input.size(), kBlockSizes_case);
  Tokenizer tokenizer2(&input2, &error_collector);

  tokenizer.Next();
  tokenizer2.Next();

  EXPECT_EQ("prev", tokenizer.current().text);
  EXPECT_EQ("prev", tokenizer2.current().text);

  std::string prev_trailing_comments;
  std::vector<std::string> detached_comments;
  std::string next_leading_comments;
  tokenizer.NextWithComments(&prev_trailing_comments, &detached_comments,
                             &next_leading_comments);
  tokenizer2.NextWithComments(NULL, NULL, NULL);
  EXPECT_EQ("next", tokenizer.current().text);
  EXPECT_EQ("next", tokenizer2.current().text);

  EXPECT_EQ(kDocCommentCases_case.prev_trailing_comments,
            prev_trailing_comments);

  for (int i = 0; i < detached_comments.size(); i++) {
    // Guard against indexing past the end of the expected detached_comments
    // array (not the case list).
    ASSERT_LT(i, GOOGLE_ARRAYSIZE(kDocCommentCases_case.detached_comments));
    ASSERT_TRUE(kDocCommentCases_case.detached_comments[i] != NULL);
    EXPECT_EQ(kDocCommentCases_case.detached_comments[i], detached_comments[i]);
  }

  // Verify that we matched all the detached comments.
  EXPECT_EQ(NULL,
            kDocCommentCases_case.detached_comments[detached_comments.size()]);

  EXPECT_EQ(kDocCommentCases_case.next_leading_comments, next_leading_comments);
}

// -------------------------------------------------------------------

// Test parse helpers.  It's not really worth setting up a full data-driven
// test here.
TEST_F(TokenizerTest, ParseInteger) {
  EXPECT_EQ(0, ParseInteger("0"));
  EXPECT_EQ(123, ParseInteger("123"));
  EXPECT_EQ(0xabcdef12u, ParseInteger("0xabcdef12"));
  EXPECT_EQ(0xabcdef12u, ParseInteger("0xABCDEF12"));
  EXPECT_EQ(kuint64max, ParseInteger("0xFFFFFFFFFFFFFFFF"));
  EXPECT_EQ(01234567, ParseInteger("01234567"));
  EXPECT_EQ(0X123, ParseInteger("0X123"));

  // Test invalid integers that may still be tokenized as integers.
  EXPECT_EQ(0, ParseInteger("0x"));

  uint64 i;

  // Test invalid integers that will never be tokenized as integers.
  EXPECT_FALSE(Tokenizer::ParseInteger("zxy", kuint64max, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("1.2", kuint64max, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("08", kuint64max, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("0xg", kuint64max, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("-1", kuint64max, &i));

  // Test overflows.
  EXPECT_TRUE(Tokenizer::ParseInteger("0", 0, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("1", 0, &i));
  EXPECT_TRUE(Tokenizer::ParseInteger("1", 1, &i));
  EXPECT_TRUE(Tokenizer::ParseInteger("12345", 12345, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("12346", 12345, &i));
  EXPECT_TRUE(Tokenizer::ParseInteger("0xFFFFFFFFFFFFFFFF", kuint64max, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("0x10000000000000000", kuint64max, &i));
}

TEST_F(TokenizerTest, ParseFloat) {
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1."));
  EXPECT_DOUBLE_EQ(1e3, Tokenizer::ParseFloat("1e3"));
  EXPECT_DOUBLE_EQ(1e3, Tokenizer::ParseFloat("1E3"));
  EXPECT_DOUBLE_EQ(1.5e3, Tokenizer::ParseFloat("1.5e3"));
  EXPECT_DOUBLE_EQ(.1, Tokenizer::ParseFloat(".1"));
  EXPECT_DOUBLE_EQ(.25, Tokenizer::ParseFloat(".25"));
  EXPECT_DOUBLE_EQ(.1e3, Tokenizer::ParseFloat(".1e3"));
  EXPECT_DOUBLE_EQ(.25e3, Tokenizer::ParseFloat(".25e3"));
  EXPECT_DOUBLE_EQ(.1e+3, Tokenizer::ParseFloat(".1e+3"));
  EXPECT_DOUBLE_EQ(.1e-3, Tokenizer::ParseFloat(".1e-3"));
  EXPECT_DOUBLE_EQ(5, Tokenizer::ParseFloat("5"));
  EXPECT_DOUBLE_EQ(6e-12, Tokenizer::ParseFloat("6e-12"));
  EXPECT_DOUBLE_EQ(1.2, Tokenizer::ParseFloat("1.2"));
  EXPECT_DOUBLE_EQ(1.e2, Tokenizer::ParseFloat("1.e2"));

  // Test invalid floats that may still be tokenized as floats.
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1e"));
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1e-"));
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1.e"));

  // Test 'f' suffix.
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1f"));
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1.0f"));
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1F"));

  // These should parse successfully even though they are out of range.
  // Overflows become infinity and underflows become zero.
  EXPECT_EQ(0.0, Tokenizer::ParseFloat("1e-9999999999999999999999999999"));
  EXPECT_EQ(HUGE_VAL, Tokenizer::ParseFloat("1e+9999999999999999999999999999"));

#ifdef PROTOBUF_HAS_DEATH_TEST  // death tests do not work on Windows yet
  // Test invalid floats that will never be tokenized as floats.
  EXPECT_DEBUG_DEATH(
      Tokenizer::ParseFloat("zxy"),
      "passed text that could not have been tokenized as a float");
  EXPECT_DEBUG_DEATH(
      Tokenizer::ParseFloat("1-e0"),
      "passed text that could not have been tokenized as a float");
  EXPECT_DEBUG_DEATH(
      Tokenizer::ParseFloat("-1.0"),
      "passed text that could not have been tokenized as a float");
#endif  // PROTOBUF_HAS_DEATH_TEST
}

TEST_F(TokenizerTest, ParseString) {
  std::string output;
  Tokenizer::ParseString("'hello'", &output);
  EXPECT_EQ("hello", output);
  Tokenizer::ParseString("\"blah\\nblah2\"", &output);
  EXPECT_EQ("blah\nblah2", output);
  Tokenizer::ParseString("'\\1x\\1\\123\\739\\52\\334n\\3'", &output);
  EXPECT_EQ("\1x\1\123\739\52\334n\3", output);
  Tokenizer::ParseString("'\\x20\\x4'", &output);
  EXPECT_EQ("\x20\x4", output);

  // Test invalid strings that may still be tokenized as strings.
  Tokenizer::ParseString("\"\\a\\l\\v\\t", &output);  // \l is invalid
  EXPECT_EQ("\a?\v\t", output);
  Tokenizer::ParseString("'", &output);
  EXPECT_EQ("", output);
  Tokenizer::ParseString("'\\", &output);
  EXPECT_EQ("\\", output);

  // Experiment with Unicode escapes.  Here are one-, two-, three- and
  // four-byte Unicode characters.
  Tokenizer::ParseString("'\\u0024\\u00a2\\u20ac\\U00024b62XX'", &output);
  EXPECT_EQ("$¢€𤭢XX", output);
  // Same thing encoded using UTF16.
  Tokenizer::ParseString("'\\u0024\\u00a2\\u20ac\\ud852\\udf62XX'", &output);
  EXPECT_EQ("$¢€𤭢XX", output);
  // Here's some broken UTF16; there's a head surrogate with no tail surrogate.
  // We just output this as if it were UTF8; it's not a defined code point, but
  // it has a defined encoding.
  Tokenizer::ParseString("'\\ud852XX'", &output);
  EXPECT_EQ("\xed\xa1\x92XX", output);
  // Malformed escape: Demons may fly out of the nose.
  Tokenizer::ParseString("\\u0", &output);
  EXPECT_EQ("u0", output);

  // Test invalid strings that will never be tokenized as strings.
#ifdef PROTOBUF_HAS_DEATH_TEST  // death tests do not work on Windows yet
  EXPECT_DEBUG_DEATH(
      Tokenizer::ParseString("", &output),
      "passed text that could not have been tokenized as a string");
#endif  // PROTOBUF_HAS_DEATH_TEST
}

TEST_F(TokenizerTest, ParseStringAppend) {
  // Check that ParseString and ParseStringAppend differ.
  std::string output("stuff+");
  Tokenizer::ParseStringAppend("'hello'", &output);
  EXPECT_EQ("stuff+hello", output);
  Tokenizer::ParseString("'hello'", &output);
  EXPECT_EQ("hello", output);
}

// -------------------------------------------------------------------

// Each case parses some input text, ignoring the tokens produced, and
// checks that the error output matches what is expected.
struct ErrorCase {
  std::string input;
  bool recoverable;  // True if the tokenizer should be able to recover and
                     // parse more tokens after seeing this error.  Cases
                     // for which this is true must end with "foo" as
                     // the last token, which the test will check for.
  const char* errors;
};

inline std::ostream& operator<<(std::ostream& out, const ErrorCase& test_case) {
  return out << CEscape(test_case.input);
}

ErrorCase kErrorCases[] = {
    // String errors.
    {"'\\l' foo", true, "0:2: Invalid escape sequence in string literal.\n"},
    {"'\\X' foo", true, "0:2: Invalid escape sequence in string literal.\n"},
    {"'\\x' foo", true, "0:3: Expected hex digits for escape sequence.\n"},
    {"'foo", false, "0:4: Unexpected end of string.\n"},
    {"'bar\nfoo", true, "0:4: String literals cannot cross line boundaries.\n"},
    {"'\\u01' foo", true,
     "0:5: Expected four hex digits for \\u escape sequence.\n"},
857     {"'\\u01' foo", true,
858      "0:5: Expected four hex digits for \\u escape sequence.\n"},
859     {"'\\uXYZ' foo", true,
860      "0:3: Expected four hex digits for \\u escape sequence.\n"},
861 
862     // Integer errors.
863     {"123foo", true, "0:3: Need space between number and identifier.\n"},
864 
865     // Hex/octal errors.
866     {"0x foo", true, "0:2: \"0x\" must be followed by hex digits.\n"},
867     {"0541823 foo", true,
868      "0:4: Numbers starting with leading zero must be in octal.\n"},
869     {"0x123z foo", true, "0:5: Need space between number and identifier.\n"},
870     {"0x123.4 foo", true, "0:5: Hex and octal numbers must be integers.\n"},
871     {"0123.4 foo", true, "0:4: Hex and octal numbers must be integers.\n"},
872 
873     // Float errors.
874     {"1e foo", true, "0:2: \"e\" must be followed by exponent.\n"},
875     {"1e- foo", true, "0:3: \"e\" must be followed by exponent.\n"},
876     {"1.2.3 foo", true,
877      "0:3: Already saw decimal point or exponent; can't have another one.\n"},
878     {"1e2.3 foo", true,
879      "0:3: Already saw decimal point or exponent; can't have another one.\n"},
880     {"a.1 foo", true,
881      "0:1: Need space between identifier and decimal point.\n"},
882     // allow_f_after_float not enabled, so this should be an error.
883     {"1.0f foo", true, "0:3: Need space between number and identifier.\n"},
884 
885     // Block comment errors.
886     {"/*", false,
887      "0:2: End-of-file inside block comment.\n"
888      "0:0:   Comment started here.\n"},
889     {"/*/*/ foo", true,
890      "0:3: \"/*\" inside block comment.  Block comments cannot be nested.\n"},
891 
892     // Control characters.  Multiple consecutive control characters should only
893     // produce one error.
894     {"\b foo", true, "0:0: Invalid control characters encountered in text.\n"},
895     {"\b\b foo", true,
896      "0:0: Invalid control characters encountered in text.\n"},
897 
898     // Check that control characters at end of input don't result in an
899     // infinite loop.
900     {"\b", false, "0:0: Invalid control characters encountered in text.\n"},
901 
902     // Check recovery from '\0'.  We have to explicitly specify the length of
903     // these strings because otherwise the string constructor will just call
904     // strlen() which will see the first '\0' and think that is the end of the
905     // string.
906     {std::string("\0foo", 4), true,
907      "0:0: Invalid control characters encountered in text.\n"},
908     {std::string("\0\0foo", 5), true,
909      "0:0: Invalid control characters encountered in text.\n"},
910 
911     // Check error from high order bits set
912     {"\300foo", true, "0:0: Interpreting non ascii codepoint 192.\n"},
913 };
914 
TEST_2D(TokenizerTest, Errors, kErrorCases, kBlockSizes) {
  // Set up the tokenizer.
  TestInputStream input(kErrorCases_case.input.data(),
                        kErrorCases_case.input.size(), kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);

  // Ignore all input, except remember if the last token was "foo".
  bool last_was_foo = false;
  while (tokenizer.Next()) {
    last_was_foo = tokenizer.current().text == "foo";
  }

  // Check that the errors match what was expected.
  EXPECT_EQ(kErrorCases_case.errors, error_collector.text_);

  // If the error was recoverable, make sure we saw "foo" after it.
  if (kErrorCases_case.recoverable) {
    EXPECT_TRUE(last_was_foo);
  }
}

// -------------------------------------------------------------------

TEST_1D(TokenizerTest, BackUpOnDestruction, kBlockSizes) {
  std::string text = "foo bar";
  TestInputStream input(text.data(), text.size(), kBlockSizes_case);

  // Create a tokenizer, read one token, then destroy it.
  {
    TestErrorCollector error_collector;
    Tokenizer tokenizer(&input, &error_collector);

    tokenizer.Next();
  }

  // Only "foo" should have been read.
  EXPECT_EQ(strlen("foo"), input.ByteCount());
}


}  // namespace
}  // namespace io
}  // namespace protobuf
}  // namespace google