• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Protocol Buffers - Google's data interchange format
2 // Copyright 2008 Google Inc.  All rights reserved.
3 //
4 // Use of this source code is governed by a BSD-style
5 // license that can be found in the LICENSE file or at
6 // https://developers.google.com/open-source/licenses/bsd
7 
8 // Author: kenton@google.com (Kenton Varda)
9 //  Based on original Protocol Buffers design by
10 //  Sanjay Ghemawat, Jeff Dean, and others.
11 
12 #include "google/protobuf/io/tokenizer.h"
13 
14 #include <limits.h>
15 #include <math.h>
16 
17 #include <vector>
18 
19 #include "google/protobuf/stubs/common.h"
20 #include "absl/strings/escaping.h"
21 #include "absl/strings/substitute.h"
22 #include "google/protobuf/io/zero_copy_stream_impl.h"
23 #include "google/protobuf/testing/googletest.h"
24 #include <gtest/gtest.h>
25 
26 namespace google {
27 namespace protobuf {
28 namespace io {
29 namespace {
30 
31 // ===================================================================
32 // Data-Driven Test Infrastructure
33 
34 // TODO:  This is copied from coded_stream_unittest.  This is
35 //   temporary until these features are integrated into gTest itself.
36 
37 // TEST_1D and TEST_2D are macros I'd eventually like to see added to
38 // gTest.  These macros can be used to declare tests which should be
39 // run multiple times, once for each item in some input array.  TEST_1D
40 // tests all cases in a single input array.  TEST_2D tests all
41 // combinations of cases from two arrays.  The arrays must be statically
42 // defined such that the ABSL_ARRAYSIZE() macro works on them.  Example:
43 //
44 // int kCases[] = {1, 2, 3, 4}
45 // TEST_1D(MyFixture, MyTest, kCases) {
46 //   EXPECT_GT(kCases_case, 0);
47 // }
48 //
49 // This test iterates through the numbers 1, 2, 3, and 4 and tests that
50 // they are all greater than zero.  In case of failure, the exact case
51 // which failed will be printed.  The case type must be printable using
52 // ostream::operator<<.
53 
// Declares a fixture subclass whose test body runs once per element of
// CASES; the current element is visible inside the body as <CASES>_case.
#define TEST_1D(FIXTURE, NAME, CASES)                             \
  class FIXTURE##_##NAME##_DD : public FIXTURE {                  \
   protected:                                                     \
    template <typename CaseType>                                  \
    void DoSingleCase(const CaseType& CASES##_case);              \
  };                                                              \
                                                                  \
  TEST_F(FIXTURE##_##NAME##_DD, NAME) {                           \
    for (int i = 0; i < ABSL_ARRAYSIZE(CASES); i++) {             \
      SCOPED_TRACE(testing::Message()                             \
                   << #CASES " case #" << i << ": " << CASES[i]); \
      DoSingleCase(CASES[i]);                                     \
    }                                                             \
  }                                                               \
                                                                  \
  template <typename CaseType>                                    \
  void FIXTURE##_##NAME##_DD::DoSingleCase(const CaseType& CASES##_case)
71 
// Like TEST_1D, but iterates the cross product of two case arrays; the
// current elements are visible as <CASES1>_case and <CASES2>_case.
#define TEST_2D(FIXTURE, NAME, CASES1, CASES2)                              \
  class FIXTURE##_##NAME##_DD : public FIXTURE {                            \
   protected:                                                               \
    template <typename CaseType1, typename CaseType2>                       \
    void DoSingleCase(const CaseType1& CASES1##_case,                       \
                      const CaseType2& CASES2##_case);                      \
  };                                                                        \
                                                                            \
  TEST_F(FIXTURE##_##NAME##_DD, NAME) {                                     \
    for (int i = 0; i < ABSL_ARRAYSIZE(CASES1); i++) {                      \
      for (int j = 0; j < ABSL_ARRAYSIZE(CASES2); j++) {                    \
        SCOPED_TRACE(testing::Message()                                     \
                     << #CASES1 " case #" << i << ": " << CASES1[i] << ", " \
                     << #CASES2 " case #" << j << ": " << CASES2[j]);       \
        DoSingleCase(CASES1[i], CASES2[j]);                                 \
      }                                                                     \
    }                                                                       \
  }                                                                         \
                                                                            \
  template <typename CaseType1, typename CaseType2>                         \
  void FIXTURE##_##NAME##_DD::DoSingleCase(const CaseType1& CASES1##_case,  \
                                           const CaseType2& CASES2##_case)
94 
95 // -------------------------------------------------------------------
96 
// An input stream that is basically like an ArrayInputStream but sometimes
// returns empty buffers, just to throw us off.
class TestInputStream : public ZeroCopyInputStream {
 public:
  TestInputStream(const void* data, int size, int block_size)
      : array_stream_(data, size, block_size), counter_(0) {}
  ~TestInputStream() {}

  // implements ZeroCopyInputStream ----------------------------------
  bool Next(const void** data, int* size) override {
    // We'll return empty buffers starting with the first buffer, and every
    // 3 and 5 buffers after that.  An empty buffer is still a successful
    // Next() call, so the consumer must cope with zero-length reads.
    if (counter_ % 3 == 0 || counter_ % 5 == 0) {
      *data = nullptr;
      *size = 0;
      ++counter_;
      return true;
    } else {
      ++counter_;
      return array_stream_.Next(data, size);
    }
  }

  void BackUp(int count) override { return array_stream_.BackUp(count); }
  bool Skip(int count) override { return array_stream_.Skip(count); }
  int64_t ByteCount() const override { return array_stream_.ByteCount(); }

 private:
  ArrayInputStream array_stream_;  // the real stream; we interpose on Next()
  int counter_;                    // number of Next() calls made so far
};
128 
129 // -------------------------------------------------------------------
130 
// An error collector which simply concatenates all its errors into a big
// block of text which can be checked.
class TestErrorCollector : public ErrorCollector {
 public:
  TestErrorCollector() {}
  ~TestErrorCollector() {}

  // Accumulated errors, one "line:column: message\n" entry per call to
  // RecordError().  Empty means no errors were reported.
  std::string text_;

  // implements ErrorCollector ---------------------------------------
  void RecordError(int line, int column, absl::string_view message) override {
    absl::SubstituteAndAppend(&text_, "$0:$1: $2\n", line, column, message);
  }
};
145 
146 // -------------------------------------------------------------------
147 
// We test each operation over a variety of block sizes to ensure that
// we test cases where reads cross buffer boundaries as well as cases
// where they don't.  This is sort of a brute-force approach to this,
// but it's easy to write and easy to understand.
const int kBlockSizes[] = {1, 2, 3, 5, 7, 13, 32, 1024};
153 
class TokenizerTest : public testing::Test {
 protected:
  // For easy testing.  Parses `text` as an integer with no upper bound,
  // EXPECTing success, and returns the parsed value.
  uint64_t ParseInteger(const std::string& text) {
    uint64_t result;
    EXPECT_TRUE(Tokenizer::ParseInteger(text, kuint64max, &result))
        << "'" << text << "'";
    return result;
  }
};
164 
165 // ===================================================================
166 
167 // These tests causes gcc 3.3.5 (and earlier?) to give the cryptic error:
168 //   "sorry, unimplemented: `method_call_expr' not supported by dump_expr"
169 #if !defined(__GNUC__) || __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 3)
170 
// In each test case, the entire input text should parse as a single token
// of the given type.
struct SimpleTokenCase {
  std::string input;          // complete tokenizer input
  Tokenizer::TokenType type;  // expected type of the one resulting token
};
177 
// Prints the case's raw input (C-escaped) so that TEST_1D/TEST_2D's
// SCOPED_TRACE message identifies which case failed.
inline std::ostream& operator<<(std::ostream& out,
                                const SimpleTokenCase& test_case) {
  return out << absl::CEscape(test_case.input);
}
182 
SimpleTokenCase kSimpleTokenCases[] = {
    // Test identifiers.
    {"hello", Tokenizer::TYPE_IDENTIFIER},

    // Test integers (decimal, hex with either case, octal).
    {"123", Tokenizer::TYPE_INTEGER},
    {"0xab6", Tokenizer::TYPE_INTEGER},
    {"0XAB6", Tokenizer::TYPE_INTEGER},
    {"0X1234567", Tokenizer::TYPE_INTEGER},
    {"0x89abcdef", Tokenizer::TYPE_INTEGER},
    {"0x89ABCDEF", Tokenizer::TYPE_INTEGER},
    {"01234567", Tokenizer::TYPE_INTEGER},

    // Test floats (with/without integer part, fraction, and exponent).
    {"123.45", Tokenizer::TYPE_FLOAT},
    {"1.", Tokenizer::TYPE_FLOAT},
    {"1e3", Tokenizer::TYPE_FLOAT},
    {"1E3", Tokenizer::TYPE_FLOAT},
    {"1e-3", Tokenizer::TYPE_FLOAT},
    {"1e+3", Tokenizer::TYPE_FLOAT},
    {"1.e3", Tokenizer::TYPE_FLOAT},
    {"1.2e3", Tokenizer::TYPE_FLOAT},
    {".1", Tokenizer::TYPE_FLOAT},
    {".1e3", Tokenizer::TYPE_FLOAT},
    {".1e-3", Tokenizer::TYPE_FLOAT},
    {".1e+3", Tokenizer::TYPE_FLOAT},

    // Test strings (both quote styles, embedded/escaped quotes, escapes).
    {"'hello'", Tokenizer::TYPE_STRING},
    {"\"foo\"", Tokenizer::TYPE_STRING},
    {"'a\"b'", Tokenizer::TYPE_STRING},
    {"\"a'b\"", Tokenizer::TYPE_STRING},
    {"'a\\'b'", Tokenizer::TYPE_STRING},
    {"\"a\\\"b\"", Tokenizer::TYPE_STRING},
    {"'\\xf'", Tokenizer::TYPE_STRING},
    {"'\\0'", Tokenizer::TYPE_STRING},

    // Test symbols.
    {"+", Tokenizer::TYPE_SYMBOL},
    {".", Tokenizer::TYPE_SYMBOL},
};
224 
// Verifies that each single-token input produces exactly one token of the
// expected type, with correct text and position, across all block sizes.
TEST_2D(TokenizerTest, SimpleTokens, kSimpleTokenCases, kBlockSizes) {
  // Set up the tokenizer.
  TestInputStream input(kSimpleTokenCases_case.input.data(),
                        kSimpleTokenCases_case.input.size(), kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);

  // Before Next() is called, the initial token should always be TYPE_START.
  EXPECT_EQ(Tokenizer::TYPE_START, tokenizer.current().type);
  EXPECT_EQ("", tokenizer.current().text);
  EXPECT_EQ(0, tokenizer.current().line);
  EXPECT_EQ(0, tokenizer.current().column);
  EXPECT_EQ(0, tokenizer.current().end_column);

  // Parse the token.
  ASSERT_TRUE(tokenizer.Next());

  // Check that it has the right type.
  EXPECT_EQ(kSimpleTokenCases_case.type, tokenizer.current().type);
  // Check that it contains the complete input text.
  EXPECT_EQ(kSimpleTokenCases_case.input, tokenizer.current().text);
  // Check that it is located at the beginning of the input
  EXPECT_EQ(0, tokenizer.current().line);
  EXPECT_EQ(0, tokenizer.current().column);
  EXPECT_EQ(kSimpleTokenCases_case.input.size(),
            tokenizer.current().end_column);

  // There should be no more input.
  EXPECT_FALSE(tokenizer.Next());

  // After Next() returns false, the token should have type TYPE_END,
  // positioned at the end of the (single-line) input.
  EXPECT_EQ(Tokenizer::TYPE_END, tokenizer.current().type);
  EXPECT_EQ("", tokenizer.current().text);
  EXPECT_EQ(0, tokenizer.current().line);
  EXPECT_EQ(kSimpleTokenCases_case.input.size(), tokenizer.current().column);
  EXPECT_EQ(kSimpleTokenCases_case.input.size(),
            tokenizer.current().end_column);

  // There should be no errors.
  EXPECT_TRUE(error_collector.text_.empty());
}
266 
// Tests the "allow_f_after_float" option: when enabled, a trailing 'f' or
// 'F' is accepted as part of a float token.
TEST_1D(TokenizerTest, FloatSuffix, kBlockSizes) {
  // Set up the tokenizer with the option enabled.
  const char* text = "1f 2.5f 6e3f 7F";
  TestInputStream input(text, strlen(text), kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);
  tokenizer.set_allow_f_after_float(true);

  // Every token should come back as TYPE_FLOAT with its suffix intact.
  const char* const kExpectedTexts[] = {"1f", "2.5f", "6e3f", "7F"};
  for (const char* expected_text : kExpectedTexts) {
    ASSERT_TRUE(tokenizer.Next());
    EXPECT_EQ(tokenizer.current().text, expected_text);
    EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);
  }

  // There should be no more input.
  EXPECT_FALSE(tokenizer.Next());
  // There should be no errors.
  EXPECT_TRUE(error_collector.text_.empty());
}
296 
// Inputs consisting solely of whitespace; only visible to Next() when the
// report_whitespace / report_newlines options are enabled.
SimpleTokenCase kWhitespaceTokenCases[] = {
    {" ", Tokenizer::TYPE_WHITESPACE},
    {"    ", Tokenizer::TYPE_WHITESPACE},
    {"\t", Tokenizer::TYPE_WHITESPACE},
    {"\v", Tokenizer::TYPE_WHITESPACE},
    {"\t ", Tokenizer::TYPE_WHITESPACE},
    {"\v\t", Tokenizer::TYPE_WHITESPACE},
    {"   \t\r", Tokenizer::TYPE_WHITESPACE},
    // Newlines:
    {"\n", Tokenizer::TYPE_NEWLINE},
};
308 
TEST_2D(TokenizerTest, Whitespace, kWhitespaceTokenCases, kBlockSizes) {
  // By default whitespace is skipped, so a whitespace-only input yields no
  // tokens at all.
  {
    TestInputStream input(kWhitespaceTokenCases_case.input.data(),
                          kWhitespaceTokenCases_case.input.size(),
                          kBlockSizes_case);
    TestErrorCollector error_collector;
    Tokenizer tokenizer(&input, &error_collector);

    EXPECT_FALSE(tokenizer.Next());
  }
  // With report_whitespace and report_newlines enabled, the same input
  // yields exactly one token of the expected whitespace/newline type.
  {
    TestInputStream input(kWhitespaceTokenCases_case.input.data(),
                          kWhitespaceTokenCases_case.input.size(),
                          kBlockSizes_case);
    TestErrorCollector error_collector;
    Tokenizer tokenizer(&input, &error_collector);
    tokenizer.set_report_whitespace(true);
    tokenizer.set_report_newlines(true);

    ASSERT_TRUE(tokenizer.Next());
    EXPECT_EQ(tokenizer.current().text, kWhitespaceTokenCases_case.input);
    EXPECT_EQ(tokenizer.current().type, kWhitespaceTokenCases_case.type);

    EXPECT_FALSE(tokenizer.Next());
  }
}
335 
336 #endif
337 
338 // -------------------------------------------------------------------
339 
// In each case, the input is parsed to produce a list of tokens.  The
// last token in "output" must have type TYPE_END.
struct MultiTokenCase {
  std::string input;                     // complete tokenizer input
  std::vector<Tokenizer::Token> output;  // expected tokens, in order
};
346 
// Prints the case's raw input (C-escaped) so that TEST_2D's SCOPED_TRACE
// message identifies which case failed.
inline std::ostream& operator<<(std::ostream& out,
                                const MultiTokenCase& test_case) {
  return out << absl::CEscape(test_case.input);
}
351 
// Token fields, in order: {type, text, line, column, end_column}.
MultiTokenCase kMultiTokenCases[] = {
    // Test empty input.
    {"",
     {
         {Tokenizer::TYPE_END, "", 0, 0, 0},
     }},

    // Test all token types at the same time.
    {"foo 1 1.2 + 'bar'",
     {
         {Tokenizer::TYPE_IDENTIFIER, "foo", 0, 0, 3},
         {Tokenizer::TYPE_INTEGER, "1", 0, 4, 5},
         {Tokenizer::TYPE_FLOAT, "1.2", 0, 6, 9},
         {Tokenizer::TYPE_SYMBOL, "+", 0, 10, 11},
         {Tokenizer::TYPE_STRING, "'bar'", 0, 12, 17},
         {Tokenizer::TYPE_END, "", 0, 17, 17},
     }},

    // Test that consecutive symbols are parsed as separate tokens.
    {"!@+%",
     {
         {Tokenizer::TYPE_SYMBOL, "!", 0, 0, 1},
         {Tokenizer::TYPE_SYMBOL, "@", 0, 1, 2},
         {Tokenizer::TYPE_SYMBOL, "+", 0, 2, 3},
         {Tokenizer::TYPE_SYMBOL, "%", 0, 3, 4},
         {Tokenizer::TYPE_END, "", 0, 4, 4},
     }},

    // Test that newlines affect line numbers correctly.
    {"foo bar\nrab oof",
     {
         {Tokenizer::TYPE_IDENTIFIER, "foo", 0, 0, 3},
         {Tokenizer::TYPE_IDENTIFIER, "bar", 0, 4, 7},
         {Tokenizer::TYPE_IDENTIFIER, "rab", 1, 0, 3},
         {Tokenizer::TYPE_IDENTIFIER, "oof", 1, 4, 7},
         {Tokenizer::TYPE_END, "", 1, 7, 7},
     }},

    // Test that tabs affect column numbers correctly (columns advance to
    // the next multiple of 8 — TODO confirm against tokenizer.cc).
    {"foo\tbar  \tbaz",
     {
         {Tokenizer::TYPE_IDENTIFIER, "foo", 0, 0, 3},
         {Tokenizer::TYPE_IDENTIFIER, "bar", 0, 8, 11},
         {Tokenizer::TYPE_IDENTIFIER, "baz", 0, 16, 19},
         {Tokenizer::TYPE_END, "", 0, 19, 19},
     }},

    // Test that tabs in string literals affect column numbers correctly.
    {"\"foo\tbar\" baz",
     {
         {Tokenizer::TYPE_STRING, "\"foo\tbar\"", 0, 0, 12},
         {Tokenizer::TYPE_IDENTIFIER, "baz", 0, 13, 16},
         {Tokenizer::TYPE_END, "", 0, 16, 16},
     }},

    // Test that line comments are ignored.
    {"foo // This is a comment\n"
     "bar // This is another comment",
     {
         {Tokenizer::TYPE_IDENTIFIER, "foo", 0, 0, 3},
         {Tokenizer::TYPE_IDENTIFIER, "bar", 1, 0, 3},
         {Tokenizer::TYPE_END, "", 1, 30, 30},
     }},

    // Test that block comments are ignored.
    {"foo /* This is a block comment */ bar",
     {
         {Tokenizer::TYPE_IDENTIFIER, "foo", 0, 0, 3},
         {Tokenizer::TYPE_IDENTIFIER, "bar", 0, 34, 37},
         {Tokenizer::TYPE_END, "", 0, 37, 37},
     }},

    // Test that sh-style comments are not ignored by default: '#' comes
    // through as an ordinary symbol.
    {"foo # bar\n"
     "baz",
     {
         {Tokenizer::TYPE_IDENTIFIER, "foo", 0, 0, 3},
         {Tokenizer::TYPE_SYMBOL, "#", 0, 4, 5},
         {Tokenizer::TYPE_IDENTIFIER, "bar", 0, 6, 9},
         {Tokenizer::TYPE_IDENTIFIER, "baz", 1, 0, 3},
         {Tokenizer::TYPE_END, "", 1, 3, 3},
     }},

    // Test all whitespace chars
    {"foo\n\t\r\v\fbar",
     {
         {Tokenizer::TYPE_IDENTIFIER, "foo", 0, 0, 3},
         {Tokenizer::TYPE_IDENTIFIER, "bar", 1, 11, 14},
         {Tokenizer::TYPE_END, "", 1, 14, 14},
     }},
};
443 
// Walks each multi-token input, checking that every call to Next() produces
// the expected token and that previous() always mirrors the prior current().
TEST_2D(TokenizerTest, MultipleTokens, kMultiTokenCases, kBlockSizes) {
  // Set up the tokenizer.
  TestInputStream input(kMultiTokenCases_case.input.data(),
                        kMultiTokenCases_case.input.size(), kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);

  // Before Next() is called, the initial token should always be TYPE_START.
  EXPECT_EQ(Tokenizer::TYPE_START, tokenizer.current().type);
  EXPECT_EQ("", tokenizer.current().text);
  EXPECT_EQ(0, tokenizer.current().line);
  EXPECT_EQ(0, tokenizer.current().column);
  EXPECT_EQ(0, tokenizer.current().end_column);

  // Loop through all expected tokens.
  int i = 0;
  Tokenizer::Token token;
  do {
    token = kMultiTokenCases_case.output[i++];

    SCOPED_TRACE(testing::Message() << "Token #" << i << ": " << token.text);

    // Remember the current token so we can verify previous() after Next().
    Tokenizer::Token previous = tokenizer.current();

    // Next() should only return false when it hits the end token.
    if (token.type != Tokenizer::TYPE_END) {
      ASSERT_TRUE(tokenizer.Next());
    } else {
      ASSERT_FALSE(tokenizer.Next());
    }

    // Check that the previous token is set correctly.
    EXPECT_EQ(previous.type, tokenizer.previous().type);
    EXPECT_EQ(previous.text, tokenizer.previous().text);
    EXPECT_EQ(previous.line, tokenizer.previous().line);
    EXPECT_EQ(previous.column, tokenizer.previous().column);
    EXPECT_EQ(previous.end_column, tokenizer.previous().end_column);

    // Check that the token matches the expected one.
    EXPECT_EQ(token.type, tokenizer.current().type);
    EXPECT_EQ(token.text, tokenizer.current().text);
    EXPECT_EQ(token.line, tokenizer.current().line);
    EXPECT_EQ(token.column, tokenizer.current().column);
    EXPECT_EQ(token.end_column, tokenizer.current().end_column);

  } while (token.type != Tokenizer::TYPE_END);

  // There should be no errors.
  EXPECT_TRUE(error_collector.text_.empty());
}
494 
// Like kMultiTokenCases, but whitespace/newline tokens are expected
// explicitly (used with report_whitespace/report_newlines enabled).
MultiTokenCase kMultiWhitespaceTokenCases[] = {
    // Test all token types at the same time.
    {"foo 1 \t1.2  \n   +\v'bar'",
     {
         {Tokenizer::TYPE_IDENTIFIER, "foo", 0, 0, 3},
         {Tokenizer::TYPE_WHITESPACE, " ", 0, 3, 4},
         {Tokenizer::TYPE_INTEGER, "1", 0, 4, 5},
         // end_column jumps 5 -> 8: the tab advances to the next tab stop.
         {Tokenizer::TYPE_WHITESPACE, " \t", 0, 5, 8},
         {Tokenizer::TYPE_FLOAT, "1.2", 0, 8, 11},
         {Tokenizer::TYPE_WHITESPACE, "  ", 0, 11, 13},
         // The newline token's end_column is 0: the next line restarts.
         {Tokenizer::TYPE_NEWLINE, "\n", 0, 13, 0},
         {Tokenizer::TYPE_WHITESPACE, "   ", 1, 0, 3},
         {Tokenizer::TYPE_SYMBOL, "+", 1, 3, 4},
         {Tokenizer::TYPE_WHITESPACE, "\v", 1, 4, 5},
         {Tokenizer::TYPE_STRING, "'bar'", 1, 5, 10},
         {Tokenizer::TYPE_END, "", 1, 10, 10},
     }},

};
514 
// Same walk as MultipleTokens, but with whitespace and newline reporting
// enabled so whitespace tokens appear in the stream.
TEST_2D(TokenizerTest, MultipleWhitespaceTokens, kMultiWhitespaceTokenCases,
        kBlockSizes) {
  // Set up the tokenizer.
  TestInputStream input(kMultiWhitespaceTokenCases_case.input.data(),
                        kMultiWhitespaceTokenCases_case.input.size(),
                        kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);
  tokenizer.set_report_whitespace(true);
  tokenizer.set_report_newlines(true);

  // Before Next() is called, the initial token should always be TYPE_START.
  EXPECT_EQ(Tokenizer::TYPE_START, tokenizer.current().type);
  EXPECT_EQ("", tokenizer.current().text);
  EXPECT_EQ(0, tokenizer.current().line);
  EXPECT_EQ(0, tokenizer.current().column);
  EXPECT_EQ(0, tokenizer.current().end_column);

  // Loop through all expected tokens.
  int i = 0;
  Tokenizer::Token token;
  do {
    token = kMultiWhitespaceTokenCases_case.output[i++];

    SCOPED_TRACE(testing::Message() << "Token #" << i << ": " << token.text);

    // Remember the current token so we can verify previous() after Next().
    Tokenizer::Token previous = tokenizer.current();

    // Next() should only return false when it hits the end token.
    if (token.type != Tokenizer::TYPE_END) {
      ASSERT_TRUE(tokenizer.Next());
    } else {
      ASSERT_FALSE(tokenizer.Next());
    }

    // Check that the previous token is set correctly.
    EXPECT_EQ(previous.type, tokenizer.previous().type);
    EXPECT_EQ(previous.text, tokenizer.previous().text);
    EXPECT_EQ(previous.line, tokenizer.previous().line);
    EXPECT_EQ(previous.column, tokenizer.previous().column);
    EXPECT_EQ(previous.end_column, tokenizer.previous().end_column);

    // Check that the token matches the expected one.
    EXPECT_EQ(token.type, tokenizer.current().type);
    EXPECT_EQ(token.text, tokenizer.current().text);
    EXPECT_EQ(token.line, tokenizer.current().line);
    EXPECT_EQ(token.column, tokenizer.current().column);
    EXPECT_EQ(token.end_column, tokenizer.current().end_column);

  } while (token.type != Tokenizer::TYPE_END);

  // There should be no errors.
  EXPECT_TRUE(error_collector.text_.empty());
}
569 
570 // This test causes gcc 3.3.5 (and earlier?) to give the cryptic error:
571 //   "sorry, unimplemented: `method_call_expr' not supported by dump_expr"
572 #if !defined(__GNUC__) || __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 3)
573 
// Tests the "comment_style" option: with SH_COMMENT_STYLE, '#' starts a
// comment while "//" and "/* */" are tokenized as ordinary symbols.
TEST_1D(TokenizerTest, ShCommentStyle, kBlockSizes) {
  const char* text =
      "foo # bar\n"
      "baz // qux\n"
      "corge /* grault */\n"
      "garply";
  const char* const kTokens[] = {"foo",  // "# bar" is ignored
                                 "baz", "/",      "/", "qux", "corge", "/",
                                 "*",   "grault", "*", "/",   "garply"};

  // Set up the tokenizer.
  TestInputStream input(text, strlen(text), kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);
  tokenizer.set_comment_style(Tokenizer::SH_COMMENT_STYLE);

  // Advance through tokens and check that they are parsed as expected.
  for (const char* expected_text : kTokens) {
    EXPECT_TRUE(tokenizer.Next());
    EXPECT_EQ(tokenizer.current().text, expected_text);
  }

  // There should be no more input.
  EXPECT_FALSE(tokenizer.Next());
  // There should be no errors.
  EXPECT_TRUE(error_collector.text_.empty());
}
603 
604 #endif
605 
606 // -------------------------------------------------------------------
607 
// In each case, the input is expected to have two tokens named "prev" and
// "next" with comments in between.
struct DocCommentCase {
  std::string input;

  // Expected comment attribution, as produced by NextWithComments():
  const char* prev_trailing_comments;  // attached to "prev"
  const char* detached_comments[10];   // attached to neither; nullptr after
                                       // the last expected entry
  const char* next_leading_comments;   // attached to "next"
};
617 
// Prints the case's raw input (C-escaped) so that TEST_2D's SCOPED_TRACE
// message identifies which case failed.
inline std::ostream& operator<<(std::ostream& out,
                                const DocCommentCase& test_case) {
  return out << absl::CEscape(test_case.input);
}
622 
// clang-format off
// Field order per DocCommentCase: input, prev_trailing_comments,
// detached_comments (nullptr-terminated), next_leading_comments.
DocCommentCase kDocCommentCases[] = {
    {"prev next",

     "",
     {},
     ""},

    {"prev // no next token\n",

     " no next token\n",
     {},
     ""},

    {"prev // no next token and no trailing newline",

     " no next token and no trailing newline",
     {},
     ""},

    {"prev /* detached */ next",

     "",
     {" detached "},
     ""},

    {"prev // trailing comment\n"
     "next",

     " trailing comment\n",
     {},
     ""},

    {"prev\n"
     "/* leading comment */ next",

     "",
     {},
     " leading comment "},

    {"prev\n"
     "// leading comment\n"
     "// line 2\n"
     "next",

     "",
     {},
     " leading comment\n"
     " line 2\n"},

    {"prev\n"
     "// trailing comment\n"
     "// line 2\n"
     "\n"
     "next",

     " trailing comment\n"
     " line 2\n",
     {},
     ""},

    {"prev // trailing comment\n"
     "// leading comment\n"
     "// line 2\n"
     "next",

     " trailing comment\n",
     {},
     " leading comment\n"
     " line 2\n"},

    {"prev /* trailing block comment */\n"
     "/* leading block comment\n"
     " * line 2\n"
     " * line 3 */"
     "next",

     " trailing block comment ",
     {},
     " leading block comment\n"
     " line 2\n"
     " line 3 "},

    {"prev\n"
     "/* trailing block comment\n"
     " * line 2\n"
     " * line 3\n"
     " */\n"
     "/* leading block comment\n"
     " * line 2\n"
     " * line 3 */"
     "next",

     " trailing block comment\n"
     " line 2\n"
     " line 3\n",
     {},
     " leading block comment\n"
     " line 2\n"
     " line 3 "},

    {"prev\n"
     "// trailing comment\n"
     "\n"
     "// detached comment\n"
     "// line 2\n"
     "\n"
     "// second detached comment\n"
     "/* third detached comment\n"
     " * line 2 */\n"
     "// leading comment\n"
     "next",

     " trailing comment\n",
     {" detached comment\n"
      " line 2\n",
      " second detached comment\n",
      " third detached comment\n"
      " line 2 "},
     " leading comment\n"},

    {"prev /**/\n"
     "\n"
     "// detached comment\n"
     "\n"
     "// leading comment\n"
     "next",

     "",
     {" detached comment\n"},
     " leading comment\n"},

    {"prev /**/\n"
     "// leading comment\n"
     "next",

     "",
     {},
     " leading comment\n"},

    {"prev /* many comments*/ /* all inline */ /* will be handled */ next",

     " many comments",
     {" all inline "},
     " will be handled "},

    {R"pb(
     prev /* a single block comment
         that spans multiple lines
         is detached if it ends
         on the same line as next */ next
     )pb",

     "",
     {" a single block comment\n"
      "that spans multiple lines\n"
      "is detached if it ends\n"
      "on the same line as next "},
     ""},

    {R"pb(
       prev /* trailing */ /* leading */ next
     )pb",

     " trailing ",
     {},
     " leading "},

    {R"pb(
     prev /* multi-line
          trailing */ /* an oddly
                      placed detached */ /* an oddly
                                         placed leading */ next
     )pb",

     " multi-line\ntrailing ",
     {" an oddly\nplaced detached "},
     " an oddly\nplaced leading "},

    {R"pb(
       prev  // trailing with newline
       // detached
       /* another detached */
       // leading but no next token to attach it to
     )pb",

     " trailing with newline\n",
     {" detached\n", " another detached ",
      " leading but no next token to attach it to\n"},
     ""},
};
// clang-format on
815 
// Verifies NextWithComments() attribution of trailing/detached/leading
// comments, and that passing all NULL output parameters behaves the same
// as collecting them.
TEST_2D(TokenizerTest, DocComments, kDocCommentCases, kBlockSizes) {
  // Set up the tokenizer.
  TestInputStream input(kDocCommentCases_case.input.data(),
                        kDocCommentCases_case.input.size(), kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);

  // Set up a second tokenizer where we'll pass all NULLs to NextWithComments().
  TestInputStream input2(kDocCommentCases_case.input.data(),
                         kDocCommentCases_case.input.size(), kBlockSizes_case);
  Tokenizer tokenizer2(&input2, &error_collector);

  EXPECT_TRUE(tokenizer.Next());
  EXPECT_TRUE(tokenizer2.Next());

  EXPECT_EQ("prev", tokenizer.current().text);
  EXPECT_EQ("prev", tokenizer2.current().text);

  std::string prev_trailing_comments;
  std::vector<std::string> detached_comments;
  std::string next_leading_comments;
  bool has_next = tokenizer.NextWithComments(
      &prev_trailing_comments, &detached_comments, &next_leading_comments);
  EXPECT_EQ(has_next, tokenizer2.NextWithComments(nullptr, nullptr, nullptr));
  if (has_next) {
    EXPECT_EQ("next", tokenizer.current().text);
    EXPECT_EQ("next", tokenizer2.current().text);
  }

  EXPECT_EQ(kDocCommentCases_case.prev_trailing_comments,
            prev_trailing_comments);

  for (size_t i = 0; i < detached_comments.size(); i++) {
    // Guard against reading past the fixed-size detached_comments array in
    // the test case.  (Previously this compared i against the number of
    // test cases, ABSL_ARRAYSIZE(kDocCommentCases), which does not bound
    // the index used on the next two lines.)
    ASSERT_LT(i, ABSL_ARRAYSIZE(kDocCommentCases_case.detached_comments));
    ASSERT_TRUE(kDocCommentCases_case.detached_comments[i] != nullptr);
    EXPECT_EQ(kDocCommentCases_case.detached_comments[i], detached_comments[i]);
  }

  // Verify that we matched all the detached comments.
  EXPECT_EQ(nullptr,
            kDocCommentCases_case.detached_comments[detached_comments.size()]);

  EXPECT_EQ(kDocCommentCases_case.next_leading_comments, next_leading_comments);
}
860 
861 // -------------------------------------------------------------------
862 
863 // Test parse helpers.
864 // TODO: Add a fuzz test for this.
TEST_F(TokenizerTest, ParseInteger) {
  // Basic decimal, hexadecimal (either letter case) and octal forms.
  // NOTE(review): the one-argument ParseInteger() is a helper defined earlier
  // in this file; presumably it parses with no maximum and asserts success --
  // confirm against the helper's definition.
  EXPECT_EQ(0, ParseInteger("0"));
  EXPECT_EQ(123, ParseInteger("123"));
  EXPECT_EQ(0xabcdef12u, ParseInteger("0xabcdef12"));
  EXPECT_EQ(0xabcdef12u, ParseInteger("0xABCDEF12"));
  EXPECT_EQ(kuint64max, ParseInteger("0xFFFFFFFFFFFFFFFF"));
  EXPECT_EQ(01234567, ParseInteger("01234567"));
  EXPECT_EQ(0X123, ParseInteger("0X123"));

  // Test invalid integers that may still be tokenized as integers.
  EXPECT_EQ(0, ParseInteger("0x"));

  uint64_t i;

  // Test invalid integers that will never be tokenized as integers.
  EXPECT_FALSE(Tokenizer::ParseInteger("zxy", kuint64max, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("1.2", kuint64max, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("08", kuint64max, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("0xg", kuint64max, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("-1", kuint64max, &i));

  // Test overflows: parsing succeeds iff the value fits within the given
  // maximum (second argument).
  EXPECT_TRUE(Tokenizer::ParseInteger("0", 0, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("1", 0, &i));
  EXPECT_TRUE(Tokenizer::ParseInteger("1", 1, &i));
  EXPECT_TRUE(Tokenizer::ParseInteger("12345", 12345, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("12346", 12345, &i));
  EXPECT_TRUE(Tokenizer::ParseInteger("0xFFFFFFFFFFFFFFFF", kuint64max, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("0x10000000000000000", kuint64max, &i));

  // Test near the limits of signed parsing (values in kint64max +/- 1600).
  // Each value is rendered in decimal, octal and hex; with a maximum of
  // kint64max, values at or below the limit must parse and values above it
  // must be rejected.
  for (int64_t offset = -1600; offset <= 1600; ++offset) {
    // We make sure to perform an unsigned addition so that we avoid signed
    // overflow, which would be undefined behavior.
    uint64_t i = 0x7FFFFFFFFFFFFFFFu + static_cast<uint64_t>(offset);
    char decimal[32];
    snprintf(decimal, 32, "%llu", static_cast<unsigned long long>(i));
    if (offset > 0) {
      uint64_t parsed = -1;  // Sentinel, reported in the failure message.
      EXPECT_FALSE(Tokenizer::ParseInteger(decimal, kint64max, &parsed))
          << decimal << "=>" << parsed;
    } else {
      uint64_t parsed = -1;
      EXPECT_TRUE(Tokenizer::ParseInteger(decimal, kint64max, &parsed))
          << decimal << "=>" << parsed;
      EXPECT_EQ(parsed, i);
    }
    char octal[32];
    snprintf(octal, 32, "0%llo", static_cast<unsigned long long>(i));
    if (offset > 0) {
      uint64_t parsed = -1;
      EXPECT_FALSE(Tokenizer::ParseInteger(octal, kint64max, &parsed))
          << octal << "=>" << parsed;
    } else {
      uint64_t parsed = -1;
      EXPECT_TRUE(Tokenizer::ParseInteger(octal, kint64max, &parsed))
          << octal << "=>" << parsed;
      EXPECT_EQ(parsed, i);
    }
    char hex[32];
    snprintf(hex, 32, "0x%llx", static_cast<unsigned long long>(i));
    if (offset > 0) {
      uint64_t parsed = -1;
      EXPECT_FALSE(Tokenizer::ParseInteger(hex, kint64max, &parsed))
          << hex << "=>" << parsed;
    } else {
      uint64_t parsed = -1;
      EXPECT_TRUE(Tokenizer::ParseInteger(hex, kint64max, &parsed)) << hex;
      EXPECT_EQ(parsed, i);
    }
    // EXPECT_NE(offset, -237);
  }

  // Test near the limits of unsigned parsing (values in kuint64max +/- 1600)
  // By definition, values greater than kuint64max cannot be held in a uint64_t
  // variable, so printing them is a little tricky; fortunately all but the
  // last four digits are known, so we can hard-code them in the printf string,
  // and we only need to format the last 4.
  for (int64_t offset = -1600; offset <= 1600; ++offset) {
    {
      // Decimal.  kuint64max is 18446744073709551615; the addition below
      // intentionally wraps (unsigned arithmetic) for positive offsets.
      uint64_t i = 18446744073709551615u + offset;
      char decimal[32];
      snprintf(decimal, 32, "1844674407370955%04llu",
               static_cast<unsigned long long>(1615 + offset));
      if (offset > 0) {
        uint64_t parsed = -1;
        EXPECT_FALSE(Tokenizer::ParseInteger(decimal, kuint64max, &parsed))
            << decimal << "=>" << parsed;
      } else {
        uint64_t parsed = -1;
        EXPECT_TRUE(Tokenizer::ParseInteger(decimal, kuint64max, &parsed))
            << decimal;
        EXPECT_EQ(parsed, i);
      }
    }
    {
      // Octal.  kuint64max is 01777777777777777777777; one past it is
      // 02000000000000000000000, hence the hard-coded prefix below.
      uint64_t i = 01777777777777777777777u + offset;
      if (offset > 0) {
        char octal[32];
        snprintf(octal, 32, "0200000000000000000%04llo",
                 static_cast<unsigned long long>(offset - 1));
        uint64_t parsed = -1;
        EXPECT_FALSE(Tokenizer::ParseInteger(octal, kuint64max, &parsed))
            << octal << "=>" << parsed;
      } else {
        char octal[32];
        snprintf(octal, 32, "0%llo", static_cast<unsigned long long>(i));
        uint64_t parsed = -1;
        EXPECT_TRUE(Tokenizer::ParseInteger(octal, kuint64max, &parsed))
            << octal;
        EXPECT_EQ(parsed, i);
      }
    }
    {
      // Hex.  One past kuint64max is 0x10000000000000000.
      uint64_t ui = 0xffffffffffffffffu + offset;
      char hex[32];
      if (offset > 0) {
        snprintf(hex, 32, "0x1000000000000%04llx",
                 static_cast<unsigned long long>(offset - 1));
        uint64_t parsed = -1;
        EXPECT_FALSE(Tokenizer::ParseInteger(hex, kuint64max, &parsed))
            << hex << "=>" << parsed;
      } else {
        snprintf(hex, 32, "0x%llx", static_cast<unsigned long long>(ui));
        uint64_t parsed = -1;
        EXPECT_TRUE(Tokenizer::ParseInteger(hex, kuint64max, &parsed)) << hex;
        EXPECT_EQ(parsed, ui);
      }
    }
  }
}
996 
TEST_F(TokenizerTest, ParseFloat) {
  // Local shorthand: every case below routes through the same static parser.
  auto parse = [](const char* text) { return Tokenizer::ParseFloat(text); };

  // Well-formed floats in the various shapes the tokenizer can produce.
  EXPECT_DOUBLE_EQ(1, parse("1."));
  EXPECT_DOUBLE_EQ(1e3, parse("1e3"));
  EXPECT_DOUBLE_EQ(1e3, parse("1E3"));
  EXPECT_DOUBLE_EQ(1.5e3, parse("1.5e3"));
  EXPECT_DOUBLE_EQ(.1, parse(".1"));
  EXPECT_DOUBLE_EQ(.25, parse(".25"));
  EXPECT_DOUBLE_EQ(.1e3, parse(".1e3"));
  EXPECT_DOUBLE_EQ(.25e3, parse(".25e3"));
  EXPECT_DOUBLE_EQ(.1e+3, parse(".1e+3"));
  EXPECT_DOUBLE_EQ(.1e-3, parse(".1e-3"));
  EXPECT_DOUBLE_EQ(5, parse("5"));
  EXPECT_DOUBLE_EQ(6e-12, parse("6e-12"));
  EXPECT_DOUBLE_EQ(1.2, parse("1.2"));
  EXPECT_DOUBLE_EQ(1.e2, parse("1.e2"));

  // Invalid floats that may nevertheless be tokenized as floats; a dangling
  // exponent marker is ignored.
  EXPECT_DOUBLE_EQ(1, parse("1e"));
  EXPECT_DOUBLE_EQ(1, parse("1e-"));
  EXPECT_DOUBLE_EQ(1, parse("1.e"));

  // An 'f' or 'F' suffix is accepted and ignored.
  EXPECT_DOUBLE_EQ(1, parse("1f"));
  EXPECT_DOUBLE_EQ(1, parse("1.0f"));
  EXPECT_DOUBLE_EQ(1, parse("1F"));

  // Out-of-range values still parse successfully: overflow saturates to
  // infinity and underflow flushes to zero.
  EXPECT_EQ(0.0, parse("1e-9999999999999999999999999999"));
  EXPECT_EQ(HUGE_VAL, parse("1e+9999999999999999999999999999"));

#if GTEST_HAS_DEATH_TEST  // death tests do not work on Windows yet
  // Text that could never have been tokenized as a float triggers a debug
  // assertion inside ParseFloat().
  EXPECT_DEBUG_DEATH(
      parse("zxy"),
      "passed text that could not have been tokenized as a float");
  EXPECT_DEBUG_DEATH(
      parse("1-e0"),
      "passed text that could not have been tokenized as a float");
  EXPECT_DEBUG_DEATH(
      parse("-1.0"),
      "passed text that could not have been tokenized as a float");
#endif  // GTEST_HAS_DEATH_TEST
}
1041 
TEST_F(TokenizerTest, ParseString) {
  std::string output;
  Tokenizer::ParseString("'hello'", &output);
  EXPECT_EQ("hello", output);
  Tokenizer::ParseString("\"blah\\nblah2\"", &output);
  EXPECT_EQ("blah\nblah2", output);
  Tokenizer::ParseString("'\\1x\\1\\123\\739\\52\\334n\\3'", &output);
  EXPECT_EQ("\1x\1\123\739\52\334n\3", output);
  Tokenizer::ParseString("'\\x20\\x4'", &output);
  EXPECT_EQ("\x20\x4", output);
  Tokenizer::ParseString("'\\X20\\X4'", &output);
  EXPECT_EQ("\x20\x4", output);

  // Test invalid strings that may still be tokenized as strings.
  Tokenizer::ParseString("\"\\a\\l\\v\\t", &output);  // \l is invalid
  EXPECT_EQ("\a?\v\t", output);
  Tokenizer::ParseString("'", &output);
  EXPECT_EQ("", output);
  Tokenizer::ParseString("'\\", &output);
  EXPECT_EQ("\\", output);

  // Experiment with Unicode escapes.  Here are one-, two-, three- and
  // four-byte UTF-8 characters: $ (U+0024), cent sign (U+00A2), euro sign
  // (U+20AC) and U+24B62.  The expected bytes are spelled as explicit hex
  // escapes so the test does not depend on this source file's encoding
  // surviving round-trips.
  Tokenizer::ParseString("'\\u0024\\u00a2\\u20ac\\U00024b62XX'", &output);
  EXPECT_EQ("$\xC2\xA2\xE2\x82\xAC\xF0\xA4\xAD\xA2XX", output);
  // Same thing encoded using UTF16 (a surrogate pair for U+24B62).
  Tokenizer::ParseString("'\\u0024\\u00a2\\u20ac\\ud852\\udf62XX'", &output);
  EXPECT_EQ("$\xC2\xA2\xE2\x82\xAC\xF0\xA4\xAD\xA2XX", output);
  // Here's some broken UTF16; there's a head surrogate with no tail surrogate.
  // We just output this as if it were UTF8; it's not a defined code point, but
  // it has a defined encoding.
  Tokenizer::ParseString("'\\ud852XX'", &output);
  EXPECT_EQ("\xed\xa1\x92XX", output);
  // Malformed escape: Demons may fly out of the nose.
  Tokenizer::ParseString("'\\u0'", &output);
  EXPECT_EQ("u0", output);
  // Beyond the range of valid UTF-32 code units.
  Tokenizer::ParseString("'\\U00110000\\U00200000\\UFFFFFFFF'", &output);
  EXPECT_EQ("\\U00110000\\U00200000\\Uffffffff", output);

  // Test invalid strings that will never be tokenized as strings.
#if GTEST_HAS_DEATH_TEST  // death tests do not work on Windows yet
  EXPECT_DEBUG_DEATH(
      Tokenizer::ParseString("", &output),
      "passed text that could not have been tokenized as a string");
#endif  // GTEST_HAS_DEATH_TEST
}
1089 
TEST_F(TokenizerTest, ParseStringAppend) {
  // ParseStringAppend() keeps whatever is already in the output buffer,
  // whereas ParseString() clears it first.
  std::string output{"stuff+"};
  Tokenizer::ParseStringAppend("'hello'", &output);
  EXPECT_EQ("stuff+hello", output);
  Tokenizer::ParseString("'hello'", &output);
  EXPECT_EQ("hello", output);
}
1098 
1099 // -------------------------------------------------------------------
1100 
// Each case parses some input text, ignoring the tokens produced, and
// checks that the error output matches what is expected.
struct ErrorCase {
  std::string input;  // Text fed to the tokenizer.
  bool recoverable;   // True if the tokenizer should be able to recover and
                      // parse more tokens after seeing this error.  Cases
                      // for which this is true must end with "foo" as
                      // the last token, which the test will check for.
  const char* errors;  // Expected error-collector output, one
                       // "line:column: message\n" entry per error.
};
1111 
operator <<(std::ostream & out,const ErrorCase & test_case)1112 inline std::ostream& operator<<(std::ostream& out, const ErrorCase& test_case) {
1113   return out << absl::CEscape(test_case.input);
1114 }
1115 
// Inputs with lexical errors, paired with the exact diagnostics the
// tokenizer is expected to emit for them.  Consumed by the Errors test.
ErrorCase kErrorCases[] = {
    // String errors.
    {"'\\l' foo", true, "0:2: Invalid escape sequence in string literal.\n"},
    {"'\\X' foo", true, "0:3: Expected hex digits for escape sequence.\n"},
    {"'\\x' foo", true, "0:3: Expected hex digits for escape sequence.\n"},
    {"'foo", false, "0:4: Unexpected end of string.\n"},
    {"'bar\nfoo", true,
     "0:4: Multiline strings are not allowed. Did you miss a \"?.\n"},
    {"'\\u01' foo", true,
     "0:5: Expected four hex digits for \\u escape sequence.\n"},
    {"'\\u01' foo", true,
     "0:5: Expected four hex digits for \\u escape sequence.\n"},
    {"'\\uXYZ' foo", true,
     "0:3: Expected four hex digits for \\u escape sequence.\n"},

    // Integer errors.
    {"123foo", true, "0:3: Need space between number and identifier.\n"},

    // Hex/octal errors.
    {"0x foo", true, "0:2: \"0x\" must be followed by hex digits.\n"},
    {"0541823 foo", true,
     "0:4: Numbers starting with leading zero must be in octal.\n"},
    {"0x123z foo", true, "0:5: Need space between number and identifier.\n"},
    {"0x123.4 foo", true, "0:5: Hex and octal numbers must be integers.\n"},
    {"0123.4 foo", true, "0:4: Hex and octal numbers must be integers.\n"},

    // Float errors.
    {"1e foo", true, "0:2: \"e\" must be followed by exponent.\n"},
    {"1e- foo", true, "0:3: \"e\" must be followed by exponent.\n"},
    {"1.2.3 foo", true,
     "0:3: Already saw decimal point or exponent; can't have another one.\n"},
    {"1e2.3 foo", true,
     "0:3: Already saw decimal point or exponent; can't have another one.\n"},
    {"a.1 foo", true,
     "0:1: Need space between identifier and decimal point.\n"},
    // allow_f_after_float not enabled, so this should be an error.
    {"1.0f foo", true, "0:3: Need space between number and identifier.\n"},

    // Block comment errors.
    {"/*", false,
     "0:2: End-of-file inside block comment.\n"
     "0:0:   Comment started here.\n"},
    {"/*/*/ foo", true,
     "0:3: \"/*\" inside block comment.  Block comments cannot be nested.\n"},

    // Control characters.  Multiple consecutive control characters should only
    // produce one error.
    {"\b foo", true, "0:0: Invalid control characters encountered in text.\n"},
    {"\b\b foo", true,
     "0:0: Invalid control characters encountered in text.\n"},

    // Check that control characters at end of input don't result in an
    // infinite loop.
    {"\b", false, "0:0: Invalid control characters encountered in text.\n"},

    // Check recovery from '\0'.  We have to explicitly specify the length of
    // these strings because otherwise the string constructor will just call
    // strlen() which will see the first '\0' and think that is the end of the
    // string.
    {std::string("\0foo", 4), true,
     "0:0: Invalid control characters encountered in text.\n"},
    {std::string("\0\0foo", 5), true,
     "0:0: Invalid control characters encountered in text.\n"},

    // Check error from high order bits set.
    {"\300foo", true, "0:0: Interpreting non ascii codepoint 192.\n"},
};
1183 
TEST_2D(TokenizerTest, Errors, kErrorCases, kBlockSizes) {
  // Tokenize the case's input, sliced into blocks of the current size.
  TestInputStream input(kErrorCases_case.input.data(),
                        kErrorCases_case.input.size(), kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);

  // Consume every token, remembering only whether the final one was "foo".
  bool saw_foo_last = false;
  while (tokenizer.Next()) {
    saw_foo_last = (tokenizer.current().text == "foo");
  }

  // The collected diagnostics must match the expectation exactly.
  EXPECT_EQ(kErrorCases_case.errors, error_collector.text_);

  // A recoverable error must leave the tokenizer able to read the trailing
  // "foo" token that every recoverable case ends with.
  if (kErrorCases_case.recoverable) {
    EXPECT_TRUE(saw_foo_last);
  }
}
1205 
1206 // -------------------------------------------------------------------
1207 
TEST_1D(TokenizerTest, BackUpOnDestruction, kBlockSizes) {
  const std::string text = "foo bar";
  TestInputStream input(text.data(), text.size(), kBlockSizes_case);

  // Read a single token, then let the tokenizer go out of scope.
  {
    TestErrorCollector error_collector;
    Tokenizer tokenizer(&input, &error_collector);
    tokenizer.Next();
  }

  // On destruction the tokenizer must back the stream up so that only the
  // bytes of "foo" -- the one token actually consumed -- count as read.
  EXPECT_EQ(strlen("foo"), input.ByteCount());
}
1223 
1224 
1225 }  // namespace
1226 }  // namespace io
1227 }  // namespace protobuf
1228 }  // namespace google
1229