// Protocol Buffers - Google's data interchange format
// Copyright 2023 Google LLC.  All rights reserved.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file or at
// https://developers.google.com/open-source/licenses/bsd

#include "upb/io/tokenizer.h"

#include <gtest/gtest.h>
#include "absl/strings/escaping.h"
#include "absl/strings/str_format.h"
#include "upb/io/chunked_input_stream.h"
#include "upb/io/string.h"
#include "upb/lex/unicode.h"
#include "upb/mem/arena.hpp"

// Must be last.
#include "upb/port/def.inc"

namespace google {
namespace protobuf {
namespace io {
namespace {

#ifndef arraysize
#define arraysize(a) (sizeof(a) / sizeof(a[0]))
#endif

static bool StringEquals(const char* a, const char* b) {
  return strcmp(a, b) == 0;
}

// ===================================================================
// Data-Driven Test Infrastructure

// TODO:  This is copied from coded_stream_unittest.  This is
//   temporary until these features are integrated into gTest itself.

// TEST_1D and TEST_2D are macros I'd eventually like to see added to
// gTest.  These macros can be used to declare tests which should be
// run multiple times, once for each item in some input array.  TEST_1D
// tests all cases in a single input array.  TEST_2D tests all
// combinations of cases from two arrays.  The arrays must be statically
// defined such that the arraysize() macro works on them.  Example:
//
// int kCases[] = {1, 2, 3, 4};
// TEST_1D(MyFixture, MyTest, kCases) {
//   EXPECT_GT(kCases_case, 0);
// }
//
// This test iterates through the numbers 1, 2, 3, and 4 and tests that
// they are all greater than zero.  In case of failure, the exact case
// which failed will be printed.  The case type must be printable using
// ostream::operator<<.

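// TEST_2D works the same way, but iterates over the cross product of two
// case arrays.  A hypothetical usage sketch (not an actual test in this
// file):
//
// int kValues[] = {1, 2};
// const char* kNames[] = {"a", "b"};
// TEST_2D(MyFixture, MyCrossTest, kValues, kNames) {
//   EXPECT_GT(kValues_case, 0);
//   EXPECT_STRNE(kNames_case, "");
// }
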
#define TEST_1D(FIXTURE, NAME, CASES)                             \
  class FIXTURE##_##NAME##_DD : public FIXTURE {                  \
   protected:                                                     \
    template <typename CaseType>                                  \
    void DoSingleCase(const CaseType& CASES##_case);              \
  };                                                              \
                                                                  \
  TEST_F(FIXTURE##_##NAME##_DD, NAME) {                           \
    for (size_t i = 0; i < arraysize(CASES); i++) {               \
      SCOPED_TRACE(testing::Message()                             \
                   << #CASES " case #" << i << ": " << CASES[i]); \
      DoSingleCase(CASES[i]);                                     \
    }                                                             \
  }                                                               \
                                                                  \
  template <typename CaseType>                                    \
  void FIXTURE##_##NAME##_DD::DoSingleCase(const CaseType& CASES##_case)

#define TEST_2D(FIXTURE, NAME, CASES1, CASES2)                              \
  class FIXTURE##_##NAME##_DD : public FIXTURE {                            \
   protected:                                                               \
    template <typename CaseType1, typename CaseType2>                       \
    void DoSingleCase(const CaseType1& CASES1##_case,                       \
                      const CaseType2& CASES2##_case);                      \
  };                                                                        \
                                                                            \
  TEST_F(FIXTURE##_##NAME##_DD, NAME) {                                     \
    for (size_t i = 0; i < arraysize(CASES1); i++) {                        \
      for (size_t j = 0; j < arraysize(CASES2); j++) {                      \
        SCOPED_TRACE(testing::Message()                                     \
                     << #CASES1 " case #" << i << ": " << CASES1[i] << ", " \
                     << #CASES2 " case #" << j << ": " << CASES2[j]);       \
        DoSingleCase(CASES1[i], CASES2[j]);                                 \
      }                                                                     \
    }                                                                       \
  }                                                                         \
                                                                            \
  template <typename CaseType1, typename CaseType2>                         \
  void FIXTURE##_##NAME##_DD::DoSingleCase(const CaseType1& CASES1##_case,  \
                                           const CaseType2& CASES2##_case)

// -------------------------------------------------------------------

// In C, a size of zero from ZCIS_Next() means EOF so we can't play the same
// trick here that happens in the C++ version. Use ChunkedInputStream instead.
upb_ZeroCopyInputStream* TestInputStream(const void* data, size_t size,
                                         size_t block_size, upb_Arena* arena) {
  return upb_ChunkedInputStream_New(data, size, block_size, arena);
}
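
// A minimal usage sketch (mirroring the tests below): the returned stream
// hands the buffer back in fixed-size chunks, forcing token reads to cross
// block boundaries.
//
//   upb::Arena arena;
//   const char data[] = "foo bar";
//   upb_ZeroCopyInputStream* stream =
//       TestInputStream(data, strlen(data), /*block_size=*/3, arena.ptr());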

// -------------------------------------------------------------------

// We test each operation over a variety of block sizes to ensure that
// we test cases where reads cross buffer boundaries as well as cases
// where they don't.  This is sort of a brute-force approach, but it's
// easy to write and easy to understand.
const int kBlockSizes[] = {1, 2, 3, 5, 7, 13, 32, 1024};

class TokenizerTest : public testing::Test {
 protected:
  // For easy testing.
  uint64_t ParseInteger(const std::string& text) {
    uint64_t result;
    EXPECT_TRUE(upb_Parse_Integer(text.data(), UINT64_MAX, &result))
        << "'" << text << "'";
    return result;
  }
};
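
// For example, ParseInteger("0x10") evaluates to 16; if upb_Parse_Integer
// rejects the text, the EXPECT_TRUE above fails and prints the offending
// input.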

// ===================================================================

// These tests cause gcc 3.3.5 (and earlier?) to give the cryptic error:
//   "sorry, unimplemented: `method_call_expr' not supported by dump_expr"
#if !defined(__GNUC__) || __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 3)

// In each test case, the entire input text should parse as a single token
// of the given type.
struct SimpleTokenCase {
  std::string input;
  upb_TokenType type;
};

inline std::ostream& operator<<(std::ostream& out,
                                const SimpleTokenCase& test_case) {
  return out << absl::CEscape(test_case.input);
}

SimpleTokenCase kSimpleTokenCases[] = {
    // Test identifiers.
    {"hello", kUpb_TokenType_Identifier},

    // Test integers.
    {"123", kUpb_TokenType_Integer},
    {"0xab6", kUpb_TokenType_Integer},
    {"0XAB6", kUpb_TokenType_Integer},
    {"0X1234567", kUpb_TokenType_Integer},
    {"0x89abcdef", kUpb_TokenType_Integer},
    {"0x89ABCDEF", kUpb_TokenType_Integer},
    {"01234567", kUpb_TokenType_Integer},

    // Test floats.
    {"123.45", kUpb_TokenType_Float},
    {"1.", kUpb_TokenType_Float},
    {"1e3", kUpb_TokenType_Float},
    {"1E3", kUpb_TokenType_Float},
    {"1e-3", kUpb_TokenType_Float},
    {"1e+3", kUpb_TokenType_Float},
    {"1.e3", kUpb_TokenType_Float},
    {"1.2e3", kUpb_TokenType_Float},
    {".1", kUpb_TokenType_Float},
    {".1e3", kUpb_TokenType_Float},
    {".1e-3", kUpb_TokenType_Float},
    {".1e+3", kUpb_TokenType_Float},

    // Test strings.
    {"'hello'", kUpb_TokenType_String},
    {"\"foo\"", kUpb_TokenType_String},
    {"'a\"b'", kUpb_TokenType_String},
    {"\"a'b\"", kUpb_TokenType_String},
    {"'a\\'b'", kUpb_TokenType_String},
    {"\"a\\\"b\"", kUpb_TokenType_String},
    {"'\\xf'", kUpb_TokenType_String},
    {"'\\0'", kUpb_TokenType_String},

    // Test symbols.
    {"+", kUpb_TokenType_Symbol},
    {".", kUpb_TokenType_Symbol},
};

TEST_2D(TokenizerTest, SimpleTokens, kSimpleTokenCases, kBlockSizes) {
  upb::Arena arena;

  // Set up the tokenizer.
  auto input = TestInputStream(kSimpleTokenCases_case.input.data(),
                               kSimpleTokenCases_case.input.size(),
                               kBlockSizes_case, arena.ptr());
  auto t = upb_Tokenizer_New(nullptr, 0, input, 0, arena.ptr());

  // Before Next() is called, the initial token should always be TYPE_START.
  EXPECT_EQ(upb_Tokenizer_Type(t), kUpb_TokenType_Start);
  EXPECT_EQ(upb_Tokenizer_Line(t), 0);
  EXPECT_EQ(upb_Tokenizer_Column(t), 0);
  EXPECT_EQ(upb_Tokenizer_EndColumn(t), 0);
  EXPECT_TRUE(StringEquals(upb_Tokenizer_TextData(t), ""));

  // Parse the token.
  EXPECT_TRUE(upb_Tokenizer_Next(t, nullptr));
  // Check that it has the right type.
  EXPECT_EQ(upb_Tokenizer_Type(t), kSimpleTokenCases_case.type);
  // Check that it contains the complete input text.
  EXPECT_TRUE(StringEquals(upb_Tokenizer_TextData(t),
                           kSimpleTokenCases_case.input.data()));

  // Check that it is located at the beginning of the input.
  EXPECT_EQ(upb_Tokenizer_Line(t), 0);
  EXPECT_EQ(upb_Tokenizer_Column(t), 0);
  EXPECT_EQ(upb_Tokenizer_EndColumn(t), kSimpleTokenCases_case.input.size());

  upb_Status status;
  upb_Status_Clear(&status);

  // There should be no more input and no errors.
  EXPECT_FALSE(upb_Tokenizer_Next(t, &status));
  EXPECT_TRUE(upb_Status_IsOk(&status));

  // After Next() returns false, the token should have type TYPE_END.
  EXPECT_EQ(upb_Tokenizer_Type(t), kUpb_TokenType_End);
  EXPECT_EQ(upb_Tokenizer_Line(t), 0);
  EXPECT_EQ(upb_Tokenizer_Column(t), kSimpleTokenCases_case.input.size());
  EXPECT_EQ(upb_Tokenizer_EndColumn(t), kSimpleTokenCases_case.input.size());
  EXPECT_TRUE(StringEquals(upb_Tokenizer_TextData(t), ""));
}

TEST_1D(TokenizerTest, FloatSuffix, kBlockSizes) {
  // Test the "allow_f_after_float" option.

  // Set up the tokenizer.
  upb::Arena arena;
  const char* text = "1f 2.5f 6e3f 7F";
  auto input =
      TestInputStream(text, strlen(text), kBlockSizes_case, arena.ptr());
  const int options = kUpb_TokenizerOption_AllowFAfterFloat;
  auto t = upb_Tokenizer_New(nullptr, 0, input, options, arena.ptr());

  // Advance through tokens and check that they are parsed as expected.

  EXPECT_TRUE(upb_Tokenizer_Next(t, nullptr));
  EXPECT_EQ(upb_Tokenizer_Type(t), kUpb_TokenType_Float);
  EXPECT_TRUE(StringEquals(upb_Tokenizer_TextData(t), "1f"));

  EXPECT_TRUE(upb_Tokenizer_Next(t, nullptr));
  EXPECT_EQ(upb_Tokenizer_Type(t), kUpb_TokenType_Float);
  EXPECT_TRUE(StringEquals(upb_Tokenizer_TextData(t), "2.5f"));

  EXPECT_TRUE(upb_Tokenizer_Next(t, nullptr));
  EXPECT_EQ(upb_Tokenizer_Type(t), kUpb_TokenType_Float);
  EXPECT_TRUE(StringEquals(upb_Tokenizer_TextData(t), "6e3f"));

  EXPECT_TRUE(upb_Tokenizer_Next(t, nullptr));
  EXPECT_EQ(upb_Tokenizer_Type(t), kUpb_TokenType_Float);
  EXPECT_TRUE(StringEquals(upb_Tokenizer_TextData(t), "7F"));

  upb_Status status;
  upb_Status_Clear(&status);

  // There should be no more input and no errors.
  EXPECT_FALSE(upb_Tokenizer_Next(t, &status));
  EXPECT_TRUE(upb_Status_IsOk(&status));
}

SimpleTokenCase kWhitespaceTokenCases[] = {
    {" ", kUpb_TokenType_Whitespace},
    {"    ", kUpb_TokenType_Whitespace},
    {"\t", kUpb_TokenType_Whitespace},
    {"\v", kUpb_TokenType_Whitespace},
    {"\t ", kUpb_TokenType_Whitespace},
    {"\v\t", kUpb_TokenType_Whitespace},
    {"   \t\r", kUpb_TokenType_Whitespace},
    // Newlines:
    {"\n", kUpb_TokenType_Newline},
};
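
// By default the tokenizer silently skips whitespace and newlines; with
// kUpb_TokenizerOption_ReportNewlines they are reported as tokens instead.
// The test below exercises both behaviors.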

TEST_2D(TokenizerTest, Whitespace, kWhitespaceTokenCases, kBlockSizes) {
  upb::Arena arena;
  {
    auto input = TestInputStream(kWhitespaceTokenCases_case.input.data(),
                                 kWhitespaceTokenCases_case.input.size(),
                                 kBlockSizes_case, arena.ptr());
    auto t = upb_Tokenizer_New(nullptr, 0, input, 0, arena.ptr());

    EXPECT_FALSE(upb_Tokenizer_Next(t, nullptr));
  }
  {
    auto input = TestInputStream(kWhitespaceTokenCases_case.input.data(),
                                 kWhitespaceTokenCases_case.input.size(),
                                 kBlockSizes_case, arena.ptr());
    const int options = kUpb_TokenizerOption_ReportNewlines;
    auto t = upb_Tokenizer_New(nullptr, 0, input, options, arena.ptr());

    EXPECT_TRUE(upb_Tokenizer_Next(t, nullptr));

    EXPECT_EQ(upb_Tokenizer_Type(t), kWhitespaceTokenCases_case.type);
    EXPECT_TRUE(StringEquals(upb_Tokenizer_TextData(t),
                             kWhitespaceTokenCases_case.input.data()));
    EXPECT_FALSE(upb_Tokenizer_Next(t, nullptr));
  }
}

#endif

// -------------------------------------------------------------------

struct TokenFields {
  upb_TokenType type;
  std::string text;
  size_t line;
  size_t column;
  size_t end_column;
};

// In each case, the input is parsed to produce a list of tokens.  The
// last token in "output" must have type TYPE_END.
struct MultiTokenCase {
  std::string input;
  std::vector<TokenFields> output;
};

inline std::ostream& operator<<(std::ostream& out,
                                const MultiTokenCase& test_case) {
  return out << absl::CEscape(test_case.input);
}

MultiTokenCase kMultiTokenCases[] = {
    // Test empty input.
    {"",
     {
         {kUpb_TokenType_End, "", 0, 0, 0},
     }},
    // Test all token types at the same time.
    {"foo 1 1.2 + 'bar'",
     {
         {kUpb_TokenType_Identifier, "foo", 0, 0, 3},
         {kUpb_TokenType_Integer, "1", 0, 4, 5},
         {kUpb_TokenType_Float, "1.2", 0, 6, 9},
         {kUpb_TokenType_Symbol, "+", 0, 10, 11},
         {kUpb_TokenType_String, "'bar'", 0, 12, 17},
         {kUpb_TokenType_End, "", 0, 17, 17},
     }},

    // Test that consecutive symbols are parsed as separate tokens.
    {"!@+%",
     {
         {kUpb_TokenType_Symbol, "!", 0, 0, 1},
         {kUpb_TokenType_Symbol, "@", 0, 1, 2},
         {kUpb_TokenType_Symbol, "+", 0, 2, 3},
         {kUpb_TokenType_Symbol, "%", 0, 3, 4},
         {kUpb_TokenType_End, "", 0, 4, 4},
     }},

    // Test that newlines affect line numbers correctly.
    {"foo bar\nrab oof",
     {
         {kUpb_TokenType_Identifier, "foo", 0, 0, 3},
         {kUpb_TokenType_Identifier, "bar", 0, 4, 7},
         {kUpb_TokenType_Identifier, "rab", 1, 0, 3},
         {kUpb_TokenType_Identifier, "oof", 1, 4, 7},
         {kUpb_TokenType_End, "", 1, 7, 7},
     }},

    // Test that tabs affect column numbers correctly.  (A tab advances the
    // column to the next multiple of eight.)
    {"foo\tbar  \tbaz",
     {
         {kUpb_TokenType_Identifier, "foo", 0, 0, 3},
         {kUpb_TokenType_Identifier, "bar", 0, 8, 11},
         {kUpb_TokenType_Identifier, "baz", 0, 16, 19},
         {kUpb_TokenType_End, "", 0, 19, 19},
     }},

    // Test that tabs in string literals affect column numbers correctly.
    {"\"foo\tbar\" baz",
     {
         {kUpb_TokenType_String, "\"foo\tbar\"", 0, 0, 12},
         {kUpb_TokenType_Identifier, "baz", 0, 13, 16},
         {kUpb_TokenType_End, "", 0, 16, 16},
     }},

    // Test that line comments are ignored.
    {"foo // This is a comment\n"
     "bar // This is another comment",
     {
         {kUpb_TokenType_Identifier, "foo", 0, 0, 3},
         {kUpb_TokenType_Identifier, "bar", 1, 0, 3},
         {kUpb_TokenType_End, "", 1, 30, 30},
     }},

    // Test that block comments are ignored.
    {"foo /* This is a block comment */ bar",
     {
         {kUpb_TokenType_Identifier, "foo", 0, 0, 3},
         {kUpb_TokenType_Identifier, "bar", 0, 34, 37},
         {kUpb_TokenType_End, "", 0, 37, 37},
     }},

    // Test that sh-style comments are not ignored by default.
    {"foo # bar\n"
     "baz",
     {
         {kUpb_TokenType_Identifier, "foo", 0, 0, 3},
         {kUpb_TokenType_Symbol, "#", 0, 4, 5},
         {kUpb_TokenType_Identifier, "bar", 0, 6, 9},
         {kUpb_TokenType_Identifier, "baz", 1, 0, 3},
         {kUpb_TokenType_End, "", 1, 3, 3},
     }},

    // Test all whitespace chars.
    {"foo\n\t\r\v\fbar",
     {
         {kUpb_TokenType_Identifier, "foo", 0, 0, 3},
         {kUpb_TokenType_Identifier, "bar", 1, 11, 14},
         {kUpb_TokenType_End, "", 1, 14, 14},
     }},
};

TEST_2D(TokenizerTest, MultipleTokens, kMultiTokenCases, kBlockSizes) {
  // Set up the tokenizer.
  upb::Arena arena;
  auto input = TestInputStream(kMultiTokenCases_case.input.data(),
                               kMultiTokenCases_case.input.size(),
                               kBlockSizes_case, arena.ptr());
  auto t = upb_Tokenizer_New(nullptr, 0, input, 0, arena.ptr());

  // Before Next() is called, the initial token should always be TYPE_START.
  EXPECT_EQ(upb_Tokenizer_Type(t), kUpb_TokenType_Start);
  EXPECT_EQ(upb_Tokenizer_Line(t), 0);
  EXPECT_EQ(upb_Tokenizer_Column(t), 0);
  EXPECT_EQ(upb_Tokenizer_EndColumn(t), 0);
  EXPECT_TRUE(StringEquals(upb_Tokenizer_TextData(t), ""));

  // Loop through all expected tokens.
  TokenFields token_fields;
  upb_Status status;
  upb_Status_Clear(&status);
  int i = 0;
  do {
    token_fields = kMultiTokenCases_case.output[i++];

    SCOPED_TRACE(testing::Message()
                 << "Token #" << i << ": " << absl::CEscape(token_fields.text));

    // Next() should only return false when it hits the end token.
    if (token_fields.type == kUpb_TokenType_End) {
      EXPECT_FALSE(upb_Tokenizer_Next(t, &status));
      EXPECT_TRUE(upb_Status_IsOk(&status));
    } else {
      EXPECT_TRUE(upb_Tokenizer_Next(t, nullptr));
    }

    // Check that the token matches the expected one.
    EXPECT_EQ(upb_Tokenizer_Type(t), token_fields.type);
    EXPECT_EQ(upb_Tokenizer_Line(t), token_fields.line);
    EXPECT_EQ(upb_Tokenizer_Column(t), token_fields.column);
    EXPECT_EQ(upb_Tokenizer_EndColumn(t), token_fields.end_column);
    EXPECT_EQ(upb_Tokenizer_TextSize(t), token_fields.text.size());
    EXPECT_TRUE(
        StringEquals(upb_Tokenizer_TextData(t), token_fields.text.data()));
  } while (token_fields.type != kUpb_TokenType_End);
}

MultiTokenCase kMultiWhitespaceTokenCases[] = {
    // Test all token types at the same time.
    {"foo 1 \t1.2  \n   +\v'bar'",
     {
         {kUpb_TokenType_Identifier, "foo", 0, 0, 3},
         {kUpb_TokenType_Whitespace, " ", 0, 3, 4},
         {kUpb_TokenType_Integer, "1", 0, 4, 5},
         {kUpb_TokenType_Whitespace, " \t", 0, 5, 8},
         {kUpb_TokenType_Float, "1.2", 0, 8, 11},
         {kUpb_TokenType_Whitespace, "  ", 0, 11, 13},
         {kUpb_TokenType_Newline, "\n", 0, 13, 0},
         {kUpb_TokenType_Whitespace, "   ", 1, 0, 3},
         {kUpb_TokenType_Symbol, "+", 1, 3, 4},
         {kUpb_TokenType_Whitespace, "\v", 1, 4, 5},
         {kUpb_TokenType_String, "'bar'", 1, 5, 10},
         {kUpb_TokenType_End, "", 1, 10, 10},
     }},
};

TEST_2D(TokenizerTest, MultipleWhitespaceTokens, kMultiWhitespaceTokenCases,
        kBlockSizes) {
  // Set up the tokenizer.
  upb::Arena arena;
  auto input = TestInputStream(kMultiWhitespaceTokenCases_case.input.data(),
                               kMultiWhitespaceTokenCases_case.input.size(),
                               kBlockSizes_case, arena.ptr());
  const int options = kUpb_TokenizerOption_ReportNewlines;
  auto t = upb_Tokenizer_New(nullptr, 0, input, options, arena.ptr());

  // Before Next() is called, the initial token should always be TYPE_START.
  EXPECT_EQ(upb_Tokenizer_Type(t), kUpb_TokenType_Start);
  EXPECT_EQ(upb_Tokenizer_Line(t), 0);
  EXPECT_EQ(upb_Tokenizer_Column(t), 0);
  EXPECT_EQ(upb_Tokenizer_EndColumn(t), 0);
  EXPECT_TRUE(StringEquals(upb_Tokenizer_TextData(t), ""));

  // Loop through all expected tokens.
  TokenFields token_fields;
  upb_Status status;
  upb_Status_Clear(&status);
  int i = 0;
  do {
    token_fields = kMultiWhitespaceTokenCases_case.output[i++];

    SCOPED_TRACE(testing::Message()
                 << "Token #" << i << ": " << token_fields.text);

    // Next() should only return false when it hits the end token.
    if (token_fields.type == kUpb_TokenType_End) {
      EXPECT_FALSE(upb_Tokenizer_Next(t, &status));
      EXPECT_TRUE(upb_Status_IsOk(&status));
    } else {
      EXPECT_TRUE(upb_Tokenizer_Next(t, nullptr));
    }

    // Check that the token matches the expected one.
    EXPECT_EQ(upb_Tokenizer_Type(t), token_fields.type);
    EXPECT_EQ(upb_Tokenizer_Line(t), token_fields.line);
    EXPECT_EQ(upb_Tokenizer_Column(t), token_fields.column);
    EXPECT_EQ(upb_Tokenizer_EndColumn(t), token_fields.end_column);
    EXPECT_TRUE(
        StringEquals(upb_Tokenizer_TextData(t), token_fields.text.data()));
  } while (token_fields.type != kUpb_TokenType_End);
}

// This test causes gcc 3.3.5 (and earlier?) to give the cryptic error:
//   "sorry, unimplemented: `method_call_expr' not supported by dump_expr"
#if !defined(__GNUC__) || __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 3)

TEST_1D(TokenizerTest, ShCommentStyle, kBlockSizes) {
  // Test the "comment_style" option.

  const char* text =
      "foo # bar\n"
      "baz // qux\n"
      "corge /* grault */\n"
      "garply";
  const char* const kTokens[] = {"foo",  // "# bar" is ignored
                                 "baz", "/",      "/", "qux", "corge", "/",
                                 "*",   "grault", "*", "/",   "garply"};

  // Set up the tokenizer.
  upb::Arena arena;
  auto input =
      TestInputStream(text, strlen(text), kBlockSizes_case, arena.ptr());
  const int options = kUpb_TokenizerOption_CommentStyleShell;
  auto t = upb_Tokenizer_New(nullptr, 0, input, options, arena.ptr());

  // Advance through tokens and check that they are parsed as expected.
  for (size_t i = 0; i < arraysize(kTokens); i++) {
    EXPECT_TRUE(upb_Tokenizer_Next(t, nullptr));
    EXPECT_TRUE(StringEquals(upb_Tokenizer_TextData(t), kTokens[i]));
  }

  // There should be no more input and no errors.
  upb_Status status;
  upb_Status_Clear(&status);
  EXPECT_FALSE(upb_Tokenizer_Next(t, &status));
  EXPECT_TRUE(upb_Status_IsOk(&status));
}

#endif

// -------------------------------------------------------------------

#if 0  // TODO: Extended comments are currently unimplemented.

// In each case, the input is expected to have two tokens named "prev" and
// "next" with comments in between.
struct DocCommentCase {
  std::string input;

  const char* prev_trailing_comments;
  const char* detached_comments[10];
  const char* next_leading_comments;
};

inline std::ostream& operator<<(std::ostream& out,
                                const DocCommentCase& test_case) {
  return out << absl::CEscape(test_case.input);
}

DocCommentCase kDocCommentCases[] = {
    {"prev next",

     "",
     {},
     ""},

    {"prev /* ignored */ next",

     "",
     {},
     ""},

    {"prev // trailing comment\n"
     "next",

     " trailing comment\n",
     {},
     ""},

    {"prev\n"
     "// leading comment\n"
     "// line 2\n"
     "next",

     "",
     {},
     " leading comment\n"
     " line 2\n"},

    {"prev\n"
     "// trailing comment\n"
     "// line 2\n"
     "\n"
     "next",

     " trailing comment\n"
     " line 2\n",
     {},
     ""},

    {"prev // trailing comment\n"
     "// leading comment\n"
     "// line 2\n"
     "next",

     " trailing comment\n",
     {},
     " leading comment\n"
     " line 2\n"},

    {"prev /* trailing block comment */\n"
     "/* leading block comment\n"
     " * line 2\n"
     " * line 3 */"
     "next",

     " trailing block comment ",
     {},
     " leading block comment\n"
     " line 2\n"
     " line 3 "},

    {"prev\n"
     "/* trailing block comment\n"
     " * line 2\n"
     " * line 3\n"
     " */\n"
     "/* leading block comment\n"
     " * line 2\n"
     " * line 3 */"
     "next",

     " trailing block comment\n"
     " line 2\n"
     " line 3\n",
     {},
     " leading block comment\n"
     " line 2\n"
     " line 3 "},

    {"prev\n"
     "// trailing comment\n"
     "\n"
     "// detached comment\n"
     "// line 2\n"
     "\n"
     "// second detached comment\n"
     "/* third detached comment\n"
     " * line 2 */\n"
     "// leading comment\n"
     "next",

     " trailing comment\n",
     {" detached comment\n"
      " line 2\n",
      " second detached comment\n",
      " third detached comment\n"
      " line 2 "},
     " leading comment\n"},

    {"prev /**/\n"
     "\n"
     "// detached comment\n"
     "\n"
     "// leading comment\n"
     "next",

     "",
     {" detached comment\n"},
     " leading comment\n"},

    {"prev /**/\n"
     "// leading comment\n"
     "next",

     "",
     {},
     " leading comment\n"},
};

TEST_2D(TokenizerTest, DocComments, kDocCommentCases, kBlockSizes) {
  // Set up the tokenizer.
  TestInputStream input(kDocCommentCases_case.input.data(),
                        kDocCommentCases_case.input.size(), kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);

  // Set up a second tokenizer where we'll pass all NULLs to NextWithComments().
  TestInputStream input2(kDocCommentCases_case.input.data(),
                         kDocCommentCases_case.input.size(), kBlockSizes_case);
  Tokenizer tokenizer2(&input2, &error_collector);

  tokenizer.Next();
  tokenizer2.Next();

  EXPECT_EQ("prev", tokenizer.current().text);
  EXPECT_EQ("prev", tokenizer2.current().text);

  std::string prev_trailing_comments;
  std::vector<std::string> detached_comments;
  std::string next_leading_comments;
  tokenizer.NextWithComments(&prev_trailing_comments, &detached_comments,
                             &next_leading_comments);
  tokenizer2.NextWithComments(nullptr, nullptr, nullptr);
  EXPECT_EQ("next", tokenizer.current().text);
  EXPECT_EQ("next", tokenizer2.current().text);

  EXPECT_EQ(kDocCommentCases_case.prev_trailing_comments,
            prev_trailing_comments);

  for (int i = 0; i < detached_comments.size(); i++) {
    EXPECT_LT(i, arraysize(kDocCommentCases));
    EXPECT_TRUE(kDocCommentCases_case.detached_comments[i] != nullptr);
    EXPECT_EQ(kDocCommentCases_case.detached_comments[i], detached_comments[i]);
  }

  // Verify that we matched all the detached comments.
  EXPECT_EQ(nullptr,
            kDocCommentCases_case.detached_comments[detached_comments.size()]);

  EXPECT_EQ(kDocCommentCases_case.next_leading_comments, next_leading_comments);
}

#endif  // 0

// -------------------------------------------------------------------

// Test parse helpers.
// TODO: Add a fuzz test for this.
TEST_F(TokenizerTest, ParseInteger) {
  EXPECT_EQ(0, ParseInteger("0"));
  EXPECT_EQ(123, ParseInteger("123"));
  EXPECT_EQ(0xabcdef12u, ParseInteger("0xabcdef12"));
  EXPECT_EQ(0xabcdef12u, ParseInteger("0xABCDEF12"));
  EXPECT_EQ(UINT64_MAX, ParseInteger("0xFFFFFFFFFFFFFFFF"));
  EXPECT_EQ(01234567, ParseInteger("01234567"));
  EXPECT_EQ(0X123, ParseInteger("0X123"));

  // Test invalid integers that may still be tokenized as integers.
  EXPECT_EQ(0, ParseInteger("0x"));

  uint64_t i;

  // Test invalid integers that will never be tokenized as integers.
  EXPECT_FALSE(upb_Parse_Integer("zxy", UINT64_MAX, &i));
  EXPECT_FALSE(upb_Parse_Integer("1.2", UINT64_MAX, &i));
  EXPECT_FALSE(upb_Parse_Integer("08", UINT64_MAX, &i));
  EXPECT_FALSE(upb_Parse_Integer("0xg", UINT64_MAX, &i));
  EXPECT_FALSE(upb_Parse_Integer("-1", UINT64_MAX, &i));

  // Test overflows.
  EXPECT_TRUE(upb_Parse_Integer("0", 0, &i));
  EXPECT_FALSE(upb_Parse_Integer("1", 0, &i));
  EXPECT_TRUE(upb_Parse_Integer("1", 1, &i));
  EXPECT_TRUE(upb_Parse_Integer("12345", 12345, &i));
  EXPECT_FALSE(upb_Parse_Integer("12346", 12345, &i));
  EXPECT_TRUE(upb_Parse_Integer("0xFFFFFFFFFFFFFFFF", UINT64_MAX, &i));
  EXPECT_FALSE(upb_Parse_Integer("0x10000000000000000", UINT64_MAX, &i));

  // Test near the limits of signed parsing (values in INT64_MAX +/- 1600)
  for (int64_t offset = -1600; offset <= 1600; ++offset) {
    // We make sure to perform an unsigned addition so that we avoid signed
    // overflow, which would be undefined behavior.
    uint64_t i = 0x7FFFFFFFFFFFFFFFu + static_cast<uint64_t>(offset);
    char decimal[32];
    snprintf(decimal, 32, "%llu", static_cast<unsigned long long>(i));
    if (offset > 0) {
      uint64_t parsed = -1;
      EXPECT_FALSE(upb_Parse_Integer(decimal, INT64_MAX, &parsed))
          << decimal << "=>" << parsed;
    } else {
      uint64_t parsed = -1;
      EXPECT_TRUE(upb_Parse_Integer(decimal, INT64_MAX, &parsed))
          << decimal << "=>" << parsed;
      EXPECT_EQ(parsed, i);
    }
    char octal[32];
    snprintf(octal, 32, "0%llo", static_cast<unsigned long long>(i));
    if (offset > 0) {
      uint64_t parsed = -1;
      EXPECT_FALSE(upb_Parse_Integer(octal, INT64_MAX, &parsed))
          << octal << "=>" << parsed;
    } else {
      uint64_t parsed = -1;
      EXPECT_TRUE(upb_Parse_Integer(octal, INT64_MAX, &parsed))
          << octal << "=>" << parsed;
      EXPECT_EQ(parsed, i);
    }
    char hex[32];
    snprintf(hex, 32, "0x%llx", static_cast<unsigned long long>(i));
    if (offset > 0) {
      uint64_t parsed = -1;
      EXPECT_FALSE(upb_Parse_Integer(hex, INT64_MAX, &parsed))
          << hex << "=>" << parsed;
    } else {
      uint64_t parsed = -1;
      EXPECT_TRUE(upb_Parse_Integer(hex, INT64_MAX, &parsed)) << hex;
      EXPECT_EQ(parsed, i);
    }
  }

  // Test near the limits of unsigned parsing (values in UINT64_MAX +/- 1600).
  // By definition, values greater than UINT64_MAX cannot be held in a uint64_t
  // variable, so printing them is a little tricky; fortunately all but the
  // last four digits are known, so we can hard-code them in the printf string,
  // and we only need to format the last 4.
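  // Concretely: UINT64_MAX is 18446744073709551615, so every value within
  // +/- 1600 of it shares the prefix 1844674407370955 and differs only in
  // its last four digits (1615 + offset, zero-padded by %04llu).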
  for (int64_t offset = -1600; offset <= 1600; ++offset) {
    {
      uint64_t i = 18446744073709551615u + offset;
      char decimal[32];
      snprintf(decimal, 32, "1844674407370955%04llu",
               static_cast<unsigned long long>(1615 + offset));
      if (offset > 0) {
        uint64_t parsed = -1;
        EXPECT_FALSE(upb_Parse_Integer(decimal, UINT64_MAX, &parsed))
            << decimal << "=>" << parsed;
      } else {
        uint64_t parsed = -1;
        EXPECT_TRUE(upb_Parse_Integer(decimal, UINT64_MAX, &parsed)) << decimal;
        EXPECT_EQ(parsed, i);
      }
    }
    {
      uint64_t i = 01777777777777777777777u + offset;
      if (offset > 0) {
        char octal[32];
        snprintf(octal, 32, "0200000000000000000%04llo",
                 static_cast<unsigned long long>(offset - 1));
        uint64_t parsed = -1;
        EXPECT_FALSE(upb_Parse_Integer(octal, UINT64_MAX, &parsed))
            << octal << "=>" << parsed;
      } else {
        char octal[32];
        snprintf(octal, 32, "0%llo", static_cast<unsigned long long>(i));
        uint64_t parsed = -1;
        EXPECT_TRUE(upb_Parse_Integer(octal, UINT64_MAX, &parsed)) << octal;
        EXPECT_EQ(parsed, i);
      }
    }
    {
      uint64_t ui = 0xffffffffffffffffu + offset;
      char hex[32];
      if (offset > 0) {
        snprintf(hex, 32, "0x1000000000000%04llx",
                 static_cast<unsigned long long>(offset - 1));
        uint64_t parsed = -1;
        EXPECT_FALSE(upb_Parse_Integer(hex, UINT64_MAX, &parsed))
            << hex << "=>" << parsed;
      } else {
        snprintf(hex, 32, "0x%llx", static_cast<unsigned long long>(ui));
        uint64_t parsed = -1;
        EXPECT_TRUE(upb_Parse_Integer(hex, UINT64_MAX, &parsed)) << hex;
        EXPECT_EQ(parsed, ui);
      }
    }
  }
}

TEST_F(TokenizerTest, ParseFloat) {
  EXPECT_DOUBLE_EQ(1, upb_Parse_Float("1."));
  EXPECT_DOUBLE_EQ(1e3, upb_Parse_Float("1e3"));
  EXPECT_DOUBLE_EQ(1e3, upb_Parse_Float("1E3"));
  EXPECT_DOUBLE_EQ(1.5e3, upb_Parse_Float("1.5e3"));
  EXPECT_DOUBLE_EQ(.1, upb_Parse_Float(".1"));
  EXPECT_DOUBLE_EQ(.25, upb_Parse_Float(".25"));
  EXPECT_DOUBLE_EQ(.1e3, upb_Parse_Float(".1e3"));
  EXPECT_DOUBLE_EQ(.25e3, upb_Parse_Float(".25e3"));
  EXPECT_DOUBLE_EQ(.1e+3, upb_Parse_Float(".1e+3"));
  EXPECT_DOUBLE_EQ(.1e-3, upb_Parse_Float(".1e-3"));
  EXPECT_DOUBLE_EQ(5, upb_Parse_Float("5"));
  EXPECT_DOUBLE_EQ(6e-12, upb_Parse_Float("6e-12"));
  EXPECT_DOUBLE_EQ(1.2, upb_Parse_Float("1.2"));
  EXPECT_DOUBLE_EQ(1.e2, upb_Parse_Float("1.e2"));

  // Test invalid floats that may still be tokenized as floats.
  EXPECT_DOUBLE_EQ(1, upb_Parse_Float("1e"));
  EXPECT_DOUBLE_EQ(1, upb_Parse_Float("1e-"));
  EXPECT_DOUBLE_EQ(1, upb_Parse_Float("1.e"));

  // Test 'f' suffix.
  EXPECT_DOUBLE_EQ(1, upb_Parse_Float("1f"));
  EXPECT_DOUBLE_EQ(1, upb_Parse_Float("1.0f"));
  EXPECT_DOUBLE_EQ(1, upb_Parse_Float("1F"));

  // These should parse successfully even though they are out of range.
  // Overflows become infinity and underflows become zero.
  EXPECT_EQ(0.0, upb_Parse_Float("1e-9999999999999999999999999999"));
  EXPECT_EQ(HUGE_VAL, upb_Parse_Float("1e+9999999999999999999999999999"));

#if GTEST_HAS_DEATH_TEST  // death tests do not work on Windows yet
  // Test invalid floats that will never be tokenized as floats.
  EXPECT_DEBUG_DEATH(
      upb_Parse_Float("zxy"),
      "passed text that could not have been tokenized as a float");
  EXPECT_DEBUG_DEATH(
      upb_Parse_Float("1-e0"),
      "passed text that could not have been tokenized as a float");
  EXPECT_DEBUG_DEATH(
      upb_Parse_Float("-1.0"),
      "passed text that could not have been tokenized as a float");
#endif  // GTEST_HAS_DEATH_TEST
}

TEST_F(TokenizerTest, ParseString) {
  const std::string inputs[] = {
      "'hello'",
      "\"blah\\nblah2\"",
      "'\\1x\\1\\123\\739\\52\\334n\\3'",
      "'\\x20\\x4'",

      // Test invalid strings that may still be tokenized as strings.
      "\"\\a\\l\\v\\t",  // \l is invalid
      "'",
      "'\\",

      // Experiment with Unicode escapes.
      // Here are one-, two-, three- and four-byte Unicode characters.
      "'\\u0024\\u00a2\\u20ac\\U00024b62XX'",
      "'\\u0024\\u00a2\\u20ac\\ud852\\udf62XX'",  // Same, encoded using UTF16.

      // Here's some broken UTF16: a head surrogate with no tail surrogate.
      // We just output this as if it were UTF8; it's not a defined code point,
      // but it has a defined encoding.
      "'\\ud852XX'",

      // Malformed escape: Demons may fly out of the nose.
      "'\\u0'",

      // Beyond the range of valid UTF-32 code units.
      "'\\U00110000\\U00200000\\UFFFFFFFF'",
  };

  const std::string outputs[] = {
      "hello",
      "blah\nblah2",
      "\1x\1\123\739\52\334n\3",
      "\x20\x4",

      "\a?\v\t",
      "",
      "\\",

      "$¢€𤭢XX",
      "$¢€𤭢XX",

      "\xed\xa1\x92XX",

      "u0",

      "\\U00110000\\U00200000\\Uffffffff",
  };

  upb::Arena arena;

  for (size_t i = 0; i < sizeof(inputs) / sizeof(inputs[0]); i++) {
    auto sv = upb_Parse_String(inputs[i].data(), arena.ptr());
    EXPECT_TRUE(StringEquals(sv.data, outputs[i].data()));
  }

  // Test invalid strings that will never be tokenized as strings.
#if GTEST_HAS_DEATH_TEST  // death tests do not work on Windows yet
  EXPECT_DEBUG_DEATH(
      upb_Parse_String("", arena.ptr()),
      "passed text that could not have been tokenized as a string");
#endif  // GTEST_HAS_DEATH_TEST
}

TEST_F(TokenizerTest, ParseStringAppend) {
  upb::Arena arena;
  upb_String output;
  upb_String_Init(&output, arena.ptr());

  upb_String_Assign(&output, "stuff+", 6);
  auto sv = upb_Parse_String("'hello'", arena.ptr());
  EXPECT_TRUE(StringEquals(sv.data, "hello"));
  upb_String_Append(&output, sv.data, sv.size);
  EXPECT_TRUE(StringEquals(upb_String_Data(&output), "stuff+hello"));
}

// -------------------------------------------------------------------

// Each case parses some input text, ignoring the tokens produced, and
// checks that the error output matches what is expected.
struct ErrorCase {
  std::string input;
  const char* errors;
};
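
// Note: the expected error strings below use the form "line:column: message"
// with zero-based line and column numbers.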

inline std::ostream& operator<<(std::ostream& out,
                                const ErrorCase& test_case) {
  return out << absl::CEscape(test_case.input);
}

ErrorCase kErrorCases[] = {
    // String errors.
    {"'\\l'", "0:2: Invalid escape sequence in string literal."},
    {"'\\X'", "0:2: Invalid escape sequence in string literal."},
    {"'\\x'", "0:3: Expected hex digits for escape sequence."},
    {"'foo", "0:4: Unexpected end of string."},
    {"'bar\nfoo", "0:4: String literals cannot cross line boundaries."},
    {"'\\u01'", "0:5: Expected four hex digits for \\u escape sequence."},
    {"'\\uXYZ'", "0:3: Expected four hex digits for \\u escape sequence."},

    // Integer errors.
    {"123foo", "0:3: Need space between number and identifier."},

    // Hex/octal errors.
    {"0x foo", "0:2: \"0x\" must be followed by hex digits."},
    {"0541823", "0:4: Numbers starting with leading zero must be in octal."},
    {"0x123z", "0:5: Need space between number and identifier."},
    {"0x123.4", "0:5: Hex and octal numbers must be integers."},
    {"0123.4", "0:4: Hex and octal numbers must be integers."},

    // Float errors.
    {"1e foo", "0:2: \"e\" must be followed by exponent."},
    {"1e- foo", "0:3: \"e\" must be followed by exponent."},
    {"1.2.3",
     "0:3: Already saw decimal point or exponent; can't have another one."},
    {"1e2.3",
     "0:3: Already saw decimal point or exponent; can't have another one."},
    {"a.1", "0:1: Need space between identifier and decimal point."},
    // allow_f_after_float not enabled, so this should be an error.
    {"1.0f", "0:3: Need space between number and identifier."},

    // Block comment errors.
    {"/*",
     "0:2: End-of-file inside block comment.\n0:0: Comment started here."},
    {"/*/*/ foo",
     "0:3: \"/*\" inside block comment.  Block comments cannot be nested."},

    // Control characters.  Multiple consecutive control characters should only
    // produce one error.
    {"\b foo", "0:0: Invalid control characters encountered in text."},
    {"\b\b foo", "0:0: Invalid control characters encountered in text."},

    // Check that control characters at end of input don't result in an
    // infinite loop.
    {"\b", "0:0: Invalid control characters encountered in text."},

    // Check recovery from '\0'.  We have to explicitly specify the length of
    // these strings because otherwise the string constructor will just call
    // strlen() which will see the first '\0' and think that is the end of the
    // string.
    {std::string("\0foo", 4),
     "0:0: Invalid control characters encountered in text."},
    {std::string("\0\0foo", 5),
     "0:0: Invalid control characters encountered in text."},

    // Check error from high order bits set
    {"\300", "0:0: Interpreting non ascii codepoint 192."},
};

TEST_2D(TokenizerTest, Errors, kErrorCases, kBlockSizes) {
  // Set up the tokenizer.
  upb::Arena arena;
  auto input = TestInputStream(kErrorCases_case.input.data(),
                               kErrorCases_case.input.size(), kBlockSizes_case,
                               arena.ptr());
  auto t = upb_Tokenizer_New(nullptr, 0, input, 0, arena.ptr());

  upb_Status status;
  upb_Status_Clear(&status);

  while (upb_Tokenizer_Next(t, &status))
    ;  // just keep looping
  EXPECT_TRUE(
      StringEquals(upb_Status_ErrorMessage(&status), kErrorCases_case.errors));
}

// -------------------------------------------------------------------

TEST_1D(TokenizerTest, BackUpOnDestruction, kBlockSizes) {
  const std::string text = "foo bar";
  upb::Arena arena;
  auto input =
      TestInputStream(text.data(), text.size(), kBlockSizes_case, arena.ptr());

  // Create a tokenizer, read one token, then destroy it.
  auto t = upb_Tokenizer_New(nullptr, 0, input, 0, arena.ptr());
  upb_Tokenizer_Next(t, nullptr);
  upb_Tokenizer_Fini(t);

  // Only "foo" should have been read.
  EXPECT_EQ(strlen("foo"), upb_ZeroCopyInputStream_ByteCount(input));
}

static const char* kParseBenchmark[] = {
    "\"partner-google-mobile-modes-print\"",
    "\"partner-google-mobile-modes-products\"",
    "\"partner-google-mobile-modes-realtime\"",
    "\"partner-google-mobile-modes-video\"",
    "\"partner-google-modes-news\"",
    "\"partner-google-modes-places\"",
    "\"partner-google-news\"",
    "\"partner-google-print\"",
    "\"partner-google-products\"",
    "\"partner-google-realtime\"",
    "\"partner-google-video\"",
    "\"true\"",
    "\"BigImagesHover__js_list\"",
    "\"XFEExternJsVersionParameters\"",
    "\"Available versions of the big images hover javascript\"",
    "\"Version: {\n\"",
    "\"  script_name: \"extern_js/dummy_file_compiled_post20070813.js\"\n\"",
    "\"  version_number: 0\n\"",
    "\"}\"",
    "\"BigImagesHover__js_selection\"",
    "\"XFEExternJsVersionParameters\"",
    "\"Versioning info for the big images hover javascript.\"",
    "\"current_version: 0\"",
    "\"BigImagesHover__js_suppressed\"",
    "\"Indicates if the client-side javascript associated with big images.\"",
    "\"true\"",
    "\"BrowserAnyOf\"",
    "\"IsChrome5OrAbove\"",
    "\"IsFirefox3OrAbove\"",
    "IsIE8OrAboveBinary",
    "\"Abe \"Sausage King\" Froman\"",
    "\"Frank \"Meatball\" Febbraro\"",
};

TEST(Benchmark, ParseStringAppendAccumulate) {
  upb::Arena arena;
  size_t outsize = 0;
  int benchmark_len = arraysize(kParseBenchmark);
  for (int i = 0; i < benchmark_len; i++) {
    auto sv = upb_Parse_String(kParseBenchmark[i], arena.ptr());
    outsize += sv.size;
  }
  EXPECT_NE(0, outsize);
}

TEST(Benchmark, ParseStringAppend) {
  upb::Arena arena;
  upb_String output;
  upb_String_Init(&output, arena.ptr());
  int benchmark_len = arraysize(kParseBenchmark);
  for (int i = 0; i < benchmark_len; i++) {
    auto sv = upb_Parse_String(kParseBenchmark[i], arena.ptr());
    upb_String_Append(&output, sv.data, sv.size);
  }
  EXPECT_NE(0, upb_String_Size(&output));
}

// These tests validate the Tokenizer's handling of Unicode escapes.

// Encode a single code point as UTF8.
static std::string StandardUTF8(uint32_t code_point) {
  char buffer[4];
  int count = upb_Unicode_ToUTF8(code_point, &buffer[0]);

  EXPECT_NE(count, 0) << "Failed to encode point " << std::hex << code_point;
  return std::string(reinterpret_cast<const char*>(buffer), count);
}
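
// For example, StandardUTF8(0x24B62) should yield the four-byte sequence
// f0 a4 ad a2, the UTF-8 encoding of U+24B62.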

static std::string DisplayHex(const std::string& data) {
  std::string output;
  for (size_t i = 0; i < data.size(); ++i) {
    absl::StrAppendFormat(&output, "%02x ", data[i]);
  }
  return output;
}

static void ExpectFormat(const std::string& expectation,
                         const std::string& formatted) {
  upb::Arena arena;
  auto sv = upb_Parse_String(formatted.data(), arena.ptr());
  EXPECT_EQ(strcmp(sv.data, expectation.data()), 0)
      << ": Incorrectly parsed " << formatted << ":\nGot      "
      << DisplayHex(sv.data) << "\nExpected " << DisplayHex(expectation);
}

TEST(TokenizerHandlesUnicode, BMPCodes) {
  for (uint32_t code_point = 0; code_point < 0x10000; ++code_point) {
    // The UTF8 encoding of surrogates as single entities is not defined.
    if (upb_Unicode_IsHigh(code_point)) continue;
    if (upb_Unicode_IsLow(code_point)) continue;

    const std::string expectation = StandardUTF8(code_point);

    // Points in the BMP pages can be encoded using either \u with four hex
    // digits, or \U with eight hex digits.
    ExpectFormat(expectation, absl::StrFormat("'\\u%04x'", code_point));
    ExpectFormat(expectation, absl::StrFormat("'\\u%04X'", code_point));
    ExpectFormat(expectation, absl::StrFormat("'\\U%08x'", code_point));
    ExpectFormat(expectation, absl::StrFormat("'\\U%08X'", code_point));
  }
}

TEST(TokenizerHandlesUnicode, NonBMPCodes) {
  for (uint32_t code_point = 0x10000; code_point < 0x110000; ++code_point) {
    const std::string expectation = StandardUTF8(code_point);

    // Points in the non-BMP pages can be encoded using either \U with eight
    // hex digits, or using UTF-16 surrogate pairs.
    ExpectFormat(expectation, absl::StrFormat("'\\U%08x'", code_point));
    ExpectFormat(expectation, absl::StrFormat("'\\U%08X'", code_point));
    ExpectFormat(expectation, absl::StrFormat("'\\u%04x\\u%04x'",
                                              upb_Unicode_ToHigh(code_point),
                                              upb_Unicode_ToLow(code_point)));
  }
}

}  // namespace
}  // namespace io
}  // namespace protobuf
}  // namespace google