1 // Protocol Buffers - Google's data interchange format
2 // Copyright 2023 Google LLC. All rights reserved.
3 //
4 // Use of this source code is governed by a BSD-style
5 // license that can be found in the LICENSE file or at
6 // https://developers.google.com/open-source/licenses/bsd
7
8 #include "upb/io/tokenizer.h"
9
10 #include <gtest/gtest.h>
11 #include "absl/strings/escaping.h"
12 #include "absl/strings/str_format.h"
13 #include "upb/io/chunked_input_stream.h"
14 #include "upb/io/string.h"
15 #include "upb/lex/unicode.h"
16 #include "upb/mem/arena.hpp"
17
18 // Must be last.
19 #include "upb/port/def.inc"
20
21 namespace google {
22 namespace protobuf {
23 namespace io {
24 namespace {
25
// Number of elements in a statically-sized array. Guarded so that a prior
// definition (e.g. from a shared test header) takes precedence.
#ifndef arraysize
#define arraysize(a) (sizeof(a) / sizeof(a[0]))
#endif
29
// Returns true iff the two NUL-terminated strings contain identical text.
static bool StringEquals(const char* a, const char* b) {
  const int diff = strcmp(a, b);
  return diff == 0;
}
33
34 // ===================================================================
35 // Data-Driven Test Infrastructure
36
37 // TODO: This is copied from coded_stream_unittest. This is
38 // temporary until these features are integrated into gTest itself.
39
40 // TEST_1D and TEST_2D are macros I'd eventually like to see added to
41 // gTest. These macros can be used to declare tests which should be
42 // run multiple times, once for each item in some input array. TEST_1D
43 // tests all cases in a single input array. TEST_2D tests all
44 // combinations of cases from two arrays. The arrays must be statically
45 // defined such that the arraysize() macro works on them. Example:
46 //
47 // int kCases[] = {1, 2, 3, 4}
48 // TEST_1D(MyFixture, MyTest, kCases) {
49 // EXPECT_GT(kCases_case, 0);
50 // }
51 //
52 // This test iterates through the numbers 1, 2, 3, and 4 and tests that
// they are all greater than zero. In case of failure, the exact case
54 // which failed will be printed. The case type must be printable using
55 // ostream::operator<<.
56
// Declares a fixture subclass whose single TEST_F body invokes
// DoSingleCase() once per element of CASES, wrapping each call in a
// SCOPED_TRACE that identifies the failing element. The case value is
// visible inside the test body as `CASES_case`.
#define TEST_1D(FIXTURE, NAME, CASES)                             \
  class FIXTURE##_##NAME##_DD : public FIXTURE {                  \
   protected:                                                     \
    template <typename CaseType>                                  \
    void DoSingleCase(const CaseType& CASES##_case);              \
  };                                                              \
                                                                  \
  TEST_F(FIXTURE##_##NAME##_DD, NAME) {                           \
    for (size_t i = 0; i < arraysize(CASES); i++) {               \
      SCOPED_TRACE(testing::Message()                             \
                   << #CASES " case #" << i << ": " << CASES[i]); \
      DoSingleCase(CASES[i]);                                     \
    }                                                             \
  }                                                               \
                                                                  \
  template <typename CaseType>                                    \
  void FIXTURE##_##NAME##_DD::DoSingleCase(const CaseType& CASES##_case)
74
// Two-dimensional variant of TEST_1D: invokes DoSingleCase() once for
// every (CASES1[i], CASES2[j]) pair, with both values identified in the
// SCOPED_TRACE. The pair is visible inside the test body as
// `CASES1_case` / `CASES2_case`.
#define TEST_2D(FIXTURE, NAME, CASES1, CASES2)                              \
  class FIXTURE##_##NAME##_DD : public FIXTURE {                            \
   protected:                                                               \
    template <typename CaseType1, typename CaseType2>                       \
    void DoSingleCase(const CaseType1& CASES1##_case,                       \
                      const CaseType2& CASES2##_case);                      \
  };                                                                        \
                                                                            \
  TEST_F(FIXTURE##_##NAME##_DD, NAME) {                                     \
    for (size_t i = 0; i < arraysize(CASES1); i++) {                        \
      for (size_t j = 0; j < arraysize(CASES2); j++) {                      \
        SCOPED_TRACE(testing::Message()                                     \
                     << #CASES1 " case #" << i << ": " << CASES1[i] << ", " \
                     << #CASES2 " case #" << j << ": " << CASES2[j]);       \
        DoSingleCase(CASES1[i], CASES2[j]);                                 \
      }                                                                     \
    }                                                                       \
  }                                                                         \
                                                                            \
  template <typename CaseType1, typename CaseType2>                         \
  void FIXTURE##_##NAME##_DD::DoSingleCase(const CaseType1& CASES1##_case,  \
                                           const CaseType2& CASES2##_case)
97
98 // -------------------------------------------------------------------
99
// In C, a size of zero from ZCIS_Next() means EOF so we can't play the same
// trick here that happens in the C++ version. Use ChunkedInputStream instead.
//
// Wraps [data, data + size) in a stream that yields at most |block_size|
// bytes per read. The stream is allocated from |arena|.
upb_ZeroCopyInputStream* TestInputStream(const void* data, size_t size,
                                         size_t block_size, upb_Arena* arena) {
  return upb_ChunkedInputStream_New(data, size, block_size, arena);
}
106
// -------------------------------------------------------------------

// We test each operation over a variety of block sizes to ensure that
// we test cases where reads cross buffer boundaries as well as cases
// where they don't. This is sort of a brute-force approach to this,
// but it's easy to write and easy to understand.
const int kBlockSizes[] = {1, 2, 3, 5, 7, 13, 32, 1024};
114
115 class TokenizerTest : public testing::Test {
116 protected:
117 // For easy testing.
ParseInteger(const std::string & text)118 uint64_t ParseInteger(const std::string& text) {
119 uint64_t result;
120 EXPECT_TRUE(upb_Parse_Integer(text.data(), UINT64_MAX, &result))
121 << "'" << text << "'";
122 return result;
123 }
124 };
125
126 // ===================================================================
127
128 // These tests causes gcc 3.3.5 (and earlier?) to give the cryptic error:
129 // "sorry, unimplemented: `method_call_expr' not supported by dump_expr"
130 #if !defined(__GNUC__) || __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 3)
131
// In each test case, the entire input text should parse as a single token
// of the given type.
struct SimpleTokenCase {
  std::string input;   // Exact text fed to the tokenizer.
  upb_TokenType type;  // Expected type of the single resulting token.
};
138
operator <<(std::ostream & out,const SimpleTokenCase & test_case)139 inline std::ostream& operator<<(std::ostream& out,
140 const SimpleTokenCase& test_case) {
141 return out << absl::CEscape(test_case.input);
142 }
143
// Inputs that must each lex as exactly one token of the stated type
// (exercised by the SimpleTokens test below).
SimpleTokenCase kSimpleTokenCases[] = {
    // Test identifiers.
    {"hello", kUpb_TokenType_Identifier},

    // Test integers.
    {"123", kUpb_TokenType_Integer},
    {"0xab6", kUpb_TokenType_Integer},
    {"0XAB6", kUpb_TokenType_Integer},
    {"0X1234567", kUpb_TokenType_Integer},
    {"0x89abcdef", kUpb_TokenType_Integer},
    {"0x89ABCDEF", kUpb_TokenType_Integer},
    {"01234567", kUpb_TokenType_Integer},

    // Test floats.
    {"123.45", kUpb_TokenType_Float},
    {"1.", kUpb_TokenType_Float},
    {"1e3", kUpb_TokenType_Float},
    {"1E3", kUpb_TokenType_Float},
    {"1e-3", kUpb_TokenType_Float},
    {"1e+3", kUpb_TokenType_Float},
    {"1.e3", kUpb_TokenType_Float},
    {"1.2e3", kUpb_TokenType_Float},
    {".1", kUpb_TokenType_Float},
    {".1e3", kUpb_TokenType_Float},
    {".1e-3", kUpb_TokenType_Float},
    {".1e+3", kUpb_TokenType_Float},

    // Test strings. Both quote styles, embedded/escaped quotes, and escapes.
    {"'hello'", kUpb_TokenType_String},
    {"\"foo\"", kUpb_TokenType_String},
    {"'a\"b'", kUpb_TokenType_String},
    {"\"a'b\"", kUpb_TokenType_String},
    {"'a\\'b'", kUpb_TokenType_String},
    {"\"a\\\"b\"", kUpb_TokenType_String},
    {"'\\xf'", kUpb_TokenType_String},
    {"'\\0'", kUpb_TokenType_String},

    // Test symbols.
    {"+", kUpb_TokenType_Symbol},
    {".", kUpb_TokenType_Symbol},
};
185
TEST_2D(TokenizerTest, SimpleTokens, kSimpleTokenCases, kBlockSizes) {
  upb::Arena arena;
  const auto& tc = kSimpleTokenCases_case;

  // Build a tokenizer over the case input, chunked at the current block
  // size so that token scanning crosses buffer boundaries.
  auto input = TestInputStream(tc.input.data(), tc.input.size(),
                               kBlockSizes_case, arena.ptr());
  auto t = upb_Tokenizer_New(nullptr, 0, input, 0, arena.ptr());

  // Before Next() is called, the tokenizer reports a zero-width TYPE_START
  // token at the origin.
  EXPECT_EQ(upb_Tokenizer_Type(t), kUpb_TokenType_Start);
  EXPECT_EQ(upb_Tokenizer_Line(t), 0);
  EXPECT_EQ(upb_Tokenizer_Column(t), 0);
  EXPECT_EQ(upb_Tokenizer_EndColumn(t), 0);
  EXPECT_TRUE(StringEquals(upb_Tokenizer_TextData(t), ""));

  // The whole input should lex as one token of the expected type, located
  // at the start of line 0 and spanning the full input width.
  EXPECT_TRUE(upb_Tokenizer_Next(t, nullptr));
  EXPECT_EQ(upb_Tokenizer_Type(t), tc.type);
  EXPECT_TRUE(StringEquals(upb_Tokenizer_TextData(t), tc.input.data()));
  EXPECT_EQ(upb_Tokenizer_Line(t), 0);
  EXPECT_EQ(upb_Tokenizer_Column(t), 0);
  EXPECT_EQ(upb_Tokenizer_EndColumn(t), tc.input.size());

  // There should be no more input and no errors.
  upb_Status status;
  upb_Status_Clear(&status);
  EXPECT_FALSE(upb_Tokenizer_Next(t, &status));
  EXPECT_TRUE(upb_Status_IsOk(&status));

  // Once exhausted, the tokenizer reports a zero-width TYPE_END token
  // sitting just past the input.
  EXPECT_EQ(upb_Tokenizer_Type(t), kUpb_TokenType_End);
  EXPECT_EQ(upb_Tokenizer_Line(t), 0);
  EXPECT_EQ(upb_Tokenizer_Column(t), tc.input.size());
  EXPECT_EQ(upb_Tokenizer_EndColumn(t), tc.input.size());
  EXPECT_TRUE(StringEquals(upb_Tokenizer_TextData(t), ""));
}
229
TEST_1D(TokenizerTest, FloatSuffix, kBlockSizes) {
  // Test the "allow_f_after_float" option: each token below should come
  // back as a single Float token that includes its trailing 'f'/'F'.
  upb::Arena arena;
  const char* text = "1f 2.5f 6e3f 7F";
  auto input =
      TestInputStream(text, strlen(text), kBlockSizes_case, arena.ptr());
  const int options = kUpb_TokenizerOption_AllowFAfterFloat;
  auto t = upb_Tokenizer_New(nullptr, 0, input, options, arena.ptr());

  // Advance through tokens and check that they are parsed as expected.
  static const char* const kExpected[] = {"1f", "2.5f", "6e3f", "7F"};
  for (const char* expected : kExpected) {
    EXPECT_TRUE(upb_Tokenizer_Next(t, nullptr));
    EXPECT_EQ(upb_Tokenizer_Type(t), kUpb_TokenType_Float);
    EXPECT_TRUE(StringEquals(upb_Tokenizer_TextData(t), expected));
  }

  // There should be no more input and no errors.
  upb_Status status;
  upb_Status_Clear(&status);
  EXPECT_FALSE(upb_Tokenizer_Next(t, &status));
  EXPECT_TRUE(upb_Status_IsOk(&status));
}
266
// Whitespace-only inputs: by default the tokenizer swallows these entirely;
// with kUpb_TokenizerOption_ReportNewlines it emits one Whitespace/Newline
// token covering the whole input (see the Whitespace test below).
SimpleTokenCase kWhitespaceTokenCases[] = {
    {" ", kUpb_TokenType_Whitespace},
    // NOTE(review): this entry renders identically to the previous one; it
    // may have contained a multi-space run lost in formatting — confirm
    // against the upstream file.
    {" ", kUpb_TokenType_Whitespace},
    {"\t", kUpb_TokenType_Whitespace},
    {"\v", kUpb_TokenType_Whitespace},
    {"\t ", kUpb_TokenType_Whitespace},
    {"\v\t", kUpb_TokenType_Whitespace},
    {" \t\r", kUpb_TokenType_Whitespace},
    // Newlines:
    {"\n", kUpb_TokenType_Newline},
};
278
TEST_2D(TokenizerTest, Whitespace, kWhitespaceTokenCases, kBlockSizes) {
  upb::Arena arena;
  const auto& tc = kWhitespaceTokenCases_case;

  // By default, whitespace and newlines are not reported, so a
  // whitespace-only input yields no tokens at all.
  {
    auto input = TestInputStream(tc.input.data(), tc.input.size(),
                                 kBlockSizes_case, arena.ptr());
    auto t = upb_Tokenizer_New(nullptr, 0, input, 0, arena.ptr());

    EXPECT_FALSE(upb_Tokenizer_Next(t, nullptr));
  }

  // With kUpb_TokenizerOption_ReportNewlines, the same input yields exactly
  // one token of the expected type whose text is the entire input.
  {
    auto input = TestInputStream(tc.input.data(), tc.input.size(),
                                 kBlockSizes_case, arena.ptr());
    const int options = kUpb_TokenizerOption_ReportNewlines;
    auto t = upb_Tokenizer_New(nullptr, 0, input, options, arena.ptr());

    EXPECT_TRUE(upb_Tokenizer_Next(t, nullptr));
    EXPECT_EQ(upb_Tokenizer_Type(t), tc.type);
    EXPECT_TRUE(StringEquals(upb_Tokenizer_TextData(t), tc.input.data()));
    EXPECT_FALSE(upb_Tokenizer_Next(t, nullptr));
  }
}
304
305 #endif
306
307 // -------------------------------------------------------------------
308
// Expected properties of one token, compared field-by-field against the
// tokenizer's accessors in the multi-token tests.
struct TokenFields {
  upb_TokenType type;  // Expected token type.
  std::string text;    // Expected raw token text.
  size_t line;         // Expected 0-based line of the token start.
  size_t column;       // Expected 0-based start column.
  size_t end_column;   // Expected column just past the token.
};
316
// In each case, the input is parsed to produce a list of tokens. The
// last token in "output" must have type TYPE_END.
struct MultiTokenCase {
  std::string input;                // Text fed to the tokenizer.
  std::vector<TokenFields> output;  // Expected token sequence, End-terminated.
};
323
operator <<(std::ostream & out,const MultiTokenCase & test_case)324 inline std::ostream& operator<<(std::ostream& out,
325 const MultiTokenCase& test_case) {
326 return out << absl::CEscape(test_case.input);
327 }
328
// Multi-token inputs with full positional expectations. Note that the
// expected columns imply a tab advances to the next multiple of 8 (e.g.
// "foo\tbar" places "bar" at column 8).
MultiTokenCase kMultiTokenCases[] = {
    // Test empty input.
    {"",
     {
         {kUpb_TokenType_End, "", 0, 0, 0},
     }},
    // Test all token types at the same time.
    {"foo 1 1.2 + 'bar'",
     {
         {kUpb_TokenType_Identifier, "foo", 0, 0, 3},
         {kUpb_TokenType_Integer, "1", 0, 4, 5},
         {kUpb_TokenType_Float, "1.2", 0, 6, 9},
         {kUpb_TokenType_Symbol, "+", 0, 10, 11},
         {kUpb_TokenType_String, "'bar'", 0, 12, 17},
         {kUpb_TokenType_End, "", 0, 17, 17},
     }},

    // Test that consecutive symbols are parsed as separate tokens.
    {"!@+%",
     {
         {kUpb_TokenType_Symbol, "!", 0, 0, 1},
         {kUpb_TokenType_Symbol, "@", 0, 1, 2},
         {kUpb_TokenType_Symbol, "+", 0, 2, 3},
         {kUpb_TokenType_Symbol, "%", 0, 3, 4},
         {kUpb_TokenType_End, "", 0, 4, 4},
     }},

    // Test that newlines affect line numbers correctly.
    {"foo bar\nrab oof",
     {
         {kUpb_TokenType_Identifier, "foo", 0, 0, 3},
         {kUpb_TokenType_Identifier, "bar", 0, 4, 7},
         {kUpb_TokenType_Identifier, "rab", 1, 0, 3},
         {kUpb_TokenType_Identifier, "oof", 1, 4, 7},
         {kUpb_TokenType_End, "", 1, 7, 7},
     }},

    // Test that tabs affect column numbers correctly.
    {"foo\tbar  \tbaz",
     {
         {kUpb_TokenType_Identifier, "foo", 0, 0, 3},
         {kUpb_TokenType_Identifier, "bar", 0, 8, 11},
         {kUpb_TokenType_Identifier, "baz", 0, 16, 19},
         {kUpb_TokenType_End, "", 0, 19, 19},
     }},

    // Test that tabs in string literals affect column numbers correctly.
    {"\"foo\tbar\" baz",
     {
         {kUpb_TokenType_String, "\"foo\tbar\"", 0, 0, 12},
         {kUpb_TokenType_Identifier, "baz", 0, 13, 16},
         {kUpb_TokenType_End, "", 0, 16, 16},
     }},

    // Test that line comments are ignored.
    {"foo // This is a comment\n"
     "bar // This is another comment",
     {
         {kUpb_TokenType_Identifier, "foo", 0, 0, 3},
         {kUpb_TokenType_Identifier, "bar", 1, 0, 3},
         {kUpb_TokenType_End, "", 1, 30, 30},
     }},

    // Test that block comments are ignored.
    {"foo /* This is a block comment */ bar",
     {
         {kUpb_TokenType_Identifier, "foo", 0, 0, 3},
         {kUpb_TokenType_Identifier, "bar", 0, 34, 37},
         {kUpb_TokenType_End, "", 0, 37, 37},
     }},

    // Test that sh-style comments are not ignored by default.
    {"foo # bar\n"
     "baz",
     {
         {kUpb_TokenType_Identifier, "foo", 0, 0, 3},
         {kUpb_TokenType_Symbol, "#", 0, 4, 5},
         {kUpb_TokenType_Identifier, "bar", 0, 6, 9},
         {kUpb_TokenType_Identifier, "baz", 1, 0, 3},
         {kUpb_TokenType_End, "", 1, 3, 3},
     }},

    // Test all whitespace chars
    {"foo\n\t\r\v\fbar",
     {
         {kUpb_TokenType_Identifier, "foo", 0, 0, 3},
         {kUpb_TokenType_Identifier, "bar", 1, 11, 14},
         {kUpb_TokenType_End, "", 1, 14, 14},
     }},
};
419
TEST_2D(TokenizerTest, MultipleTokens, kMultiTokenCases, kBlockSizes) {
  upb::Arena arena;
  const auto& tc = kMultiTokenCases_case;

  // Build a tokenizer over the case input at the current block size.
  auto input = TestInputStream(tc.input.data(), tc.input.size(),
                               kBlockSizes_case, arena.ptr());
  auto t = upb_Tokenizer_New(nullptr, 0, input, 0, arena.ptr());

  // Before Next() is called, the initial token should always be TYPE_START.
  EXPECT_EQ(upb_Tokenizer_Type(t), kUpb_TokenType_Start);
  EXPECT_EQ(upb_Tokenizer_Line(t), 0);
  EXPECT_EQ(upb_Tokenizer_Column(t), 0);
  EXPECT_EQ(upb_Tokenizer_EndColumn(t), 0);
  EXPECT_TRUE(StringEquals(upb_Tokenizer_TextData(t), ""));

  upb_Status status;
  upb_Status_Clear(&status);

  // Walk the expected token list; the final entry has type TYPE_END and
  // terminates the loop.
  for (size_t i = 0; i < tc.output.size(); i++) {
    const TokenFields& expected = tc.output[i];
    SCOPED_TRACE(testing::Message() << "Token #" << i + 1 << ": "
                                    << absl::CEscape(expected.text));

    // Next() should only return false when it hits the end token.
    if (expected.type == kUpb_TokenType_End) {
      EXPECT_FALSE(upb_Tokenizer_Next(t, &status));
      EXPECT_TRUE(upb_Status_IsOk(&status));
    } else {
      EXPECT_TRUE(upb_Tokenizer_Next(t, nullptr));
    }

    // Check that the token matches the expected one.
    EXPECT_EQ(upb_Tokenizer_Type(t), expected.type);
    EXPECT_EQ(upb_Tokenizer_Line(t), expected.line);
    EXPECT_EQ(upb_Tokenizer_Column(t), expected.column);
    EXPECT_EQ(upb_Tokenizer_EndColumn(t), expected.end_column);
    EXPECT_EQ(upb_Tokenizer_TextSize(t), expected.text.size());
    EXPECT_TRUE(StringEquals(upb_Tokenizer_TextData(t), expected.text.data()));

    if (expected.type == kUpb_TokenType_End) break;
  }
}
464
// Like kMultiTokenCases, but expecting explicit Whitespace/Newline tokens
// (run with kUpb_TokenizerOption_ReportNewlines). Note a Newline token's
// end_column resets to 0 on the next line.
MultiTokenCase kMultiWhitespaceTokenCases[] = {
    // Test all token types at the same time.
    // NOTE(review): some space runs below appear collapsed by formatting —
    // columns 11..13 imply two spaces before "\n" and columns 0..3 imply
    // three spaces before "+"; confirm the literals against the upstream
    // file.
    {"foo 1 \t1.2 \n +\v'bar'",
     {
         {kUpb_TokenType_Identifier, "foo", 0, 0, 3},
         {kUpb_TokenType_Whitespace, " ", 0, 3, 4},
         {kUpb_TokenType_Integer, "1", 0, 4, 5},
         {kUpb_TokenType_Whitespace, " \t", 0, 5, 8},
         {kUpb_TokenType_Float, "1.2", 0, 8, 11},
         {kUpb_TokenType_Whitespace, " ", 0, 11, 13},
         {kUpb_TokenType_Newline, "\n", 0, 13, 0},
         {kUpb_TokenType_Whitespace, " ", 1, 0, 3},
         {kUpb_TokenType_Symbol, "+", 1, 3, 4},
         {kUpb_TokenType_Whitespace, "\v", 1, 4, 5},
         {kUpb_TokenType_String, "'bar'", 1, 5, 10},
         {kUpb_TokenType_End, "", 1, 10, 10},
     }},

};
484
TEST_2D(TokenizerTest, MultipleWhitespaceTokens, kMultiWhitespaceTokenCases,
        kBlockSizes) {
  upb::Arena arena;
  const auto& tc = kMultiWhitespaceTokenCases_case;

  // Build a tokenizer that reports whitespace and newline tokens.
  auto input = TestInputStream(tc.input.data(), tc.input.size(),
                               kBlockSizes_case, arena.ptr());
  const int options = kUpb_TokenizerOption_ReportNewlines;
  auto t = upb_Tokenizer_New(nullptr, 0, input, options, arena.ptr());

  // Before Next() is called, the initial token should always be TYPE_START.
  EXPECT_EQ(upb_Tokenizer_Type(t), kUpb_TokenType_Start);
  EXPECT_EQ(upb_Tokenizer_Line(t), 0);
  EXPECT_EQ(upb_Tokenizer_Column(t), 0);
  EXPECT_EQ(upb_Tokenizer_EndColumn(t), 0);
  EXPECT_TRUE(StringEquals(upb_Tokenizer_TextData(t), ""));

  upb_Status status;
  upb_Status_Clear(&status);

  // Walk the expected token list; the final entry has type TYPE_END and
  // terminates the loop.
  for (size_t i = 0; i < tc.output.size(); i++) {
    const TokenFields& expected = tc.output[i];
    SCOPED_TRACE(testing::Message()
                 << "Token #" << i + 1 << ": " << expected.text);

    // Next() should only return false when it hits the end token.
    if (expected.type == kUpb_TokenType_End) {
      EXPECT_FALSE(upb_Tokenizer_Next(t, &status));
      EXPECT_TRUE(upb_Status_IsOk(&status));
    } else {
      EXPECT_TRUE(upb_Tokenizer_Next(t, nullptr));
    }

    // Check that the token matches the expected one.
    EXPECT_EQ(upb_Tokenizer_Type(t), expected.type);
    EXPECT_EQ(upb_Tokenizer_Line(t), expected.line);
    EXPECT_EQ(upb_Tokenizer_Column(t), expected.column);
    EXPECT_EQ(upb_Tokenizer_EndColumn(t), expected.end_column);
    EXPECT_TRUE(StringEquals(upb_Tokenizer_TextData(t), expected.text.data()));

    if (expected.type == kUpb_TokenType_End) break;
  }
}
530
531 // This test causes gcc 3.3.5 (and earlier?) to give the cryptic error:
532 // "sorry, unimplemented: `method_call_expr' not supported by dump_expr"
533 #if !defined(__GNUC__) || __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 3)
534
TEST_1D(TokenizerTest, ShCommentStyle, kBlockSizes) {
  // Test the "comment_style" option: with shell-style comments, "# bar" is
  // dropped, while C/C++ comment markers are just ordinary symbol tokens.
  const char* text =
      "foo # bar\n"
      "baz // qux\n"
      "corge /* grault */\n"
      "garply";
  const char* const kTokens[] = {"foo",  // "# bar" is ignored
                                 "baz", "/", "/", "qux", "corge", "/",
                                 "*", "grault", "*", "/", "garply"};

  // Set up the tokenizer.
  upb::Arena arena;
  auto input =
      TestInputStream(text, strlen(text), kBlockSizes_case, arena.ptr());
  const int options = kUpb_TokenizerOption_CommentStyleShell;
  auto t = upb_Tokenizer_New(nullptr, 0, input, options, arena.ptr());

  // Advance through tokens and check that they are parsed as expected.
  for (const char* token : kTokens) {
    EXPECT_TRUE(upb_Tokenizer_Next(t, nullptr));
    EXPECT_TRUE(StringEquals(upb_Tokenizer_TextData(t), token));
  }

  // There should be no more input and no errors.
  upb_Status status;
  upb_Status_Clear(&status);
  EXPECT_FALSE(upb_Tokenizer_Next(t, &status));
  EXPECT_TRUE(upb_Status_IsOk(&status));
}
566
567 #endif
568
569 // -------------------------------------------------------------------
570
571 #if 0 // TODO: Extended comments are currently unimplemented.
572
// In each case, the input is expected to have two tokens named "prev" and
// "next" with comments in between.
struct DocCommentCase {
  std::string input;

  // Expected comment attachment as produced by NextWithComments(). The
  // detached_comments array is checked up to the first nullptr entry
  // (unused slots are zero-initialized by aggregate init).
  const char* prev_trailing_comments;
  const char* detached_comments[10];
  const char* next_leading_comments;
};
582
// Prints the case input C-escaped so SCOPED_TRACE output stays readable.
inline std::ostream& operator<<(std::ostream& out,
                                const DocCommentCase& test_case) {
  return out << absl::CEscape(test_case.input);
}
587
// Cases covering trailing, leading, and detached comments in both line and
// block styles. (Currently disabled along with the DocComments test below.)
DocCommentCase kDocCommentCases[] = {
    {"prev next",

     "",
     {},
     ""},

    {"prev /* ignored */ next",

     "",
     {},
     ""},

    {"prev // trailing comment\n"
     "next",

     " trailing comment\n",
     {},
     ""},

    {"prev\n"
     "// leading comment\n"
     "// line 2\n"
     "next",

     "",
     {},
     " leading comment\n"
     " line 2\n"},

    {"prev\n"
     "// trailing comment\n"
     "// line 2\n"
     "\n"
     "next",

     " trailing comment\n"
     " line 2\n",
     {},
     ""},

    {"prev // trailing comment\n"
     "// leading comment\n"
     "// line 2\n"
     "next",

     " trailing comment\n",
     {},
     " leading comment\n"
     " line 2\n"},

    {"prev /* trailing block comment */\n"
     "/* leading block comment\n"
     " * line 2\n"
     " * line 3 */"
     "next",

     " trailing block comment ",
     {},
     " leading block comment\n"
     " line 2\n"
     " line 3 "},

    {"prev\n"
     "/* trailing block comment\n"
     " * line 2\n"
     " * line 3\n"
     " */\n"
     "/* leading block comment\n"
     " * line 2\n"
     " * line 3 */"
     "next",

     " trailing block comment\n"
     " line 2\n"
     " line 3\n",
     {},
     " leading block comment\n"
     " line 2\n"
     " line 3 "},

    {"prev\n"
     "// trailing comment\n"
     "\n"
     "// detached comment\n"
     "// line 2\n"
     "\n"
     "// second detached comment\n"
     "/* third detached comment\n"
     " * line 2 */\n"
     "// leading comment\n"
     "next",

     " trailing comment\n",
     {" detached comment\n"
      " line 2\n",
      " second detached comment\n",
      " third detached comment\n"
      " line 2 "},
     " leading comment\n"},

    {"prev /**/\n"
     "\n"
     "// detached comment\n"
     "\n"
     "// leading comment\n"
     "next",

     "",
     {" detached comment\n"},
     " leading comment\n"},

    {"prev /**/\n"
     "// leading comment\n"
     "next",

     "",
     {},
     " leading comment\n"},
};
708
// NOTE: disabled via the surrounding `#if 0` — this body still uses the
// C++ TestInputStream/Tokenizer/NextWithComments API rather than the upb C
// API used elsewhere in this file (per the TODO: extended comments are
// currently unimplemented).
TEST_2D(TokenizerTest, DocComments, kDocCommentCases, kBlockSizes) {
  // Set up the tokenizer.
  TestInputStream input(kDocCommentCases_case.input.data(),
                        kDocCommentCases_case.input.size(), kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);

  // Set up a second tokenizer where we'll pass all NULLs to NextWithComments().
  TestInputStream input2(kDocCommentCases_case.input.data(),
                         kDocCommentCases_case.input.size(), kBlockSizes_case);
  Tokenizer tokenizer2(&input2, &error_collector);

  tokenizer.Next();
  tokenizer2.Next();

  EXPECT_EQ("prev", tokenizer.current().text);
  EXPECT_EQ("prev", tokenizer2.current().text);

  std::string prev_trailing_comments;
  std::vector<std::string> detached_comments;
  std::string next_leading_comments;
  tokenizer.NextWithComments(&prev_trailing_comments, &detached_comments,
                             &next_leading_comments);
  tokenizer2.NextWithComments(nullptr, nullptr, nullptr);
  EXPECT_EQ("next", tokenizer.current().text);
  EXPECT_EQ("next", tokenizer2.current().text);

  EXPECT_EQ(kDocCommentCases_case.prev_trailing_comments,
            prev_trailing_comments);

  for (int i = 0; i < detached_comments.size(); i++) {
    EXPECT_LT(i, arraysize(kDocCommentCases));
    EXPECT_TRUE(kDocCommentCases_case.detached_comments[i] != nullptr);
    EXPECT_EQ(kDocCommentCases_case.detached_comments[i], detached_comments[i]);
  }

  // Verify that we matched all the detached comments.
  EXPECT_EQ(nullptr,
            kDocCommentCases_case.detached_comments[detached_comments.size()]);

  EXPECT_EQ(kDocCommentCases_case.next_leading_comments, next_leading_comments);
}
751
752 #endif // 0
753
754 // -------------------------------------------------------------------
755
// Test parse helpers.
// TODO: Add a fuzz test for this.
TEST_F(TokenizerTest, ParseInteger) {
  // Basic decimal, hex (both cases), and octal forms.
  EXPECT_EQ(0, ParseInteger("0"));
  EXPECT_EQ(123, ParseInteger("123"));
  EXPECT_EQ(0xabcdef12u, ParseInteger("0xabcdef12"));
  EXPECT_EQ(0xabcdef12u, ParseInteger("0xABCDEF12"));
  EXPECT_EQ(UINT64_MAX, ParseInteger("0xFFFFFFFFFFFFFFFF"));
  EXPECT_EQ(01234567, ParseInteger("01234567"));
  EXPECT_EQ(0X123, ParseInteger("0X123"));

  // Test invalid integers that may still be tokenized as integers.
  EXPECT_EQ(0, ParseInteger("0x"));

  uint64_t i;

  // Test invalid integers that will never be tokenized as integers.
  EXPECT_FALSE(upb_Parse_Integer("zxy", UINT64_MAX, &i));
  EXPECT_FALSE(upb_Parse_Integer("1.2", UINT64_MAX, &i));
  EXPECT_FALSE(upb_Parse_Integer("08", UINT64_MAX, &i));
  EXPECT_FALSE(upb_Parse_Integer("0xg", UINT64_MAX, &i));
  EXPECT_FALSE(upb_Parse_Integer("-1", UINT64_MAX, &i));

  // Test overflows: the second argument is the maximum value allowed.
  EXPECT_TRUE(upb_Parse_Integer("0", 0, &i));
  EXPECT_FALSE(upb_Parse_Integer("1", 0, &i));
  EXPECT_TRUE(upb_Parse_Integer("1", 1, &i));
  EXPECT_TRUE(upb_Parse_Integer("12345", 12345, &i));
  EXPECT_FALSE(upb_Parse_Integer("12346", 12345, &i));
  EXPECT_TRUE(upb_Parse_Integer("0xFFFFFFFFFFFFFFFF", UINT64_MAX, &i));
  EXPECT_FALSE(upb_Parse_Integer("0x10000000000000000", UINT64_MAX, &i));

  // Test near the limits of signed parsing (values in INT64_MAX +/- 1600)
  for (int64_t offset = -1600; offset <= 1600; ++offset) {
    // We make sure to perform an unsigned addition so that we avoid signed
    // overflow, which would be undefined behavior.
    uint64_t i = 0x7FFFFFFFFFFFFFFFu + static_cast<uint64_t>(offset);
    // Decimal rendering of the candidate value.
    char decimal[32];
    snprintf(decimal, 32, "%llu", static_cast<unsigned long long>(i));
    if (offset > 0) {
      // Above INT64_MAX: must be rejected against an INT64_MAX cap.
      uint64_t parsed = -1;
      EXPECT_FALSE(upb_Parse_Integer(decimal, INT64_MAX, &parsed))
          << decimal << "=>" << parsed;
    } else {
      // At or below INT64_MAX: must parse exactly.
      uint64_t parsed = -1;
      EXPECT_TRUE(upb_Parse_Integer(decimal, INT64_MAX, &parsed))
          << decimal << "=>" << parsed;
      EXPECT_EQ(parsed, i);
    }
    // Same value, octal rendering.
    char octal[32];
    snprintf(octal, 32, "0%llo", static_cast<unsigned long long>(i));
    if (offset > 0) {
      uint64_t parsed = -1;
      EXPECT_FALSE(upb_Parse_Integer(octal, INT64_MAX, &parsed))
          << octal << "=>" << parsed;
    } else {
      uint64_t parsed = -1;
      EXPECT_TRUE(upb_Parse_Integer(octal, INT64_MAX, &parsed))
          << octal << "=>" << parsed;
      EXPECT_EQ(parsed, i);
    }
    // Same value, hex rendering.
    char hex[32];
    snprintf(hex, 32, "0x%llx", static_cast<unsigned long long>(i));
    if (offset > 0) {
      uint64_t parsed = -1;
      EXPECT_FALSE(upb_Parse_Integer(hex, INT64_MAX, &parsed))
          << hex << "=>" << parsed;
    } else {
      uint64_t parsed = -1;
      EXPECT_TRUE(upb_Parse_Integer(hex, INT64_MAX, &parsed)) << hex;
      EXPECT_EQ(parsed, i);
    }
    // EXPECT_NE(offset, -237);
  }

  // Test near the limits of unsigned parsing (values in UINT64_MAX +/- 1600)
  // By definition, values greater than UINT64_MAX cannot be held in a uint64_t
  // variable, so printing them is a little tricky; fortunately all but the
  // last four digits are known, so we can hard-code them in the printf string,
  // and we only need to format the last 4.
  for (int64_t offset = -1600; offset <= 1600; ++offset) {
    {
      // Decimal: UINT64_MAX is 18446744073709551615, so only the last four
      // digits vary across the tested window.
      uint64_t i = 18446744073709551615u + offset;
      char decimal[32];
      snprintf(decimal, 32, "1844674407370955%04llu",
               static_cast<unsigned long long>(1615 + offset));
      if (offset > 0) {
        uint64_t parsed = -1;
        EXPECT_FALSE(upb_Parse_Integer(decimal, UINT64_MAX, &parsed))
            << decimal << "=>" << parsed;
      } else {
        uint64_t parsed = -1;
        EXPECT_TRUE(upb_Parse_Integer(decimal, UINT64_MAX, &parsed)) << decimal;
        EXPECT_EQ(parsed, i);
      }
    }
    {
      // Octal: UINT64_MAX is 01777777777777777777777.
      uint64_t i = 01777777777777777777777u + offset;
      if (offset > 0) {
        char octal[32];
        snprintf(octal, 32, "0200000000000000000%04llo",
                 static_cast<unsigned long long>(offset - 1));
        uint64_t parsed = -1;
        EXPECT_FALSE(upb_Parse_Integer(octal, UINT64_MAX, &parsed))
            << octal << "=>" << parsed;
      } else {
        char octal[32];
        snprintf(octal, 32, "0%llo", static_cast<unsigned long long>(i));
        uint64_t parsed = -1;
        EXPECT_TRUE(upb_Parse_Integer(octal, UINT64_MAX, &parsed)) << octal;
        EXPECT_EQ(parsed, i);
      }
    }
    {
      // Hex: UINT64_MAX is 0xffffffffffffffff.
      uint64_t ui = 0xffffffffffffffffu + offset;
      char hex[32];
      if (offset > 0) {
        snprintf(hex, 32, "0x1000000000000%04llx",
                 static_cast<unsigned long long>(offset - 1));
        uint64_t parsed = -1;
        EXPECT_FALSE(upb_Parse_Integer(hex, UINT64_MAX, &parsed))
            << hex << "=>" << parsed;
      } else {
        snprintf(hex, 32, "0x%llx", static_cast<unsigned long long>(ui));
        uint64_t parsed = -1;
        EXPECT_TRUE(upb_Parse_Integer(hex, UINT64_MAX, &parsed)) << hex;
        EXPECT_EQ(parsed, ui);
      }
    }
  }
}
887
TEST_F(TokenizerTest, ParseFloat) {
  // Well-formed floats in every shape the tokenizer can produce.
  EXPECT_DOUBLE_EQ(1, upb_Parse_Float("1."));
  EXPECT_DOUBLE_EQ(1e3, upb_Parse_Float("1e3"));
  EXPECT_DOUBLE_EQ(1e3, upb_Parse_Float("1E3"));
  EXPECT_DOUBLE_EQ(1.5e3, upb_Parse_Float("1.5e3"));
  EXPECT_DOUBLE_EQ(.1, upb_Parse_Float(".1"));
  EXPECT_DOUBLE_EQ(.25, upb_Parse_Float(".25"));
  EXPECT_DOUBLE_EQ(.1e3, upb_Parse_Float(".1e3"));
  EXPECT_DOUBLE_EQ(.25e3, upb_Parse_Float(".25e3"));
  EXPECT_DOUBLE_EQ(.1e+3, upb_Parse_Float(".1e+3"));
  EXPECT_DOUBLE_EQ(.1e-3, upb_Parse_Float(".1e-3"));
  EXPECT_DOUBLE_EQ(5, upb_Parse_Float("5"));
  EXPECT_DOUBLE_EQ(6e-12, upb_Parse_Float("6e-12"));
  EXPECT_DOUBLE_EQ(1.2, upb_Parse_Float("1.2"));
  EXPECT_DOUBLE_EQ(1.e2, upb_Parse_Float("1.e2"));

  // Test invalid integers that may still be tokenized as integers.
  // A dangling exponent marker parses as the mantissa alone.
  EXPECT_DOUBLE_EQ(1, upb_Parse_Float("1e"));
  EXPECT_DOUBLE_EQ(1, upb_Parse_Float("1e-"));
  EXPECT_DOUBLE_EQ(1, upb_Parse_Float("1.e"));

  // Test 'f' suffix: accepted and ignored in either case.
  EXPECT_DOUBLE_EQ(1, upb_Parse_Float("1f"));
  EXPECT_DOUBLE_EQ(1, upb_Parse_Float("1.0f"));
  EXPECT_DOUBLE_EQ(1, upb_Parse_Float("1F"));

  // These should parse successfully even though they are out of range.
  // Overflows become infinity and underflows become zero.
  EXPECT_EQ(0.0, upb_Parse_Float("1e-9999999999999999999999999999"));
  EXPECT_EQ(HUGE_VAL, upb_Parse_Float("1e+9999999999999999999999999999"));

#if GTEST_HAS_DEATH_TEST  // death tests do not work on Windows yet
  // Test invalid integers that will never be tokenized as integers; in
  // debug builds these should trip the precondition check.
  EXPECT_DEBUG_DEATH(
      upb_Parse_Float("zxy"),
      "passed text that could not have been tokenized as a float");
  EXPECT_DEBUG_DEATH(
      upb_Parse_Float("1-e0"),
      "passed text that could not have been tokenized as a float");
  EXPECT_DEBUG_DEATH(
      upb_Parse_Float("-1.0"),
      "passed text that could not have been tokenized as a float");
#endif  // GTEST_HAS_DEATH_TEST
}
932
TEST_F(TokenizerTest, ParseString) {
  // Inputs below are matched pairwise with `outputs`: inputs[i] must parse
  // to exactly outputs[i]. The escape sequences are deliberately tricky, so
  // keep the two arrays in lockstep when editing.
  const std::string inputs[] = {
      "'hello'",
      "\"blah\\nblah2\"",
      "'\\1x\\1\\123\\739\\52\\334n\\3'",
      "'\\x20\\x4'",

      // Test invalid strings that may still be tokenized as strings.
      "\"\\a\\l\\v\\t",  // \l is invalid
      "'",
      "'\\",

      // Experiment with Unicode escapes.
      // Here are one-, two- and three-byte Unicode characters.
      "'\\u0024\\u00a2\\u20ac\\U00024b62XX'",
      "'\\u0024\\u00a2\\u20ac\\ud852\\udf62XX'",  // Same, encoded using UTF16.

      // Here's some broken UTF16: a head surrogate with no tail surrogate.
      // We just output this as if it were UTF8; it's not a defined code point,
      // but it has a defined encoding.
      "'\\ud852XX'",

      // Malformed escape: Demons may fly out of the nose.
      "'\\u0'",

      // Beyond the range of valid UTF-32 code units.
      "'\\U00110000\\U00200000\\UFFFFFFFF'",
  };

  const std::string outputs[] = {
      "hello",
      "blah\nblah2",
      "\1x\1\123\739\52\334n\3",
      "\x20\x4",

      // The invalid escape \l becomes '?'; the other escapes still decode.
      "\a?\v\t",
      "",
      "\\",

      "$¢€XX",
      "$¢€XX",

      // The lone head surrogate, encoded as if it were UTF-8.
      "\xed\xa1\x92XX",

      // The malformed \u escape is emitted without its backslash.
      "u0",

      // Out-of-range \U escapes pass through (with lowercased hex digits).
      "\\U00110000\\U00200000\\Uffffffff",
  };

  upb::Arena arena;

  // Every input must decode to its paired expected output.
  for (size_t i = 0; i < sizeof(inputs) / sizeof(inputs[0]); i++) {
    auto sv = upb_Parse_String(inputs[i].data(), arena.ptr());
    EXPECT_TRUE(StringEquals(sv.data, outputs[i].data()));
  }

  // Test invalid strings that will never be tokenized as strings.
#if GTEST_HAS_DEATH_TEST  // death tests do not work on Windows yet
  EXPECT_DEBUG_DEATH(
      upb_Parse_String("", arena.ptr()),
      "passed text that could not have been tokenized as a string");
#endif  // GTEST_HAS_DEATH_TEST
}
996
TEST_F(TokenizerTest, ParseStringAppend) {
  // Verify that parsed string data can be appended onto the existing
  // contents of a upb_String without disturbing them.
  upb::Arena arena;
  upb_String output;
  upb_String_Init(&output, arena.ptr());
  upb_String_Assign(&output, "stuff+", 6);

  const auto parsed = upb_Parse_String("'hello'", arena.ptr());
  EXPECT_TRUE(StringEquals(parsed.data, "hello"));

  upb_String_Append(&output, parsed.data, parsed.size);
  EXPECT_TRUE(StringEquals(upb_String_Data(&output), "stuff+hello"));
}
1008
1009 // -------------------------------------------------------------------
1010
1011 // Each case parses some input text, ignoring the tokens produced, and
1012 // checks that the error output matches what is expected.
struct ErrorCase {
  std::string input;   // Text fed to the tokenizer.
  const char* errors;  // Exact "line:col: message" output expected.
};
1017
operator <<(std::ostream & out,const ErrorCase & test_case)1018 inline std::ostream& operator<<(std::ostream& out, const ErrorCase& test_case) {
1019 return out << absl::CEscape(test_case.input);
1020 }
1021
// Inputs paired with the exact error text the tokenizer must report.
ErrorCase kErrorCases[] = {
    // String errors.
    {"'\\l'", "0:2: Invalid escape sequence in string literal."},
    {"'\\X'", "0:2: Invalid escape sequence in string literal."},
    {"'\\x'", "0:3: Expected hex digits for escape sequence."},
    {"'foo", "0:4: Unexpected end of string."},
    {"'bar\nfoo", "0:4: String literals cannot cross line boundaries."},
    {"'\\u01'", "0:5: Expected four hex digits for \\u escape sequence."},
    {"'\\uXYZ'", "0:3: Expected four hex digits for \\u escape sequence."},

    // Integer errors.
    {"123foo", "0:3: Need space between number and identifier."},

    // Hex/octal errors.
    {"0x foo", "0:2: \"0x\" must be followed by hex digits."},
    {"0541823", "0:4: Numbers starting with leading zero must be in octal."},
    {"0x123z", "0:5: Need space between number and identifier."},
    {"0x123.4", "0:5: Hex and octal numbers must be integers."},
    {"0123.4", "0:4: Hex and octal numbers must be integers."},

    // Float errors.
    {"1e foo", "0:2: \"e\" must be followed by exponent."},
    {"1e- foo", "0:3: \"e\" must be followed by exponent."},
    {"1.2.3",
     "0:3: Already saw decimal point or exponent; can't have another one."},
    {"1e2.3",
     "0:3: Already saw decimal point or exponent; can't have another one."},
    {"a.1", "0:1: Need space between identifier and decimal point."},
    // allow_f_after_float not enabled, so this should be an error.
    {"1.0f", "0:3: Need space between number and identifier."},

    // Block comment errors.
    {"/*",
     "0:2: End-of-file inside block comment.\n0:0: Comment started here."},
    {"/*/*/ foo",
     "0:3: \"/*\" inside block comment. Block comments cannot be nested."},

    // Control characters. Multiple consecutive control characters should only
    // produce one error.
    {"\b foo", "0:0: Invalid control characters encountered in text."},
    {"\b\b foo", "0:0: Invalid control characters encountered in text."},

    // Check that control characters at end of input don't result in an
    // infinite loop.
    {"\b", "0:0: Invalid control characters encountered in text."},

    // Check recovery from '\0'. We have to explicitly specify the length of
    // these strings because otherwise the string constructor will just call
    // strlen() which will see the first '\0' and think that is the end of the
    // string.
    {std::string("\0foo", 4),
     "0:0: Invalid control characters encountered in text."},
    {std::string("\0\0foo", 5),
     "0:0: Invalid control characters encountered in text."},

    // Check error from high order bits set
    {"\300", "0:0: Interpreting non ascii codepoint 192."},
};
1080
TEST_2D(TokenizerTest, Errors, kErrorCases, kBlockSizes) {
  // Tokenize the entire input, discarding the tokens; only the error
  // message recorded in the status matters here.
  upb::Arena arena;
  auto stream = TestInputStream(kErrorCases_case.input.data(),
                                kErrorCases_case.input.size(),
                                kBlockSizes_case, arena.ptr());
  auto tokenizer = upb_Tokenizer_New(nullptr, 0, stream, 0, arena.ptr());

  upb_Status status;
  upb_Status_Clear(&status);
  while (upb_Tokenizer_Next(tokenizer, &status)) {
    // Drain tokens until an error (or end of input) stops the loop.
  }

  EXPECT_TRUE(StringEquals(upb_Status_ErrorMessage(&status),
                           kErrorCases_case.errors));
}
1097
1098 // -------------------------------------------------------------------
1099
TEST_1D(TokenizerTest, BackUpOnDestruction, kBlockSizes) {
  // After reading a single token and finalizing the tokenizer, the
  // underlying stream must be backed up to just past that token.
  const std::string text = "foo bar";
  upb::Arena arena;
  auto stream =
      TestInputStream(text.data(), text.size(), kBlockSizes_case, arena.ptr());

  auto tokenizer = upb_Tokenizer_New(nullptr, 0, stream, 0, arena.ptr());
  upb_Tokenizer_Next(tokenizer, nullptr);
  upb_Tokenizer_Fini(tokenizer);

  // Only "foo" should have been consumed from the stream.
  EXPECT_EQ(strlen("foo"), upb_ZeroCopyInputStream_ByteCount(stream));
}
1114
// Realistic string literals (including nested quotes and embedded newlines)
// used as inputs by the parsing benchmarks below.
static const char* kParseBenchmark[] = {
    "\"partner-google-mobile-modes-print\"",
    "\"partner-google-mobile-modes-products\"",
    "\"partner-google-mobile-modes-realtime\"",
    "\"partner-google-mobile-modes-video\"",
    "\"partner-google-modes-news\"",
    "\"partner-google-modes-places\"",
    "\"partner-google-news\"",
    "\"partner-google-print\"",
    "\"partner-google-products\"",
    "\"partner-google-realtime\"",
    "\"partner-google-video\"",
    "\"true\"",
    "\"BigImagesHover__js_list\"",
    "\"XFEExternJsVersionParameters\"",
    "\"Available versions of the big images hover javascript\"",
    "\"Version: {\n\"",
    "\" script_name: \"extern_js/dummy_file_compiled_post20070813.js\"\n\"",
    "\" version_number: 0\n\"",
    "\"}\"",
    "\"BigImagesHover__js_selection\"",
    "\"XFEExternJsVersionParameters\"",
    "\"Versioning info for the big images hover javascript.\"",
    "\"current_version: 0\"",
    "\"BigImagesHover__js_suppressed\"",
    "\"Indicates if the client-side javascript associated with big images.\"",
    "\"true\"",
    "\"BrowserAnyOf\"",
    "\"IsChrome5OrAbove\"",
    "\"IsFirefox3OrAbove\"",
    "IsIE8OrAboveBinary",
    "\"Abe \"Sausage King\" Froman\"",
    "\"Frank \"Meatball\" Febbraro\"",
};
1149
TEST(Benchmark, ParseStringAppendAccumulate) {
  // Parse every benchmark string and accumulate the total parsed size so
  // the parsing work cannot be optimized away.
  upb::Arena arena;
  size_t total_size = 0;
  for (const char* input : kParseBenchmark) {
    total_size += upb_Parse_String(input, arena.ptr()).size;
  }
  EXPECT_NE(0, total_size);
}
1160
TEST(Benchmark, ParseStringAppend) {
  // Parse every benchmark string, appending each result into one
  // upb_String, and check that something was actually accumulated.
  upb::Arena arena;
  upb_String sink;
  upb_String_Init(&sink, arena.ptr());
  for (const char* input : kParseBenchmark) {
    const auto parsed = upb_Parse_String(input, arena.ptr());
    upb_String_Append(&sink, parsed.data, parsed.size);
  }
  EXPECT_NE(0, upb_String_Size(&sink));
}
1172
1173 // These tests validate the Tokenizer's handling of Unicode escapes.
1174
1175 // Encode a single code point as UTF8.
StandardUTF8(uint32_t code_point)1176 static std::string StandardUTF8(uint32_t code_point) {
1177 char buffer[4];
1178 int count = upb_Unicode_ToUTF8(code_point, &buffer[0]);
1179
1180 EXPECT_NE(count, 0) << "Failed to encode point " << std::hex << code_point;
1181 return std::string(reinterpret_cast<const char*>(buffer), count);
1182 }
1183
DisplayHex(const std::string & data)1184 static std::string DisplayHex(const std::string& data) {
1185 std::string output;
1186 for (size_t i = 0; i < data.size(); ++i) {
1187 absl::StrAppendFormat(&output, "%02x ", data[i]);
1188 }
1189 return output;
1190 }
1191
ExpectFormat(const std::string & expectation,const std::string & formatted)1192 static void ExpectFormat(const std::string& expectation,
1193 const std::string& formatted) {
1194 upb::Arena arena;
1195 auto sv = upb_Parse_String(formatted.data(), arena.ptr());
1196 EXPECT_EQ(strcmp(sv.data, expectation.data()), 0)
1197 << ": Incorrectly parsed " << formatted << ":\nGot "
1198 << DisplayHex(sv.data) << "\nExpected " << DisplayHex(expectation);
1199 }
1200
TEST(TokenizerHandlesUnicode, BMPCodes) {
  for (uint32_t cp = 0; cp < 0x10000; ++cp) {
    // The UTF-8 encoding of a lone surrogate as a single entity is not
    // defined, so skip both halves of the surrogate range.
    if (upb_Unicode_IsHigh(cp) || upb_Unicode_IsLow(cp)) continue;

    const std::string want = StandardUTF8(cp);

    // BMP code points may be written either as \u with four hex digits or
    // as \U with eight, in upper or lower case.
    ExpectFormat(want, absl::StrFormat("'\\u%04x'", cp));
    ExpectFormat(want, absl::StrFormat("'\\u%04X'", cp));
    ExpectFormat(want, absl::StrFormat("'\\U%08x'", cp));
    ExpectFormat(want, absl::StrFormat("'\\U%08X'", cp));
  }
}
1217
TEST(TokenizerHandlesUnicode, NonBMPCodes) {
  for (uint32_t cp = 0x10000; cp < 0x110000; ++cp) {
    const std::string want = StandardUTF8(cp);

    // Non-BMP code points may be written either as \U with eight hex
    // digits, or as a \u-escaped UTF-16 surrogate pair.
    ExpectFormat(want, absl::StrFormat("'\\U%08x'", cp));
    ExpectFormat(want, absl::StrFormat("'\\U%08X'", cp));
    ExpectFormat(want,
                 absl::StrFormat("'\\u%04x\\u%04x'", upb_Unicode_ToHigh(cp),
                                 upb_Unicode_ToLow(cp)));
  }
}
1231
1232 } // namespace
1233 } // namespace io
1234 } // namespace protobuf
1235 } // namespace google
1236