1 /*
2 __ _____ _____ _____
3 __| | __| | | | JSON for Modern C++ (test suite)
4 | | |__ | | | | | | version 3.9.1
5 |_____|_____|_____|_|___| https://github.com/nlohmann/json
6
7 Licensed under the MIT License <http://opensource.org/licenses/MIT>.
8 SPDX-License-Identifier: MIT
9 Copyright (c) 2013-2019 Niels Lohmann <http://nlohmann.me>.
10
11 Permission is hereby granted, free of charge, to any person obtaining a copy
12 of this software and associated documentation files (the "Software"), to deal
13 in the Software without restriction, including without limitation the rights
14 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
15 copies of the Software, and to permit persons to whom the Software is
16 furnished to do so, subject to the following conditions:
17
18 The above copyright notice and this permission notice shall be included in all
19 copies or substantial portions of the Software.
20
21 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
24 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
25 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
26 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
27 SOFTWARE.
28 */
29
30 #include "doctest_compatibility.h"
31
32 // for some reason including this after the json header leads to linker errors with VS 2017...
33 #include <locale>
34
35 #define private public
36 #include <nlohmann/json.hpp>
37 using nlohmann::json;
38 #undef private
39
40 #include <fstream>
41 #include <sstream>
42 #include <iostream>
43 #include <iomanip>
44 #include <test_data.hpp>
45
46 namespace
47 {
48 extern size_t calls;
49 size_t calls = 0;
50
51 void check_utf8dump(bool success_expected, int byte1, int byte2, int byte3, int byte4);
52
check_utf8dump(bool success_expected,int byte1,int byte2=-1,int byte3=-1,int byte4=-1)53 void check_utf8dump(bool success_expected, int byte1, int byte2 = -1, int byte3 = -1, int byte4 = -1)
54 {
55 std::string json_string;
56
57 CAPTURE(byte1)
58 CAPTURE(byte2)
59 CAPTURE(byte3)
60 CAPTURE(byte4)
61
62 json_string += std::string(1, static_cast<char>(byte1));
63
64 if (byte2 != -1)
65 {
66 json_string += std::string(1, static_cast<char>(byte2));
67 }
68
69 if (byte3 != -1)
70 {
71 json_string += std::string(1, static_cast<char>(byte3));
72 }
73
74 if (byte4 != -1)
75 {
76 json_string += std::string(1, static_cast<char>(byte4));
77 }
78
79 CAPTURE(json_string)
80
81 // store the string in a JSON value
82 json j = json_string;
83 json j2 = "abc" + json_string + "xyz";
84
85 // dumping with ignore/replace must not throw in any case
86 auto s_ignored = j.dump(-1, ' ', false, json::error_handler_t::ignore);
87 auto s_ignored2 = j2.dump(-1, ' ', false, json::error_handler_t::ignore);
88 auto s_ignored_ascii = j.dump(-1, ' ', true, json::error_handler_t::ignore);
89 auto s_ignored2_ascii = j2.dump(-1, ' ', true, json::error_handler_t::ignore);
90 auto s_replaced = j.dump(-1, ' ', false, json::error_handler_t::replace);
91 auto s_replaced2 = j2.dump(-1, ' ', false, json::error_handler_t::replace);
92 auto s_replaced_ascii = j.dump(-1, ' ', true, json::error_handler_t::replace);
93 auto s_replaced2_ascii = j2.dump(-1, ' ', true, json::error_handler_t::replace);
94
95 if (success_expected)
96 {
97 // strict mode must not throw if success is expected
98 auto s_strict = j.dump();
99 // all dumps should agree on the string
100 CHECK(s_strict == s_ignored);
101 CHECK(s_strict == s_replaced);
102 }
103 else
104 {
105 // strict mode must throw if success is not expected
106 CHECK_THROWS_AS(j.dump(), json::type_error&);
107 // ignore and replace must create different dumps
108 CHECK(s_ignored != s_replaced);
109
110 // check that replace string contains a replacement character
111 CHECK(s_replaced.find("\xEF\xBF\xBD") != std::string::npos);
112 }
113
114 // check that prefix and suffix are preserved
115 CHECK(s_ignored2.substr(1, 3) == "abc");
116 CHECK(s_ignored2.substr(s_ignored2.size() - 4, 3) == "xyz");
117 CHECK(s_ignored2_ascii.substr(1, 3) == "abc");
118 CHECK(s_ignored2_ascii.substr(s_ignored2_ascii.size() - 4, 3) == "xyz");
119 CHECK(s_replaced2.substr(1, 3) == "abc");
120 CHECK(s_replaced2.substr(s_replaced2.size() - 4, 3) == "xyz");
121 CHECK(s_replaced2_ascii.substr(1, 3) == "abc");
122 CHECK(s_replaced2_ascii.substr(s_replaced2_ascii.size() - 4, 3) == "xyz");
123 }
124
125 void check_utf8string(bool success_expected, int byte1, int byte2, int byte3, int byte4);
126
127 // create and check a JSON string with up to four UTF-8 bytes
check_utf8string(bool success_expected,int byte1,int byte2=-1,int byte3=-1,int byte4=-1)128 void check_utf8string(bool success_expected, int byte1, int byte2 = -1, int byte3 = -1, int byte4 = -1)
129 {
130 if (++calls % 100000 == 0)
131 {
132 std::cout << calls << " of 8860608 UTF-8 strings checked" << std::endl;
133 }
134
135 std::string json_string = "\"";
136
137 CAPTURE(byte1)
138 json_string += std::string(1, static_cast<char>(byte1));
139
140 if (byte2 != -1)
141 {
142 CAPTURE(byte2)
143 json_string += std::string(1, static_cast<char>(byte2));
144 }
145
146 if (byte3 != -1)
147 {
148 CAPTURE(byte3)
149 json_string += std::string(1, static_cast<char>(byte3));
150 }
151
152 if (byte4 != -1)
153 {
154 CAPTURE(byte4)
155 json_string += std::string(1, static_cast<char>(byte4));
156 }
157
158 json_string += "\"";
159
160 CAPTURE(json_string)
161
162 json _;
163 if (success_expected)
164 {
165 CHECK_NOTHROW(_ = json::parse(json_string));
166 }
167 else
168 {
169 CHECK_THROWS_AS(_ = json::parse(json_string), json::parse_error&);
170 }
171 }
172 }
173
skip()174 TEST_CASE("Unicode" * doctest::skip())
175 {
176 SECTION("RFC 3629")
177 {
178 /*
179 RFC 3629 describes in Sect. 4 the syntax of UTF-8 byte sequences as
180 follows:
181
182 A UTF-8 string is a sequence of octets representing a sequence of UCS
183 characters. An octet sequence is valid UTF-8 only if it matches the
184 following syntax, which is derived from the rules for encoding UTF-8
185 and is expressed in the ABNF of [RFC2234].
186
187 UTF8-octets = *( UTF8-char )
188 UTF8-char = UTF8-1 / UTF8-2 / UTF8-3 / UTF8-4
189 UTF8-1 = %x00-7F
190 UTF8-2 = %xC2-DF UTF8-tail
191 UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
192 %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
193 UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
194 %xF4 %x80-8F 2( UTF8-tail )
195 UTF8-tail = %x80-BF
196 */
197
198 SECTION("ill-formed first byte")
199 {
200 for (int byte1 = 0x80; byte1 <= 0xC1; ++byte1)
201 {
202 check_utf8string(false, byte1);
203 check_utf8dump(false, byte1);
204 }
205
206 for (int byte1 = 0xF5; byte1 <= 0xFF; ++byte1)
207 {
208 check_utf8string(false, byte1);
209 check_utf8dump(false, byte1);
210 }
211 }
212
213 SECTION("UTF8-1 (x00-x7F)")
214 {
215 SECTION("well-formed")
216 {
217 for (int byte1 = 0x00; byte1 <= 0x7F; ++byte1)
218 {
219 // unescaped control characters are parse errors in JSON
220 if (0x00 <= byte1 && byte1 <= 0x1F)
221 {
222 check_utf8string(false, byte1);
223 continue;
224 }
225
226 // a single quote is a parse error in JSON
227 if (byte1 == 0x22)
228 {
229 check_utf8string(false, byte1);
230 continue;
231 }
232
233 // a single backslash is a parse error in JSON
234 if (byte1 == 0x5C)
235 {
236 check_utf8string(false, byte1);
237 continue;
238 }
239
240 // all other characters are OK
241 check_utf8string(true, byte1);
242 check_utf8dump(true, byte1);
243 }
244 }
245 }
246
247 SECTION("UTF8-2 (xC2-xDF UTF8-tail)")
248 {
249 SECTION("well-formed")
250 {
251 for (int byte1 = 0xC2; byte1 <= 0xDF; ++byte1)
252 {
253 for (int byte2 = 0x80; byte2 <= 0xBF; ++byte2)
254 {
255 check_utf8string(true, byte1, byte2);
256 check_utf8dump(true, byte1, byte2);
257 }
258 }
259 }
260
261 SECTION("ill-formed: missing second byte")
262 {
263 for (int byte1 = 0xC2; byte1 <= 0xDF; ++byte1)
264 {
265 check_utf8string(false, byte1);
266 check_utf8dump(false, byte1);
267 }
268 }
269
270 SECTION("ill-formed: wrong second byte")
271 {
272 for (int byte1 = 0xC2; byte1 <= 0xDF; ++byte1)
273 {
274 for (int byte2 = 0x00; byte2 <= 0xFF; ++byte2)
275 {
276 // skip correct second byte
277 if (0x80 <= byte2 && byte2 <= 0xBF)
278 {
279 continue;
280 }
281
282 check_utf8string(false, byte1, byte2);
283 check_utf8dump(false, byte1, byte2);
284 }
285 }
286 }
287 }
288
289 SECTION("UTF8-3 (xE0 xA0-BF UTF8-tail)")
290 {
291 SECTION("well-formed")
292 {
293 for (int byte1 = 0xE0; byte1 <= 0xE0; ++byte1)
294 {
295 for (int byte2 = 0xA0; byte2 <= 0xBF; ++byte2)
296 {
297 for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
298 {
299 check_utf8string(true, byte1, byte2, byte3);
300 check_utf8dump(true, byte1, byte2, byte3);
301 }
302 }
303 }
304 }
305
306 SECTION("ill-formed: missing second byte")
307 {
308 for (int byte1 = 0xE0; byte1 <= 0xE0; ++byte1)
309 {
310 check_utf8string(false, byte1);
311 check_utf8dump(false, byte1);
312 }
313 }
314
315 SECTION("ill-formed: missing third byte")
316 {
317 for (int byte1 = 0xE0; byte1 <= 0xE0; ++byte1)
318 {
319 for (int byte2 = 0xA0; byte2 <= 0xBF; ++byte2)
320 {
321 check_utf8string(false, byte1, byte2);
322 check_utf8dump(false, byte1, byte2);
323 }
324 }
325 }
326
327 SECTION("ill-formed: wrong second byte")
328 {
329 for (int byte1 = 0xE0; byte1 <= 0xE0; ++byte1)
330 {
331 for (int byte2 = 0x00; byte2 <= 0xFF; ++byte2)
332 {
333 // skip correct second byte
334 if (0xA0 <= byte2 && byte2 <= 0xBF)
335 {
336 continue;
337 }
338
339 for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
340 {
341 check_utf8string(false, byte1, byte2, byte3);
342 check_utf8dump(false, byte1, byte2, byte3);
343 }
344 }
345 }
346 }
347
348 SECTION("ill-formed: wrong third byte")
349 {
350 for (int byte1 = 0xE0; byte1 <= 0xE0; ++byte1)
351 {
352 for (int byte2 = 0xA0; byte2 <= 0xBF; ++byte2)
353 {
354 for (int byte3 = 0x00; byte3 <= 0xFF; ++byte3)
355 {
356 // skip correct third byte
357 if (0x80 <= byte3 && byte3 <= 0xBF)
358 {
359 continue;
360 }
361
362 check_utf8string(false, byte1, byte2, byte3);
363 check_utf8dump(false, byte1, byte2, byte3);
364 }
365 }
366 }
367 }
368 }
369
370 SECTION("UTF8-3 (xE1-xEC UTF8-tail UTF8-tail)")
371 {
372 SECTION("well-formed")
373 {
374 for (int byte1 = 0xE1; byte1 <= 0xEC; ++byte1)
375 {
376 for (int byte2 = 0x80; byte2 <= 0xBF; ++byte2)
377 {
378 for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
379 {
380 check_utf8string(true, byte1, byte2, byte3);
381 check_utf8dump(true, byte1, byte2, byte3);
382 }
383 }
384 }
385 }
386
387 SECTION("ill-formed: missing second byte")
388 {
389 for (int byte1 = 0xE1; byte1 <= 0xEC; ++byte1)
390 {
391 check_utf8string(false, byte1);
392 check_utf8dump(false, byte1);
393 }
394 }
395
396 SECTION("ill-formed: missing third byte")
397 {
398 for (int byte1 = 0xE1; byte1 <= 0xEC; ++byte1)
399 {
400 for (int byte2 = 0x80; byte2 <= 0xBF; ++byte2)
401 {
402 check_utf8string(false, byte1, byte2);
403 check_utf8dump(false, byte1, byte2);
404 }
405 }
406 }
407
408 SECTION("ill-formed: wrong second byte")
409 {
410 for (int byte1 = 0xE1; byte1 <= 0xEC; ++byte1)
411 {
412 for (int byte2 = 0x00; byte2 <= 0xFF; ++byte2)
413 {
414 // skip correct second byte
415 if (0x80 <= byte2 && byte2 <= 0xBF)
416 {
417 continue;
418 }
419
420 for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
421 {
422 check_utf8string(false, byte1, byte2, byte3);
423 check_utf8dump(false, byte1, byte2, byte3);
424 }
425 }
426 }
427 }
428
429 SECTION("ill-formed: wrong third byte")
430 {
431 for (int byte1 = 0xE1; byte1 <= 0xEC; ++byte1)
432 {
433 for (int byte2 = 0x80; byte2 <= 0xBF; ++byte2)
434 {
435 for (int byte3 = 0x00; byte3 <= 0xFF; ++byte3)
436 {
437 // skip correct third byte
438 if (0x80 <= byte3 && byte3 <= 0xBF)
439 {
440 continue;
441 }
442
443 check_utf8string(false, byte1, byte2, byte3);
444 check_utf8dump(false, byte1, byte2, byte3);
445 }
446 }
447 }
448 }
449 }
450
451 SECTION("UTF8-3 (xED x80-9F UTF8-tail)")
452 {
453 SECTION("well-formed")
454 {
455 for (int byte1 = 0xED; byte1 <= 0xED; ++byte1)
456 {
457 for (int byte2 = 0x80; byte2 <= 0x9F; ++byte2)
458 {
459 for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
460 {
461 check_utf8string(true, byte1, byte2, byte3);
462 check_utf8dump(true, byte1, byte2, byte3);
463 }
464 }
465 }
466 }
467
468 SECTION("ill-formed: missing second byte")
469 {
470 for (int byte1 = 0xED; byte1 <= 0xED; ++byte1)
471 {
472 check_utf8string(false, byte1);
473 check_utf8dump(false, byte1);
474 }
475 }
476
477 SECTION("ill-formed: missing third byte")
478 {
479 for (int byte1 = 0xED; byte1 <= 0xED; ++byte1)
480 {
481 for (int byte2 = 0x80; byte2 <= 0x9F; ++byte2)
482 {
483 check_utf8string(false, byte1, byte2);
484 check_utf8dump(false, byte1, byte2);
485 }
486 }
487 }
488
489 SECTION("ill-formed: wrong second byte")
490 {
491 for (int byte1 = 0xED; byte1 <= 0xED; ++byte1)
492 {
493 for (int byte2 = 0x00; byte2 <= 0xFF; ++byte2)
494 {
495 // skip correct second byte
496 if (0x80 <= byte2 && byte2 <= 0x9F)
497 {
498 continue;
499 }
500
501 for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
502 {
503 check_utf8string(false, byte1, byte2, byte3);
504 check_utf8dump(false, byte1, byte2, byte3);
505 }
506 }
507 }
508 }
509
510 SECTION("ill-formed: wrong third byte")
511 {
512 for (int byte1 = 0xED; byte1 <= 0xED; ++byte1)
513 {
514 for (int byte2 = 0x80; byte2 <= 0x9F; ++byte2)
515 {
516 for (int byte3 = 0x00; byte3 <= 0xFF; ++byte3)
517 {
518 // skip correct third byte
519 if (0x80 <= byte3 && byte3 <= 0xBF)
520 {
521 continue;
522 }
523
524 check_utf8string(false, byte1, byte2, byte3);
525 check_utf8dump(false, byte1, byte2, byte3);
526 }
527 }
528 }
529 }
530 }
531
532 SECTION("UTF8-3 (xEE-xEF UTF8-tail UTF8-tail)")
533 {
534 SECTION("well-formed")
535 {
536 for (int byte1 = 0xEE; byte1 <= 0xEF; ++byte1)
537 {
538 for (int byte2 = 0x80; byte2 <= 0xBF; ++byte2)
539 {
540 for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
541 {
542 check_utf8string(true, byte1, byte2, byte3);
543 check_utf8dump(true, byte1, byte2, byte3);
544 }
545 }
546 }
547 }
548
549 SECTION("ill-formed: missing second byte")
550 {
551 for (int byte1 = 0xEE; byte1 <= 0xEF; ++byte1)
552 {
553 check_utf8string(false, byte1);
554 check_utf8dump(false, byte1);
555 }
556 }
557
558 SECTION("ill-formed: missing third byte")
559 {
560 for (int byte1 = 0xEE; byte1 <= 0xEF; ++byte1)
561 {
562 for (int byte2 = 0x80; byte2 <= 0xBF; ++byte2)
563 {
564 check_utf8string(false, byte1, byte2);
565 check_utf8dump(false, byte1, byte2);
566 }
567 }
568 }
569
570 SECTION("ill-formed: wrong second byte")
571 {
572 for (int byte1 = 0xEE; byte1 <= 0xEF; ++byte1)
573 {
574 for (int byte2 = 0x00; byte2 <= 0xFF; ++byte2)
575 {
576 // skip correct second byte
577 if (0x80 <= byte2 && byte2 <= 0xBF)
578 {
579 continue;
580 }
581
582 for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
583 {
584 check_utf8string(false, byte1, byte2, byte3);
585 check_utf8dump(false, byte1, byte2, byte3);
586 }
587 }
588 }
589 }
590
591 SECTION("ill-formed: wrong third byte")
592 {
593 for (int byte1 = 0xEE; byte1 <= 0xEF; ++byte1)
594 {
595 for (int byte2 = 0x80; byte2 <= 0xBF; ++byte2)
596 {
597 for (int byte3 = 0x00; byte3 <= 0xFF; ++byte3)
598 {
599 // skip correct third byte
600 if (0x80 <= byte3 && byte3 <= 0xBF)
601 {
602 continue;
603 }
604
605 check_utf8string(false, byte1, byte2, byte3);
606 check_utf8dump(false, byte1, byte2, byte3);
607 }
608 }
609 }
610 }
611 }
612
613 SECTION("UTF8-4 (xF0 x90-BF UTF8-tail UTF8-tail)")
614 {
615 SECTION("well-formed")
616 {
617 for (int byte1 = 0xF0; byte1 <= 0xF0; ++byte1)
618 {
619 for (int byte2 = 0x90; byte2 <= 0xBF; ++byte2)
620 {
621 for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
622 {
623 for (int byte4 = 0x80; byte4 <= 0xBF; ++byte4)
624 {
625 check_utf8string(true, byte1, byte2, byte3, byte4);
626 check_utf8dump(true, byte1, byte2, byte3, byte4);
627 }
628 }
629 }
630 }
631 }
632
633 SECTION("ill-formed: missing second byte")
634 {
635 for (int byte1 = 0xF0; byte1 <= 0xF0; ++byte1)
636 {
637 check_utf8string(false, byte1);
638 check_utf8dump(false, byte1);
639 }
640 }
641
642 SECTION("ill-formed: missing third byte")
643 {
644 for (int byte1 = 0xF0; byte1 <= 0xF0; ++byte1)
645 {
646 for (int byte2 = 0x90; byte2 <= 0xBF; ++byte2)
647 {
648 check_utf8string(false, byte1, byte2);
649 check_utf8dump(false, byte1, byte2);
650 }
651 }
652 }
653
654 SECTION("ill-formed: missing fourth byte")
655 {
656 for (int byte1 = 0xF0; byte1 <= 0xF0; ++byte1)
657 {
658 for (int byte2 = 0x90; byte2 <= 0xBF; ++byte2)
659 {
660 for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
661 {
662 check_utf8string(false, byte1, byte2, byte3);
663 check_utf8dump(false, byte1, byte2, byte3);
664 }
665 }
666 }
667 }
668
669 SECTION("ill-formed: wrong second byte")
670 {
671 for (int byte1 = 0xF0; byte1 <= 0xF0; ++byte1)
672 {
673 for (int byte2 = 0x00; byte2 <= 0xFF; ++byte2)
674 {
675 // skip correct second byte
676 if (0x90 <= byte2 && byte2 <= 0xBF)
677 {
678 continue;
679 }
680
681 for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
682 {
683 for (int byte4 = 0x80; byte4 <= 0xBF; ++byte4)
684 {
685 check_utf8string(false, byte1, byte2, byte3, byte4);
686 check_utf8dump(false, byte1, byte2, byte3, byte4);
687 }
688 }
689 }
690 }
691 }
692
693 SECTION("ill-formed: wrong third byte")
694 {
695 for (int byte1 = 0xF0; byte1 <= 0xF0; ++byte1)
696 {
697 for (int byte2 = 0x90; byte2 <= 0xBF; ++byte2)
698 {
699 for (int byte3 = 0x00; byte3 <= 0xFF; ++byte3)
700 {
701 // skip correct third byte
702 if (0x80 <= byte3 && byte3 <= 0xBF)
703 {
704 continue;
705 }
706
707 for (int byte4 = 0x80; byte4 <= 0xBF; ++byte4)
708 {
709 check_utf8string(false, byte1, byte2, byte3, byte4);
710 check_utf8dump(false, byte1, byte2, byte3, byte4);
711 }
712 }
713 }
714 }
715 }
716
717 SECTION("ill-formed: wrong fourth byte")
718 {
719 for (int byte1 = 0xF0; byte1 <= 0xF0; ++byte1)
720 {
721 for (int byte2 = 0x90; byte2 <= 0xBF; ++byte2)
722 {
723 for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
724 {
725 for (int byte4 = 0x00; byte4 <= 0xFF; ++byte4)
726 {
727 // skip fourth second byte
728 if (0x80 <= byte3 && byte3 <= 0xBF)
729 {
730 continue;
731 }
732
733 check_utf8string(false, byte1, byte2, byte3, byte4);
734 check_utf8dump(false, byte1, byte2, byte3, byte4);
735 }
736 }
737 }
738 }
739 }
740 }
741
742 SECTION("UTF8-4 (xF1-F3 UTF8-tail UTF8-tail UTF8-tail)")
743 {
744 SECTION("well-formed")
745 {
746 for (int byte1 = 0xF1; byte1 <= 0xF3; ++byte1)
747 {
748 for (int byte2 = 0x80; byte2 <= 0xBF; ++byte2)
749 {
750 for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
751 {
752 for (int byte4 = 0x80; byte4 <= 0xBF; ++byte4)
753 {
754 check_utf8string(true, byte1, byte2, byte3, byte4);
755 check_utf8dump(true, byte1, byte2, byte3, byte4);
756 }
757 }
758 }
759 }
760 }
761
762 SECTION("ill-formed: missing second byte")
763 {
764 for (int byte1 = 0xF1; byte1 <= 0xF3; ++byte1)
765 {
766 check_utf8string(false, byte1);
767 check_utf8dump(false, byte1);
768 }
769 }
770
771 SECTION("ill-formed: missing third byte")
772 {
773 for (int byte1 = 0xF1; byte1 <= 0xF3; ++byte1)
774 {
775 for (int byte2 = 0x80; byte2 <= 0xBF; ++byte2)
776 {
777 check_utf8string(false, byte1, byte2);
778 check_utf8dump(false, byte1, byte2);
779 }
780 }
781 }
782
783 SECTION("ill-formed: missing fourth byte")
784 {
785 for (int byte1 = 0xF1; byte1 <= 0xF3; ++byte1)
786 {
787 for (int byte2 = 0x80; byte2 <= 0xBF; ++byte2)
788 {
789 for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
790 {
791 check_utf8string(false, byte1, byte2, byte3);
792 check_utf8dump(false, byte1, byte2, byte3);
793 }
794 }
795 }
796 }
797
798 SECTION("ill-formed: wrong second byte")
799 {
800 for (int byte1 = 0xF1; byte1 <= 0xF3; ++byte1)
801 {
802 for (int byte2 = 0x00; byte2 <= 0xFF; ++byte2)
803 {
804 // skip correct second byte
805 if (0x80 <= byte2 && byte2 <= 0xBF)
806 {
807 continue;
808 }
809
810 for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
811 {
812 for (int byte4 = 0x80; byte4 <= 0xBF; ++byte4)
813 {
814 check_utf8string(false, byte1, byte2, byte3, byte4);
815 check_utf8dump(false, byte1, byte2, byte3, byte4);
816 }
817 }
818 }
819 }
820 }
821
822 SECTION("ill-formed: wrong third byte")
823 {
824 for (int byte1 = 0xF1; byte1 <= 0xF3; ++byte1)
825 {
826 for (int byte2 = 0x80; byte2 <= 0xBF; ++byte2)
827 {
828 for (int byte3 = 0x00; byte3 <= 0xFF; ++byte3)
829 {
830 // skip correct third byte
831 if (0x80 <= byte3 && byte3 <= 0xBF)
832 {
833 continue;
834 }
835
836 for (int byte4 = 0x80; byte4 <= 0xBF; ++byte4)
837 {
838 check_utf8string(false, byte1, byte2, byte3, byte4);
839 check_utf8dump(false, byte1, byte2, byte3, byte4);
840 }
841 }
842 }
843 }
844 }
845
846 SECTION("ill-formed: wrong fourth byte")
847 {
848 for (int byte1 = 0xF1; byte1 <= 0xF3; ++byte1)
849 {
850 for (int byte2 = 0x80; byte2 <= 0xBF; ++byte2)
851 {
852 for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
853 {
854 for (int byte4 = 0x00; byte4 <= 0xFF; ++byte4)
855 {
856 // skip correct fourth byte
857 if (0x80 <= byte3 && byte3 <= 0xBF)
858 {
859 continue;
860 }
861
862 check_utf8string(false, byte1, byte2, byte3, byte4);
863 check_utf8dump(false, byte1, byte2, byte3, byte4);
864 }
865 }
866 }
867 }
868 }
869 }
870
871 SECTION("UTF8-4 (xF4 x80-8F UTF8-tail UTF8-tail)")
872 {
873 SECTION("well-formed")
874 {
875 for (int byte1 = 0xF4; byte1 <= 0xF4; ++byte1)
876 {
877 for (int byte2 = 0x80; byte2 <= 0x8F; ++byte2)
878 {
879 for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
880 {
881 for (int byte4 = 0x80; byte4 <= 0xBF; ++byte4)
882 {
883 check_utf8string(true, byte1, byte2, byte3, byte4);
884 check_utf8dump(true, byte1, byte2, byte3, byte4);
885 }
886 }
887 }
888 }
889 }
890
891 SECTION("ill-formed: missing second byte")
892 {
893 for (int byte1 = 0xF4; byte1 <= 0xF4; ++byte1)
894 {
895 check_utf8string(false, byte1);
896 check_utf8dump(false, byte1);
897 }
898 }
899
900 SECTION("ill-formed: missing third byte")
901 {
902 for (int byte1 = 0xF4; byte1 <= 0xF4; ++byte1)
903 {
904 for (int byte2 = 0x80; byte2 <= 0x8F; ++byte2)
905 {
906 check_utf8string(false, byte1, byte2);
907 check_utf8dump(false, byte1, byte2);
908 }
909 }
910 }
911
912 SECTION("ill-formed: missing fourth byte")
913 {
914 for (int byte1 = 0xF4; byte1 <= 0xF4; ++byte1)
915 {
916 for (int byte2 = 0x80; byte2 <= 0x8F; ++byte2)
917 {
918 for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
919 {
920 check_utf8string(false, byte1, byte2, byte3);
921 check_utf8dump(false, byte1, byte2, byte3);
922 }
923 }
924 }
925 }
926
927 SECTION("ill-formed: wrong second byte")
928 {
929 for (int byte1 = 0xF4; byte1 <= 0xF4; ++byte1)
930 {
931 for (int byte2 = 0x00; byte2 <= 0xFF; ++byte2)
932 {
933 // skip correct second byte
934 if (0x80 <= byte2 && byte2 <= 0x8F)
935 {
936 continue;
937 }
938
939 for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
940 {
941 for (int byte4 = 0x80; byte4 <= 0xBF; ++byte4)
942 {
943 check_utf8string(false, byte1, byte2, byte3, byte4);
944 check_utf8dump(false, byte1, byte2, byte3, byte4);
945 }
946 }
947 }
948 }
949 }
950
951 SECTION("ill-formed: wrong third byte")
952 {
953 for (int byte1 = 0xF4; byte1 <= 0xF4; ++byte1)
954 {
955 for (int byte2 = 0x80; byte2 <= 0x8F; ++byte2)
956 {
957 for (int byte3 = 0x00; byte3 <= 0xFF; ++byte3)
958 {
959 // skip correct third byte
960 if (0x80 <= byte3 && byte3 <= 0xBF)
961 {
962 continue;
963 }
964
965 for (int byte4 = 0x80; byte4 <= 0xBF; ++byte4)
966 {
967 check_utf8string(false, byte1, byte2, byte3, byte4);
968 check_utf8dump(false, byte1, byte2, byte3, byte4);
969 }
970 }
971 }
972 }
973 }
974
975 SECTION("ill-formed: wrong fourth byte")
976 {
977 for (int byte1 = 0xF4; byte1 <= 0xF4; ++byte1)
978 {
979 for (int byte2 = 0x80; byte2 <= 0x8F; ++byte2)
980 {
981 for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
982 {
983 for (int byte4 = 0x00; byte4 <= 0xFF; ++byte4)
984 {
985 // skip correct fourth byte
986 if (0x80 <= byte3 && byte3 <= 0xBF)
987 {
988 continue;
989 }
990
991 check_utf8string(false, byte1, byte2, byte3, byte4);
992 check_utf8dump(false, byte1, byte2, byte3, byte4);
993 }
994 }
995 }
996 }
997 }
998 }
999 }
1000
1001 SECTION("\\uxxxx sequences")
1002 {
1003 // create an escaped string from a code point
1004 const auto codepoint_to_unicode = [](std::size_t cp)
1005 {
1006 // code points are represented as a six-character sequence: a
1007 // reverse solidus, followed by the lowercase letter u, followed
1008 // by four hexadecimal digits that encode the character's code
1009 // point
1010 std::stringstream ss;
1011 ss << "\\u" << std::setw(4) << std::setfill('0') << std::hex << cp;
1012 return ss.str();
1013 };
1014
1015 SECTION("correct sequences")
1016 {
1017 // generate all UTF-8 code points; in total, 1112064 code points are
1018 // generated: 0x1FFFFF code points - 2048 invalid values between
1019 // 0xD800 and 0xDFFF.
1020 for (std::size_t cp = 0; cp <= 0x10FFFFu; ++cp)
1021 {
1022 // string to store the code point as in \uxxxx format
1023 std::string json_text = "\"";
1024
1025 // decide whether to use one or two \uxxxx sequences
1026 if (cp < 0x10000u)
1027 {
1028 // The Unicode standard permanently reserves these code point
1029 // values for UTF-16 encoding of the high and low surrogates, and
1030 // they will never be assigned a character, so there should be no
1031 // reason to encode them. The official Unicode standard says that
1032 // no UTF forms, including UTF-16, can encode these code points.
1033 if (cp >= 0xD800u && cp <= 0xDFFFu)
1034 {
1035 // if we would not skip these code points, we would get a
1036 // "missing low surrogate" exception
1037 continue;
1038 }
1039
1040 // code points in the Basic Multilingual Plane can be
1041 // represented with one \uxxxx sequence
1042 json_text += codepoint_to_unicode(cp);
1043 }
1044 else
1045 {
1046 // To escape an extended character that is not in the Basic
1047 // Multilingual Plane, the character is represented as a
1048 // 12-character sequence, encoding the UTF-16 surrogate pair
1049 const auto codepoint1 = 0xd800u + (((cp - 0x10000u) >> 10) & 0x3ffu);
1050 const auto codepoint2 = 0xdc00u + ((cp - 0x10000u) & 0x3ffu);
1051 json_text += codepoint_to_unicode(codepoint1) + codepoint_to_unicode(codepoint2);
1052 }
1053
1054 json_text += "\"";
1055 CAPTURE(json_text)
1056 json _;
1057 CHECK_NOTHROW(_ = json::parse(json_text));
1058 }
1059 }
1060
1061 SECTION("incorrect sequences")
1062 {
1063 SECTION("incorrect surrogate values")
1064 {
1065 json _;
1066
1067 CHECK_THROWS_AS(_ = json::parse("\"\\uDC00\\uDC00\""), json::parse_error&);
1068 CHECK_THROWS_WITH(_ = json::parse("\"\\uDC00\\uDC00\""),
1069 "[json.exception.parse_error.101] parse error at line 1, column 7: syntax error while parsing value - invalid string: surrogate U+DC00..U+DFFF must follow U+D800..U+DBFF; last read: '\"\\uDC00'");
1070
1071 CHECK_THROWS_AS(_ = json::parse("\"\\uD7FF\\uDC00\""), json::parse_error&);
1072 CHECK_THROWS_WITH(_ = json::parse("\"\\uD7FF\\uDC00\""),
1073 "[json.exception.parse_error.101] parse error at line 1, column 13: syntax error while parsing value - invalid string: surrogate U+DC00..U+DFFF must follow U+D800..U+DBFF; last read: '\"\\uD7FF\\uDC00'");
1074
1075 CHECK_THROWS_AS(_ = json::parse("\"\\uD800]\""), json::parse_error&);
1076 CHECK_THROWS_WITH(_ = json::parse("\"\\uD800]\""),
1077 "[json.exception.parse_error.101] parse error at line 1, column 8: syntax error while parsing value - invalid string: surrogate U+D800..U+DBFF must be followed by U+DC00..U+DFFF; last read: '\"\\uD800]'");
1078
1079 CHECK_THROWS_AS(_ = json::parse("\"\\uD800\\v\""), json::parse_error&);
1080 CHECK_THROWS_WITH(_ = json::parse("\"\\uD800\\v\""),
1081 "[json.exception.parse_error.101] parse error at line 1, column 9: syntax error while parsing value - invalid string: surrogate U+D800..U+DBFF must be followed by U+DC00..U+DFFF; last read: '\"\\uD800\\v'");
1082
1083 CHECK_THROWS_AS(_ = json::parse("\"\\uD800\\u123\""), json::parse_error&);
1084 CHECK_THROWS_WITH(_ = json::parse("\"\\uD800\\u123\""),
1085 "[json.exception.parse_error.101] parse error at line 1, column 13: syntax error while parsing value - invalid string: '\\u' must be followed by 4 hex digits; last read: '\"\\uD800\\u123\"'");
1086
1087 CHECK_THROWS_AS(_ = json::parse("\"\\uD800\\uDBFF\""), json::parse_error&);
1088 CHECK_THROWS_WITH(_ = json::parse("\"\\uD800\\uDBFF\""),
1089 "[json.exception.parse_error.101] parse error at line 1, column 13: syntax error while parsing value - invalid string: surrogate U+D800..U+DBFF must be followed by U+DC00..U+DFFF; last read: '\"\\uD800\\uDBFF'");
1090
1091 CHECK_THROWS_AS(_ = json::parse("\"\\uD800\\uE000\""), json::parse_error&);
1092 CHECK_THROWS_WITH(_ = json::parse("\"\\uD800\\uE000\""),
1093 "[json.exception.parse_error.101] parse error at line 1, column 13: syntax error while parsing value - invalid string: surrogate U+D800..U+DBFF must be followed by U+DC00..U+DFFF; last read: '\"\\uD800\\uE000'");
1094 }
1095 }
1096
1097 #if 0
1098 SECTION("incorrect sequences")
1099 {
1100 SECTION("high surrogate without low surrogate")
1101 {
1102 // D800..DBFF are high surrogates and must be followed by low
1103 // surrogates DC00..DFFF; here, nothing follows
1104 for (std::size_t cp = 0xD800u; cp <= 0xDBFFu; ++cp)
1105 {
1106 std::string json_text = "\"" + codepoint_to_unicode(cp) + "\"";
1107 CAPTURE(json_text)
1108 CHECK_THROWS_AS(json::parse(json_text), json::parse_error&);
1109 }
1110 }
1111
1112 SECTION("high surrogate with wrong low surrogate")
1113 {
1114 // D800..DBFF are high surrogates and must be followed by low
1115 // surrogates DC00..DFFF; here a different sequence follows
1116 for (std::size_t cp1 = 0xD800u; cp1 <= 0xDBFFu; ++cp1)
1117 {
1118 for (std::size_t cp2 = 0x0000u; cp2 <= 0xFFFFu; ++cp2)
1119 {
1120 if (0xDC00u <= cp2 && cp2 <= 0xDFFFu)
1121 {
1122 continue;
1123 }
1124
1125 std::string json_text = "\"" + codepoint_to_unicode(cp1) + codepoint_to_unicode(cp2) + "\"";
1126 CAPTURE(json_text)
1127 CHECK_THROWS_AS(json::parse(json_text), json::parse_error&);
1128 }
1129 }
1130 }
1131
1132 SECTION("low surrogate without high surrogate")
1133 {
1134 // low surrogates DC00..DFFF must follow high surrogates; here,
1135 // they occur alone
1136 for (std::size_t cp = 0xDC00u; cp <= 0xDFFFu; ++cp)
1137 {
1138 std::string json_text = "\"" + codepoint_to_unicode(cp) + "\"";
1139 CAPTURE(json_text)
1140 CHECK_THROWS_AS(json::parse(json_text), json::parse_error&);
1141 }
1142 }
1143
1144 }
1145 #endif
1146 }
1147
1148 SECTION("read all unicode characters")
1149 {
1150 // read a file with all unicode characters stored as single-character
1151 // strings in a JSON array
1152 std::ifstream f(TEST_DATA_DIRECTORY "/json_nlohmann_tests/all_unicode.json");
1153 json j;
1154 CHECK_NOTHROW(f >> j);
1155
1156 // the array has 1112064 + 1 elements (a terminating "null" value)
1157 // Note: 1112064 = 0x1FFFFF code points - 2048 invalid values between
1158 // 0xD800 and 0xDFFF.
1159 CHECK(j.size() == 1112065);
1160
1161 SECTION("check JSON Pointers")
1162 {
1163 for (auto s : j)
1164 {
1165 // skip non-string JSON values
1166 if (!s.is_string())
1167 {
1168 continue;
1169 }
1170
1171 auto ptr = s.get<std::string>();
1172
1173 // tilde must be followed by 0 or 1
1174 if (ptr == "~")
1175 {
1176 ptr += "0";
1177 }
1178
1179 // JSON Pointers must begin with "/"
1180 ptr = "/" + ptr;
1181
1182 CHECK_NOTHROW(json::json_pointer("/" + ptr));
1183
1184 // check escape/unescape roundtrip
1185 auto escaped = json::json_pointer::escape(ptr);
1186 json::json_pointer::unescape(escaped);
1187 CHECK(escaped == ptr);
1188 }
1189 }
1190 }
1191
1192 SECTION("ignore byte-order-mark")
1193 {
1194 SECTION("in a stream")
1195 {
1196 // read a file with a UTF-8 BOM
1197 std::ifstream f(TEST_DATA_DIRECTORY "/json_nlohmann_tests/bom.json");
1198 json j;
1199 CHECK_NOTHROW(f >> j);
1200 }
1201
1202 SECTION("with an iterator")
1203 {
1204 std::string i = "\xef\xbb\xbf{\n \"foo\": true\n}";
1205 CHECK_NOTHROW(json::parse(i.begin(), i.end()));
1206 }
1207 }
1208
1209 SECTION("error for incomplete/wrong BOM")
1210 {
1211 json _;
1212 CHECK_THROWS_AS(_ = json::parse("\xef\xbb"), json::parse_error&);
1213 CHECK_THROWS_AS(_ = json::parse("\xef\xbb\xbb"), json::parse_error&);
1214 }
1215 }
1216
1217 namespace
1218 {
1219 void roundtrip(bool success_expected, const std::string& s);
1220
roundtrip(bool success_expected,const std::string & s)1221 void roundtrip(bool success_expected, const std::string& s)
1222 {
1223 CAPTURE(s)
1224 json _;
1225
1226 // create JSON string value
1227 json j = s;
1228 // create JSON text
1229 std::string ps = std::string("\"") + s + "\"";
1230
1231 if (success_expected)
1232 {
1233 // serialization succeeds
1234 CHECK_NOTHROW(j.dump());
1235
1236 // exclude parse test for U+0000
1237 if (s[0] != '\0')
1238 {
1239 // parsing JSON text succeeds
1240 CHECK_NOTHROW(_ = json::parse(ps));
1241 }
1242
1243 // roundtrip succeeds
1244 CHECK_NOTHROW(_ = json::parse(j.dump()));
1245
1246 // after roundtrip, the same string is stored
1247 json jr = json::parse(j.dump());
1248 CHECK(jr.get<std::string>() == s);
1249 }
1250 else
1251 {
1252 // serialization fails
1253 CHECK_THROWS_AS(j.dump(), json::type_error&);
1254
1255 // parsing JSON text fails
1256 CHECK_THROWS_AS(_ = json::parse(ps), json::parse_error&);
1257 }
1258 }
1259 }
1260
1261 TEST_CASE("Markus Kuhn's UTF-8 decoder capability and stress test")
1262 {
1263 // Markus Kuhn <http://www.cl.cam.ac.uk/~mgk25/> - 2015-08-28 - CC BY 4.0
1264 // http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
1265
1266 SECTION("1 Some correct UTF-8 text")
1267 {
1268 roundtrip(true, "κόσμε");
1269 }
1270
1271 SECTION("2 Boundary condition test cases")
1272 {
1273 SECTION("2.1 First possible sequence of a certain length")
1274 {
1275 // 2.1.1 1 byte (U-00000000)
1276 roundtrip(true, std::string("\0", 1));
1277 // 2.1.2 2 bytes (U-00000080)
1278 roundtrip(true, "\xc2\x80");
1279 // 2.1.3 3 bytes (U-00000800)
1280 roundtrip(true, "\xe0\xa0\x80");
1281 // 2.1.4 4 bytes (U-00010000)
1282 roundtrip(true, "\xf0\x90\x80\x80");
1283
1284 // 2.1.5 5 bytes (U-00200000)
1285 roundtrip(false, "\xF8\x88\x80\x80\x80");
1286 // 2.1.6 6 bytes (U-04000000)
1287 roundtrip(false, "\xFC\x84\x80\x80\x80\x80");
1288 }
1289
1290 SECTION("2.2 Last possible sequence of a certain length")
1291 {
1292 // 2.2.1 1 byte (U-0000007F)
1293 roundtrip(true, "\x7f");
1294 // 2.2.2 2 bytes (U-000007FF)
1295 roundtrip(true, "\xdf\xbf");
1296 // 2.2.3 3 bytes (U-0000FFFF)
1297 roundtrip(true, "\xef\xbf\xbf");
1298
1299 // 2.2.4 4 bytes (U-001FFFFF)
1300 roundtrip(false, "\xF7\xBF\xBF\xBF");
1301 // 2.2.5 5 bytes (U-03FFFFFF)
1302 roundtrip(false, "\xFB\xBF\xBF\xBF\xBF");
1303 // 2.2.6 6 bytes (U-7FFFFFFF)
1304 roundtrip(false, "\xFD\xBF\xBF\xBF\xBF\xBF");
1305 }
1306
1307 SECTION("2.3 Other boundary conditions")
1308 {
1309 // 2.3.1 U-0000D7FF = ed 9f bf
1310 roundtrip(true, "\xed\x9f\xbf");
1311 // 2.3.2 U-0000E000 = ee 80 80
1312 roundtrip(true, "\xee\x80\x80");
1313 // 2.3.3 U-0000FFFD = ef bf bd
1314 roundtrip(true, "\xef\xbf\xbd");
1315 // 2.3.4 U-0010FFFF = f4 8f bf bf
1316 roundtrip(true, "\xf4\x8f\xbf\xbf");
1317
1318 // 2.3.5 U-00110000 = f4 90 80 80
1319 roundtrip(false, "\xf4\x90\x80\x80");
1320 }
1321 }
1322
1323 SECTION("3 Malformed sequences")
1324 {
1325 SECTION("3.1 Unexpected continuation bytes")
1326 {
1327 // Each unexpected continuation byte should be separately signalled as a
1328 // malformed sequence of its own.
1329
1330 // 3.1.1 First continuation byte 0x80
1331 roundtrip(false, "\x80");
1332 // 3.1.2 Last continuation byte 0xbf
1333 roundtrip(false, "\xbf");
1334
1335 // 3.1.3 2 continuation bytes
1336 roundtrip(false, "\x80\xbf");
1337 // 3.1.4 3 continuation bytes
1338 roundtrip(false, "\x80\xbf\x80");
1339 // 3.1.5 4 continuation bytes
1340 roundtrip(false, "\x80\xbf\x80\xbf");
1341 // 3.1.6 5 continuation bytes
1342 roundtrip(false, "\x80\xbf\x80\xbf\x80");
1343 // 3.1.7 6 continuation bytes
1344 roundtrip(false, "\x80\xbf\x80\xbf\x80\xbf");
1345 // 3.1.8 7 continuation bytes
1346 roundtrip(false, "\x80\xbf\x80\xbf\x80\xbf\x80");
1347
1348 // 3.1.9 Sequence of all 64 possible continuation bytes (0x80-0xbf)
1349 roundtrip(false, "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf");
1350 }
1351
1352 SECTION("3.2 Lonely start characters")
1353 {
1354 // 3.2.1 All 32 first bytes of 2-byte sequences (0xc0-0xdf)
1355 roundtrip(false, "\xc0 \xc1 \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf");
1356 // 3.2.2 All 16 first bytes of 3-byte sequences (0xe0-0xef)
1357 roundtrip(false, "\xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee \xef");
1358 // 3.2.3 All 8 first bytes of 4-byte sequences (0xf0-0xf7)
1359 roundtrip(false, "\xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7");
1360 // 3.2.4 All 4 first bytes of 5-byte sequences (0xf8-0xfb)
1361 roundtrip(false, "\xf8 \xf9 \xfa \xfb");
1362 // 3.2.5 All 2 first bytes of 6-byte sequences (0xfc-0xfd)
1363 roundtrip(false, "\xfc \xfd");
1364 }
1365
1366 SECTION("3.3 Sequences with last continuation byte missing")
1367 {
1368 // All bytes of an incomplete sequence should be signalled as a single
1369 // malformed sequence, i.e., you should see only a single replacement
1370 // character in each of the next 10 tests. (Characters as in section 2)
1371
1372 // 3.3.1 2-byte sequence with last byte missing (U+0000)
1373 roundtrip(false, "\xc0");
1374 // 3.3.2 3-byte sequence with last byte missing (U+0000)
1375 roundtrip(false, "\xe0\x80");
1376 // 3.3.3 4-byte sequence with last byte missing (U+0000)
1377 roundtrip(false, "\xf0\x80\x80");
1378 // 3.3.4 5-byte sequence with last byte missing (U+0000)
1379 roundtrip(false, "\xf8\x80\x80\x80");
1380 // 3.3.5 6-byte sequence with last byte missing (U+0000)
1381 roundtrip(false, "\xfc\x80\x80\x80\x80");
1382 // 3.3.6 2-byte sequence with last byte missing (U-000007FF)
1383 roundtrip(false, "\xdf");
1384 // 3.3.7 3-byte sequence with last byte missing (U-0000FFFF)
1385 roundtrip(false, "\xef\xbf");
1386 // 3.3.8 4-byte sequence with last byte missing (U-001FFFFF)
1387 roundtrip(false, "\xf7\xbf\xbf");
1388 // 3.3.9 5-byte sequence with last byte missing (U-03FFFFFF)
1389 roundtrip(false, "\xfb\xbf\xbf\xbf");
1390 // 3.3.10 6-byte sequence with last byte missing (U-7FFFFFFF)
1391 roundtrip(false, "\xfd\xbf\xbf\xbf\xbf");
1392 }
1393
1394 SECTION("3.4 Concatenation of incomplete sequences")
1395 {
1396 // All the 10 sequences of 3.3 concatenated, you should see 10 malformed
1397 // sequences being signalled:
1398 roundtrip(false, "\xc0\xe0\x80\xf0\x80\x80\xf8\x80\x80\x80\xfc\x80\x80\x80\x80\xdf\xef\xbf\xf7\xbf\xbf\xfb\xbf\xbf\xbf\xfd\xbf\xbf\xbf\xbf");
1399 }
1400
1401 SECTION("3.5 Impossible bytes")
1402 {
1403 // The following two bytes cannot appear in a correct UTF-8 string
1404
1405 // 3.5.1 fe
1406 roundtrip(false, "\xfe");
1407 // 3.5.2 ff
1408 roundtrip(false, "\xff");
1409 // 3.5.3 fe fe ff ff
1410 roundtrip(false, "\xfe\xfe\xff\xff");
1411 }
1412 }
1413
1414 SECTION("4 Overlong sequences")
1415 {
1416 // The following sequences are not malformed according to the letter of
1417 // the Unicode 2.0 standard. However, they are longer then necessary and
1418 // a correct UTF-8 encoder is not allowed to produce them. A "safe UTF-8
1419 // decoder" should reject them just like malformed sequences for two
1420 // reasons: (1) It helps to debug applications if overlong sequences are
1421 // not treated as valid representations of characters, because this helps
1422 // to spot problems more quickly. (2) Overlong sequences provide
1423 // alternative representations of characters, that could maliciously be
1424 // used to bypass filters that check only for ASCII characters. For
1425 // instance, a 2-byte encoded line feed (LF) would not be caught by a
1426 // line counter that counts only 0x0a bytes, but it would still be
1427 // processed as a line feed by an unsafe UTF-8 decoder later in the
1428 // pipeline. From a security point of view, ASCII compatibility of UTF-8
1429 // sequences means also, that ASCII characters are *only* allowed to be
1430 // represented by ASCII bytes in the range 0x00-0x7f. To ensure this
1431 // aspect of ASCII compatibility, use only "safe UTF-8 decoders" that
1432 // reject overlong UTF-8 sequences for which a shorter encoding exists.
1433
1434 SECTION("4.1 Examples of an overlong ASCII character")
1435 {
1436 // With a safe UTF-8 decoder, all of the following five overlong
1437 // representations of the ASCII character slash ("/") should be rejected
1438 // like a malformed UTF-8 sequence, for instance by substituting it with
1439 // a replacement character. If you see a slash below, you do not have a
1440 // safe UTF-8 decoder!
1441
1442 // 4.1.1 U+002F = c0 af
1443 roundtrip(false, "\xc0\xaf");
1444 // 4.1.2 U+002F = e0 80 af
1445 roundtrip(false, "\xe0\x80\xaf");
1446 // 4.1.3 U+002F = f0 80 80 af
1447 roundtrip(false, "\xf0\x80\x80\xaf");
1448 // 4.1.4 U+002F = f8 80 80 80 af
1449 roundtrip(false, "\xf8\x80\x80\x80\xaf");
1450 // 4.1.5 U+002F = fc 80 80 80 80 af
1451 roundtrip(false, "\xfc\x80\x80\x80\x80\xaf");
1452 }
1453
1454 SECTION("4.2 Maximum overlong sequences")
1455 {
1456 // Below you see the highest Unicode value that is still resulting in an
1457 // overlong sequence if represented with the given number of bytes. This
1458 // is a boundary test for safe UTF-8 decoders. All five characters should
1459 // be rejected like malformed UTF-8 sequences.
1460
1461 // 4.2.1 U-0000007F = c1 bf
1462 roundtrip(false, "\xc1\xbf");
1463 // 4.2.2 U-000007FF = e0 9f bf
1464 roundtrip(false, "\xe0\x9f\xbf");
1465 // 4.2.3 U-0000FFFF = f0 8f bf bf
1466 roundtrip(false, "\xf0\x8f\xbf\xbf");
1467 // 4.2.4 U-001FFFFF = f8 87 bf bf bf
1468 roundtrip(false, "\xf8\x87\xbf\xbf\xbf");
1469 // 4.2.5 U-03FFFFFF = fc 83 bf bf bf bf
1470 roundtrip(false, "\xfc\x83\xbf\xbf\xbf\xbf");
1471 }
1472
1473 SECTION("4.3 Overlong representation of the NUL character")
1474 {
1475 // The following five sequences should also be rejected like malformed
1476 // UTF-8 sequences and should not be treated like the ASCII NUL
1477 // character.
1478
1479 // 4.3.1 U+0000 = c0 80
1480 roundtrip(false, "\xc0\x80");
1481 // 4.3.2 U+0000 = e0 80 80
1482 roundtrip(false, "\xe0\x80\x80");
1483 // 4.3.3 U+0000 = f0 80 80 80
1484 roundtrip(false, "\xf0\x80\x80\x80");
1485 // 4.3.4 U+0000 = f8 80 80 80 80
1486 roundtrip(false, "\xf8\x80\x80\x80\x80");
1487 // 4.3.5 U+0000 = fc 80 80 80 80 80
1488 roundtrip(false, "\xfc\x80\x80\x80\x80\x80");
1489 }
1490 }
1491
1492 SECTION("5 Illegal code positions")
1493 {
1494 // The following UTF-8 sequences should be rejected like malformed
1495 // sequences, because they never represent valid ISO 10646 characters and
1496 // a UTF-8 decoder that accepts them might introduce security problems
1497 // comparable to overlong UTF-8 sequences.
1498
1499 SECTION("5.1 Single UTF-16 surrogates")
1500 {
1501 // 5.1.1 U+D800 = ed a0 80
1502 roundtrip(false, "\xed\xa0\x80");
1503 // 5.1.2 U+DB7F = ed ad bf
1504 roundtrip(false, "\xed\xad\xbf");
1505 // 5.1.3 U+DB80 = ed ae 80
1506 roundtrip(false, "\xed\xae\x80");
1507 // 5.1.4 U+DBFF = ed af bf
1508 roundtrip(false, "\xed\xaf\xbf");
1509 // 5.1.5 U+DC00 = ed b0 80
1510 roundtrip(false, "\xed\xb0\x80");
1511 // 5.1.6 U+DF80 = ed be 80
1512 roundtrip(false, "\xed\xbe\x80");
1513 // 5.1.7 U+DFFF = ed bf bf
1514 roundtrip(false, "\xed\xbf\xbf");
1515 }
1516
1517 SECTION("5.2 Paired UTF-16 surrogates")
1518 {
1519 // 5.2.1 U+D800 U+DC00 = ed a0 80 ed b0 80
1520 roundtrip(false, "\xed\xa0\x80\xed\xb0\x80");
1521 // 5.2.2 U+D800 U+DFFF = ed a0 80 ed bf bf
1522 roundtrip(false, "\xed\xa0\x80\xed\xbf\xbf");
1523 // 5.2.3 U+DB7F U+DC00 = ed ad bf ed b0 80
1524 roundtrip(false, "\xed\xad\xbf\xed\xb0\x80");
1525 // 5.2.4 U+DB7F U+DFFF = ed ad bf ed bf bf
1526 roundtrip(false, "\xed\xad\xbf\xed\xbf\xbf");
1527 // 5.2.5 U+DB80 U+DC00 = ed ae 80 ed b0 80
1528 roundtrip(false, "\xed\xae\x80\xed\xb0\x80");
1529 // 5.2.6 U+DB80 U+DFFF = ed ae 80 ed bf bf
1530 roundtrip(false, "\xed\xae\x80\xed\xbf\xbf");
1531 // 5.2.7 U+DBFF U+DC00 = ed af bf ed b0 80
1532 roundtrip(false, "\xed\xaf\xbf\xed\xb0\x80");
1533 // 5.2.8 U+DBFF U+DFFF = ed af bf ed bf bf
1534 roundtrip(false, "\xed\xaf\xbf\xed\xbf\xbf");
1535 }
1536
1537 SECTION("5.3 Noncharacter code positions")
1538 {
1539 // The following "noncharacters" are "reserved for internal use" by
1540 // applications, and according to older versions of the Unicode Standard
1541 // "should never be interchanged". Unicode Corrigendum #9 dropped the
1542 // latter restriction. Nevertheless, their presence in incoming UTF-8 data
1543 // can remain a potential security risk, depending on what use is made of
1544 // these codes subsequently. Examples of such internal use:
1545 //
1546 // - Some file APIs with 16-bit characters may use the integer value -1
1547 // = U+FFFF to signal an end-of-file (EOF) or error condition.
1548 //
1549 // - In some UTF-16 receivers, code point U+FFFE might trigger a
1550 // byte-swap operation (to convert between UTF-16LE and UTF-16BE).
1551 //
1552 // With such internal use of noncharacters, it may be desirable and safer
1553 // to block those code points in UTF-8 decoders, as they should never
1554 // occur legitimately in incoming UTF-8 data, and could trigger unsafe
1555 // behaviour in subsequent processing.
1556
1557 // Particularly problematic noncharacters in 16-bit applications:
1558
1559 // 5.3.1 U+FFFE = ef bf be
1560 roundtrip(true, "\xef\xbf\xbe");
1561 // 5.3.2 U+FFFF = ef bf bf
1562 roundtrip(true, "\xef\xbf\xbf");
1563
1564 // 5.3.3 U+FDD0 .. U+FDEF
1565 roundtrip(true, "\xEF\xB7\x90");
1566 roundtrip(true, "\xEF\xB7\x91");
1567 roundtrip(true, "\xEF\xB7\x92");
1568 roundtrip(true, "\xEF\xB7\x93");
1569 roundtrip(true, "\xEF\xB7\x94");
1570 roundtrip(true, "\xEF\xB7\x95");
1571 roundtrip(true, "\xEF\xB7\x96");
1572 roundtrip(true, "\xEF\xB7\x97");
1573 roundtrip(true, "\xEF\xB7\x98");
1574 roundtrip(true, "\xEF\xB7\x99");
1575 roundtrip(true, "\xEF\xB7\x9A");
1576 roundtrip(true, "\xEF\xB7\x9B");
1577 roundtrip(true, "\xEF\xB7\x9C");
1578 roundtrip(true, "\xEF\xB7\x9D");
1579 roundtrip(true, "\xEF\xB7\x9E");
1580 roundtrip(true, "\xEF\xB7\x9F");
1581 roundtrip(true, "\xEF\xB7\xA0");
1582 roundtrip(true, "\xEF\xB7\xA1");
1583 roundtrip(true, "\xEF\xB7\xA2");
1584 roundtrip(true, "\xEF\xB7\xA3");
1585 roundtrip(true, "\xEF\xB7\xA4");
1586 roundtrip(true, "\xEF\xB7\xA5");
1587 roundtrip(true, "\xEF\xB7\xA6");
1588 roundtrip(true, "\xEF\xB7\xA7");
1589 roundtrip(true, "\xEF\xB7\xA8");
1590 roundtrip(true, "\xEF\xB7\xA9");
1591 roundtrip(true, "\xEF\xB7\xAA");
1592 roundtrip(true, "\xEF\xB7\xAB");
1593 roundtrip(true, "\xEF\xB7\xAC");
1594 roundtrip(true, "\xEF\xB7\xAD");
1595 roundtrip(true, "\xEF\xB7\xAE");
1596 roundtrip(true, "\xEF\xB7\xAF");
1597
1598 // 5.3.4 U+nFFFE U+nFFFF (for n = 1..10)
1599 roundtrip(true, "\xF0\x9F\xBF\xBF");
1600 roundtrip(true, "\xF0\xAF\xBF\xBF");
1601 roundtrip(true, "\xF0\xBF\xBF\xBF");
1602 roundtrip(true, "\xF1\x8F\xBF\xBF");
1603 roundtrip(true, "\xF1\x9F\xBF\xBF");
1604 roundtrip(true, "\xF1\xAF\xBF\xBF");
1605 roundtrip(true, "\xF1\xBF\xBF\xBF");
1606 roundtrip(true, "\xF2\x8F\xBF\xBF");
1607 roundtrip(true, "\xF2\x9F\xBF\xBF");
1608 roundtrip(true, "\xF2\xAF\xBF\xBF");
1609 }
1610 }
1611 }
1612