• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2     __ _____ _____ _____
3  __|  |   __|     |   | |  JSON for Modern C++ (test suite)
4 |  |  |__   |  |  | | | |  version 3.9.1
5 |_____|_____|_____|_|___|  https://github.com/nlohmann/json
6 
7 Licensed under the MIT License <http://opensource.org/licenses/MIT>.
8 SPDX-License-Identifier: MIT
9 Copyright (c) 2013-2019 Niels Lohmann <http://nlohmann.me>.
10 
11 Permission is hereby  granted, free of charge, to any  person obtaining a copy
12 of this software and associated  documentation files (the "Software"), to deal
13 in the Software  without restriction, including without  limitation the rights
14 to  use, copy,  modify, merge,  publish, distribute,  sublicense, and/or  sell
15 copies  of  the Software,  and  to  permit persons  to  whom  the Software  is
16 furnished to do so, subject to the following conditions:
17 
18 The above copyright notice and this permission notice shall be included in all
19 copies or substantial portions of the Software.
20 
21 THE SOFTWARE  IS PROVIDED "AS  IS", WITHOUT WARRANTY  OF ANY KIND,  EXPRESS OR
22 IMPLIED,  INCLUDING BUT  NOT  LIMITED TO  THE  WARRANTIES OF  MERCHANTABILITY,
23 FITNESS FOR  A PARTICULAR PURPOSE AND  NONINFRINGEMENT. IN NO EVENT  SHALL THE
24 AUTHORS  OR COPYRIGHT  HOLDERS  BE  LIABLE FOR  ANY  CLAIM,  DAMAGES OR  OTHER
25 LIABILITY, WHETHER IN AN ACTION OF  CONTRACT, TORT OR OTHERWISE, ARISING FROM,
26 OUT OF OR IN CONNECTION WITH THE SOFTWARE  OR THE USE OR OTHER DEALINGS IN THE
27 SOFTWARE.
28 */
29 
30 #include "doctest_compatibility.h"
31 
32 // for some reason including this after the json header leads to linker errors with VS 2017...
33 #include <locale>
34 
35 #define private public
36 #include <nlohmann/json.hpp>
37 using nlohmann::json;
38 #undef private
39 
40 #include <fstream>
41 #include <sstream>
42 #include <iostream>
43 #include <iomanip>
44 #include <test_data.hpp>
45 
46 namespace
47 {
48 extern size_t calls;
49 size_t calls = 0;
50 
51 void check_utf8dump(bool success_expected, int byte1, int byte2, int byte3, int byte4);
52 
check_utf8dump(bool success_expected,int byte1,int byte2=-1,int byte3=-1,int byte4=-1)53 void check_utf8dump(bool success_expected, int byte1, int byte2 = -1, int byte3 = -1, int byte4 = -1)
54 {
55     std::string json_string;
56 
57     CAPTURE(byte1)
58     CAPTURE(byte2)
59     CAPTURE(byte3)
60     CAPTURE(byte4)
61 
62     json_string += std::string(1, static_cast<char>(byte1));
63 
64     if (byte2 != -1)
65     {
66         json_string += std::string(1, static_cast<char>(byte2));
67     }
68 
69     if (byte3 != -1)
70     {
71         json_string += std::string(1, static_cast<char>(byte3));
72     }
73 
74     if (byte4 != -1)
75     {
76         json_string += std::string(1, static_cast<char>(byte4));
77     }
78 
79     CAPTURE(json_string)
80 
81     // store the string in a JSON value
82     json j = json_string;
83     json j2 = "abc" + json_string + "xyz";
84 
85     // dumping with ignore/replace must not throw in any case
86     auto s_ignored = j.dump(-1, ' ', false, json::error_handler_t::ignore);
87     auto s_ignored2 = j2.dump(-1, ' ', false, json::error_handler_t::ignore);
88     auto s_ignored_ascii = j.dump(-1, ' ', true, json::error_handler_t::ignore);
89     auto s_ignored2_ascii = j2.dump(-1, ' ', true, json::error_handler_t::ignore);
90     auto s_replaced = j.dump(-1, ' ', false, json::error_handler_t::replace);
91     auto s_replaced2 = j2.dump(-1, ' ', false, json::error_handler_t::replace);
92     auto s_replaced_ascii = j.dump(-1, ' ', true, json::error_handler_t::replace);
93     auto s_replaced2_ascii = j2.dump(-1, ' ', true, json::error_handler_t::replace);
94 
95     if (success_expected)
96     {
97         // strict mode must not throw if success is expected
98         auto s_strict = j.dump();
99         // all dumps should agree on the string
100         CHECK(s_strict == s_ignored);
101         CHECK(s_strict == s_replaced);
102     }
103     else
104     {
105         // strict mode must throw if success is not expected
106         CHECK_THROWS_AS(j.dump(), json::type_error&);
107         // ignore and replace must create different dumps
108         CHECK(s_ignored != s_replaced);
109 
110         // check that replace string contains a replacement character
111         CHECK(s_replaced.find("\xEF\xBF\xBD") != std::string::npos);
112     }
113 
114     // check that prefix and suffix are preserved
115     CHECK(s_ignored2.substr(1, 3) == "abc");
116     CHECK(s_ignored2.substr(s_ignored2.size() - 4, 3) == "xyz");
117     CHECK(s_ignored2_ascii.substr(1, 3) == "abc");
118     CHECK(s_ignored2_ascii.substr(s_ignored2_ascii.size() - 4, 3) == "xyz");
119     CHECK(s_replaced2.substr(1, 3) == "abc");
120     CHECK(s_replaced2.substr(s_replaced2.size() - 4, 3) == "xyz");
121     CHECK(s_replaced2_ascii.substr(1, 3) == "abc");
122     CHECK(s_replaced2_ascii.substr(s_replaced2_ascii.size() - 4, 3) == "xyz");
123 }
124 
125 void check_utf8string(bool success_expected, int byte1, int byte2, int byte3, int byte4);
126 
127 // create and check a JSON string with up to four UTF-8 bytes
check_utf8string(bool success_expected,int byte1,int byte2=-1,int byte3=-1,int byte4=-1)128 void check_utf8string(bool success_expected, int byte1, int byte2 = -1, int byte3 = -1, int byte4 = -1)
129 {
130     if (++calls % 100000 == 0)
131     {
132         std::cout << calls << " of 8860608 UTF-8 strings checked" << std::endl;
133     }
134 
135     std::string json_string = "\"";
136 
137     CAPTURE(byte1)
138     json_string += std::string(1, static_cast<char>(byte1));
139 
140     if (byte2 != -1)
141     {
142         CAPTURE(byte2)
143         json_string += std::string(1, static_cast<char>(byte2));
144     }
145 
146     if (byte3 != -1)
147     {
148         CAPTURE(byte3)
149         json_string += std::string(1, static_cast<char>(byte3));
150     }
151 
152     if (byte4 != -1)
153     {
154         CAPTURE(byte4)
155         json_string += std::string(1, static_cast<char>(byte4));
156     }
157 
158     json_string += "\"";
159 
160     CAPTURE(json_string)
161 
162     json _;
163     if (success_expected)
164     {
165         CHECK_NOTHROW(_ = json::parse(json_string));
166     }
167     else
168     {
169         CHECK_THROWS_AS(_ = json::parse(json_string), json::parse_error&);
170     }
171 }
172 }
173 
174 TEST_CASE("Unicode" * doctest::skip())
175 {
176     SECTION("RFC 3629")
177     {
178         /*
179         RFC 3629 describes in Sect. 4 the syntax of UTF-8 byte sequences as
180         follows:
181 
182             A UTF-8 string is a sequence of octets representing a sequence of UCS
183             characters.  An octet sequence is valid UTF-8 only if it matches the
184             following syntax, which is derived from the rules for encoding UTF-8
185             and is expressed in the ABNF of [RFC2234].
186 
187             UTF8-octets = *( UTF8-char )
188             UTF8-char   = UTF8-1 / UTF8-2 / UTF8-3 / UTF8-4
189             UTF8-1      = %x00-7F
190             UTF8-2      = %xC2-DF UTF8-tail
191             UTF8-3      = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
192                           %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
193             UTF8-4      = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
194                           %xF4 %x80-8F 2( UTF8-tail )
195             UTF8-tail   = %x80-BF
196         */
197 
198         SECTION("ill-formed first byte")
199         {
200             for (int byte1 = 0x80; byte1 <= 0xC1; ++byte1)
201             {
202                 check_utf8string(false, byte1);
203                 check_utf8dump(false, byte1);
204             }
205 
206             for (int byte1 = 0xF5; byte1 <= 0xFF; ++byte1)
207             {
208                 check_utf8string(false, byte1);
209                 check_utf8dump(false, byte1);
210             }
211         }
212 
213         SECTION("UTF8-1 (x00-x7F)")
214         {
215             SECTION("well-formed")
216             {
217                 for (int byte1 = 0x00; byte1 <= 0x7F; ++byte1)
218                 {
219                     // unescaped control characters are parse errors in JSON
220                     if (0x00 <= byte1 && byte1 <= 0x1F)
221                     {
222                         check_utf8string(false, byte1);
223                         continue;
224                     }
225 
226                     // a lone double quote (0x22) is a parse error in JSON
227                     if (byte1 == 0x22)
228                     {
229                         check_utf8string(false, byte1);
230                         continue;
231                     }
232 
233                     // a single backslash is a parse error in JSON
234                     if (byte1 == 0x5C)
235                     {
236                         check_utf8string(false, byte1);
237                         continue;
238                     }
239 
240                     // all other characters are OK
241                     check_utf8string(true, byte1);
242                     check_utf8dump(true, byte1);
243                 }
244             }
245         }
246 
247         SECTION("UTF8-2 (xC2-xDF UTF8-tail)")
248         {
249             SECTION("well-formed")
250             {
251                 for (int byte1 = 0xC2; byte1 <= 0xDF; ++byte1)
252                 {
253                     for (int byte2 = 0x80; byte2 <= 0xBF; ++byte2)
254                     {
255                         check_utf8string(true, byte1, byte2);
256                         check_utf8dump(true, byte1, byte2);
257                     }
258                 }
259             }
260 
261             SECTION("ill-formed: missing second byte")
262             {
263                 for (int byte1 = 0xC2; byte1 <= 0xDF; ++byte1)
264                 {
265                     check_utf8string(false, byte1);
266                     check_utf8dump(false, byte1);
267                 }
268             }
269 
270             SECTION("ill-formed: wrong second byte")
271             {
272                 for (int byte1 = 0xC2; byte1 <= 0xDF; ++byte1)
273                 {
274                     for (int byte2 = 0x00; byte2 <= 0xFF; ++byte2)
275                     {
276                         // skip correct second byte
277                         if (0x80 <= byte2 && byte2 <= 0xBF)
278                         {
279                             continue;
280                         }
281 
282                         check_utf8string(false, byte1, byte2);
283                         check_utf8dump(false, byte1, byte2);
284                     }
285                 }
286             }
287         }
288 
289         SECTION("UTF8-3 (xE0 xA0-BF UTF8-tail)")
290         {
291             SECTION("well-formed")
292             {
293                 for (int byte1 = 0xE0; byte1 <= 0xE0; ++byte1)
294                 {
295                     for (int byte2 = 0xA0; byte2 <= 0xBF; ++byte2)
296                     {
297                         for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
298                         {
299                             check_utf8string(true, byte1, byte2, byte3);
300                             check_utf8dump(true, byte1, byte2, byte3);
301                         }
302                     }
303                 }
304             }
305 
306             SECTION("ill-formed: missing second byte")
307             {
308                 for (int byte1 = 0xE0; byte1 <= 0xE0; ++byte1)
309                 {
310                     check_utf8string(false, byte1);
311                     check_utf8dump(false, byte1);
312                 }
313             }
314 
315             SECTION("ill-formed: missing third byte")
316             {
317                 for (int byte1 = 0xE0; byte1 <= 0xE0; ++byte1)
318                 {
319                     for (int byte2 = 0xA0; byte2 <= 0xBF; ++byte2)
320                     {
321                         check_utf8string(false, byte1, byte2);
322                         check_utf8dump(false, byte1, byte2);
323                     }
324                 }
325             }
326 
327             SECTION("ill-formed: wrong second byte")
328             {
329                 for (int byte1 = 0xE0; byte1 <= 0xE0; ++byte1)
330                 {
331                     for (int byte2 = 0x00; byte2 <= 0xFF; ++byte2)
332                     {
333                         // skip correct second byte
334                         if (0xA0 <= byte2 && byte2 <= 0xBF)
335                         {
336                             continue;
337                         }
338 
339                         for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
340                         {
341                             check_utf8string(false, byte1, byte2, byte3);
342                             check_utf8dump(false, byte1, byte2, byte3);
343                         }
344                     }
345                 }
346             }
347 
348             SECTION("ill-formed: wrong third byte")
349             {
350                 for (int byte1 = 0xE0; byte1 <= 0xE0; ++byte1)
351                 {
352                     for (int byte2 = 0xA0; byte2 <= 0xBF; ++byte2)
353                     {
354                         for (int byte3 = 0x00; byte3 <= 0xFF; ++byte3)
355                         {
356                             // skip correct third byte
357                             if (0x80 <= byte3 && byte3 <= 0xBF)
358                             {
359                                 continue;
360                             }
361 
362                             check_utf8string(false, byte1, byte2, byte3);
363                             check_utf8dump(false, byte1, byte2, byte3);
364                         }
365                     }
366                 }
367             }
368         }
369 
370         SECTION("UTF8-3 (xE1-xEC UTF8-tail UTF8-tail)")
371         {
372             SECTION("well-formed")
373             {
374                 for (int byte1 = 0xE1; byte1 <= 0xEC; ++byte1)
375                 {
376                     for (int byte2 = 0x80; byte2 <= 0xBF; ++byte2)
377                     {
378                         for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
379                         {
380                             check_utf8string(true, byte1, byte2, byte3);
381                             check_utf8dump(true, byte1, byte2, byte3);
382                         }
383                     }
384                 }
385             }
386 
387             SECTION("ill-formed: missing second byte")
388             {
389                 for (int byte1 = 0xE1; byte1 <= 0xEC; ++byte1)
390                 {
391                     check_utf8string(false, byte1);
392                     check_utf8dump(false, byte1);
393                 }
394             }
395 
396             SECTION("ill-formed: missing third byte")
397             {
398                 for (int byte1 = 0xE1; byte1 <= 0xEC; ++byte1)
399                 {
400                     for (int byte2 = 0x80; byte2 <= 0xBF; ++byte2)
401                     {
402                         check_utf8string(false, byte1, byte2);
403                         check_utf8dump(false, byte1, byte2);
404                     }
405                 }
406             }
407 
408             SECTION("ill-formed: wrong second byte")
409             {
410                 for (int byte1 = 0xE1; byte1 <= 0xEC; ++byte1)
411                 {
412                     for (int byte2 = 0x00; byte2 <= 0xFF; ++byte2)
413                     {
414                         // skip correct second byte
415                         if (0x80 <= byte2 && byte2 <= 0xBF)
416                         {
417                             continue;
418                         }
419 
420                         for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
421                         {
422                             check_utf8string(false, byte1, byte2, byte3);
423                             check_utf8dump(false, byte1, byte2, byte3);
424                         }
425                     }
426                 }
427             }
428 
429             SECTION("ill-formed: wrong third byte")
430             {
431                 for (int byte1 = 0xE1; byte1 <= 0xEC; ++byte1)
432                 {
433                     for (int byte2 = 0x80; byte2 <= 0xBF; ++byte2)
434                     {
435                         for (int byte3 = 0x00; byte3 <= 0xFF; ++byte3)
436                         {
437                             // skip correct third byte
438                             if (0x80 <= byte3 && byte3 <= 0xBF)
439                             {
440                                 continue;
441                             }
442 
443                             check_utf8string(false, byte1, byte2, byte3);
444                             check_utf8dump(false, byte1, byte2, byte3);
445                         }
446                     }
447                 }
448             }
449         }
450 
451         SECTION("UTF8-3 (xED x80-9F UTF8-tail)")
452         {
453             SECTION("well-formed")
454             {
455                 for (int byte1 = 0xED; byte1 <= 0xED; ++byte1)
456                 {
457                     for (int byte2 = 0x80; byte2 <= 0x9F; ++byte2)
458                     {
459                         for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
460                         {
461                             check_utf8string(true, byte1, byte2, byte3);
462                             check_utf8dump(true, byte1, byte2, byte3);
463                         }
464                     }
465                 }
466             }
467 
468             SECTION("ill-formed: missing second byte")
469             {
470                 for (int byte1 = 0xED; byte1 <= 0xED; ++byte1)
471                 {
472                     check_utf8string(false, byte1);
473                     check_utf8dump(false, byte1);
474                 }
475             }
476 
477             SECTION("ill-formed: missing third byte")
478             {
479                 for (int byte1 = 0xED; byte1 <= 0xED; ++byte1)
480                 {
481                     for (int byte2 = 0x80; byte2 <= 0x9F; ++byte2)
482                     {
483                         check_utf8string(false, byte1, byte2);
484                         check_utf8dump(false, byte1, byte2);
485                     }
486                 }
487             }
488 
489             SECTION("ill-formed: wrong second byte")
490             {
491                 for (int byte1 = 0xED; byte1 <= 0xED; ++byte1)
492                 {
493                     for (int byte2 = 0x00; byte2 <= 0xFF; ++byte2)
494                     {
495                         // skip correct second byte
496                         if (0x80 <= byte2 && byte2 <= 0x9F)
497                         {
498                             continue;
499                         }
500 
501                         for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
502                         {
503                             check_utf8string(false, byte1, byte2, byte3);
504                             check_utf8dump(false, byte1, byte2, byte3);
505                         }
506                     }
507                 }
508             }
509 
510             SECTION("ill-formed: wrong third byte")
511             {
512                 for (int byte1 = 0xED; byte1 <= 0xED; ++byte1)
513                 {
514                     for (int byte2 = 0x80; byte2 <= 0x9F; ++byte2)
515                     {
516                         for (int byte3 = 0x00; byte3 <= 0xFF; ++byte3)
517                         {
518                             // skip correct third byte
519                             if (0x80 <= byte3 && byte3 <= 0xBF)
520                             {
521                                 continue;
522                             }
523 
524                             check_utf8string(false, byte1, byte2, byte3);
525                             check_utf8dump(false, byte1, byte2, byte3);
526                         }
527                     }
528                 }
529             }
530         }
531 
532         SECTION("UTF8-3 (xEE-xEF UTF8-tail UTF8-tail)")
533         {
534             SECTION("well-formed")
535             {
536                 for (int byte1 = 0xEE; byte1 <= 0xEF; ++byte1)
537                 {
538                     for (int byte2 = 0x80; byte2 <= 0xBF; ++byte2)
539                     {
540                         for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
541                         {
542                             check_utf8string(true, byte1, byte2, byte3);
543                             check_utf8dump(true, byte1, byte2, byte3);
544                         }
545                     }
546                 }
547             }
548 
549             SECTION("ill-formed: missing second byte")
550             {
551                 for (int byte1 = 0xEE; byte1 <= 0xEF; ++byte1)
552                 {
553                     check_utf8string(false, byte1);
554                     check_utf8dump(false, byte1);
555                 }
556             }
557 
558             SECTION("ill-formed: missing third byte")
559             {
560                 for (int byte1 = 0xEE; byte1 <= 0xEF; ++byte1)
561                 {
562                     for (int byte2 = 0x80; byte2 <= 0xBF; ++byte2)
563                     {
564                         check_utf8string(false, byte1, byte2);
565                         check_utf8dump(false, byte1, byte2);
566                     }
567                 }
568             }
569 
570             SECTION("ill-formed: wrong second byte")
571             {
572                 for (int byte1 = 0xEE; byte1 <= 0xEF; ++byte1)
573                 {
574                     for (int byte2 = 0x00; byte2 <= 0xFF; ++byte2)
575                     {
576                         // skip correct second byte
577                         if (0x80 <= byte2 && byte2 <= 0xBF)
578                         {
579                             continue;
580                         }
581 
582                         for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
583                         {
584                             check_utf8string(false, byte1, byte2, byte3);
585                             check_utf8dump(false, byte1, byte2, byte3);
586                         }
587                     }
588                 }
589             }
590 
591             SECTION("ill-formed: wrong third byte")
592             {
593                 for (int byte1 = 0xEE; byte1 <= 0xEF; ++byte1)
594                 {
595                     for (int byte2 = 0x80; byte2 <= 0xBF; ++byte2)
596                     {
597                         for (int byte3 = 0x00; byte3 <= 0xFF; ++byte3)
598                         {
599                             // skip correct third byte
600                             if (0x80 <= byte3 && byte3 <= 0xBF)
601                             {
602                                 continue;
603                             }
604 
605                             check_utf8string(false, byte1, byte2, byte3);
606                             check_utf8dump(false, byte1, byte2, byte3);
607                         }
608                     }
609                 }
610             }
611         }
612 
613         SECTION("UTF8-4 (xF0 x90-BF UTF8-tail UTF8-tail)")
614         {
615             SECTION("well-formed")
616             {
617                 for (int byte1 = 0xF0; byte1 <= 0xF0; ++byte1)
618                 {
619                     for (int byte2 = 0x90; byte2 <= 0xBF; ++byte2)
620                     {
621                         for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
622                         {
623                             for (int byte4 = 0x80; byte4 <= 0xBF; ++byte4)
624                             {
625                                 check_utf8string(true, byte1, byte2, byte3, byte4);
626                                 check_utf8dump(true, byte1, byte2, byte3, byte4);
627                             }
628                         }
629                     }
630                 }
631             }
632 
633             SECTION("ill-formed: missing second byte")
634             {
635                 for (int byte1 = 0xF0; byte1 <= 0xF0; ++byte1)
636                 {
637                     check_utf8string(false, byte1);
638                     check_utf8dump(false, byte1);
639                 }
640             }
641 
642             SECTION("ill-formed: missing third byte")
643             {
644                 for (int byte1 = 0xF0; byte1 <= 0xF0; ++byte1)
645                 {
646                     for (int byte2 = 0x90; byte2 <= 0xBF; ++byte2)
647                     {
648                         check_utf8string(false, byte1, byte2);
649                         check_utf8dump(false, byte1, byte2);
650                     }
651                 }
652             }
653 
654             SECTION("ill-formed: missing fourth byte")
655             {
656                 for (int byte1 = 0xF0; byte1 <= 0xF0; ++byte1)
657                 {
658                     for (int byte2 = 0x90; byte2 <= 0xBF; ++byte2)
659                     {
660                         for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
661                         {
662                             check_utf8string(false, byte1, byte2, byte3);
663                             check_utf8dump(false, byte1, byte2, byte3);
664                         }
665                     }
666                 }
667             }
668 
669             SECTION("ill-formed: wrong second byte")
670             {
671                 for (int byte1 = 0xF0; byte1 <= 0xF0; ++byte1)
672                 {
673                     for (int byte2 = 0x00; byte2 <= 0xFF; ++byte2)
674                     {
675                         // skip correct second byte
676                         if (0x90 <= byte2 && byte2 <= 0xBF)
677                         {
678                             continue;
679                         }
680 
681                         for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
682                         {
683                             for (int byte4 = 0x80; byte4 <= 0xBF; ++byte4)
684                             {
685                                 check_utf8string(false, byte1, byte2, byte3, byte4);
686                                 check_utf8dump(false, byte1, byte2, byte3, byte4);
687                             }
688                         }
689                     }
690                 }
691             }
692 
693             SECTION("ill-formed: wrong third byte")
694             {
695                 for (int byte1 = 0xF0; byte1 <= 0xF0; ++byte1)
696                 {
697                     for (int byte2 = 0x90; byte2 <= 0xBF; ++byte2)
698                     {
699                         for (int byte3 = 0x00; byte3 <= 0xFF; ++byte3)
700                         {
701                             // skip correct third byte
702                             if (0x80 <= byte3 && byte3 <= 0xBF)
703                             {
704                                 continue;
705                             }
706 
707                             for (int byte4 = 0x80; byte4 <= 0xBF; ++byte4)
708                             {
709                                 check_utf8string(false, byte1, byte2, byte3, byte4);
710                                 check_utf8dump(false, byte1, byte2, byte3, byte4);
711                             }
712                         }
713                     }
714                 }
715             }
716 
717             SECTION("ill-formed: wrong fourth byte")
718             {
719                 for (int byte1 = 0xF0; byte1 <= 0xF0; ++byte1)
720                 {
721                     for (int byte2 = 0x90; byte2 <= 0xBF; ++byte2)
722                     {
723                         for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
724                         {
725                             for (int byte4 = 0x00; byte4 <= 0xFF; ++byte4)
726                             {
727                                 // skip fourth second byte
728                                 if (0x80 <= byte3 && byte3 <= 0xBF)
729                                 {
730                                     continue;
731                                 }
732 
733                                 check_utf8string(false, byte1, byte2, byte3, byte4);
734                                 check_utf8dump(false, byte1, byte2, byte3, byte4);
735                             }
736                         }
737                     }
738                 }
739             }
740         }
741 
742         SECTION("UTF8-4 (xF1-F3 UTF8-tail UTF8-tail UTF8-tail)")
743         {
744             SECTION("well-formed")
745             {
746                 for (int byte1 = 0xF1; byte1 <= 0xF3; ++byte1)
747                 {
748                     for (int byte2 = 0x80; byte2 <= 0xBF; ++byte2)
749                     {
750                         for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
751                         {
752                             for (int byte4 = 0x80; byte4 <= 0xBF; ++byte4)
753                             {
754                                 check_utf8string(true, byte1, byte2, byte3, byte4);
755                                 check_utf8dump(true, byte1, byte2, byte3, byte4);
756                             }
757                         }
758                     }
759                 }
760             }
761 
762             SECTION("ill-formed: missing second byte")
763             {
764                 for (int byte1 = 0xF1; byte1 <= 0xF3; ++byte1)
765                 {
766                     check_utf8string(false, byte1);
767                     check_utf8dump(false, byte1);
768                 }
769             }
770 
771             SECTION("ill-formed: missing third byte")
772             {
773                 for (int byte1 = 0xF1; byte1 <= 0xF3; ++byte1)
774                 {
775                     for (int byte2 = 0x80; byte2 <= 0xBF; ++byte2)
776                     {
777                         check_utf8string(false, byte1, byte2);
778                         check_utf8dump(false, byte1, byte2);
779                     }
780                 }
781             }
782 
783             SECTION("ill-formed: missing fourth byte")
784             {
785                 for (int byte1 = 0xF1; byte1 <= 0xF3; ++byte1)
786                 {
787                     for (int byte2 = 0x80; byte2 <= 0xBF; ++byte2)
788                     {
789                         for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
790                         {
791                             check_utf8string(false, byte1, byte2, byte3);
792                             check_utf8dump(false, byte1, byte2, byte3);
793                         }
794                     }
795                 }
796             }
797 
798             SECTION("ill-formed: wrong second byte")
799             {
800                 for (int byte1 = 0xF1; byte1 <= 0xF3; ++byte1)
801                 {
802                     for (int byte2 = 0x00; byte2 <= 0xFF; ++byte2)
803                     {
804                         // skip correct second byte
805                         if (0x80 <= byte2 && byte2 <= 0xBF)
806                         {
807                             continue;
808                         }
809 
810                         for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
811                         {
812                             for (int byte4 = 0x80; byte4 <= 0xBF; ++byte4)
813                             {
814                                 check_utf8string(false, byte1, byte2, byte3, byte4);
815                                 check_utf8dump(false, byte1, byte2, byte3, byte4);
816                             }
817                         }
818                     }
819                 }
820             }
821 
822             SECTION("ill-formed: wrong third byte")
823             {
824                 for (int byte1 = 0xF1; byte1 <= 0xF3; ++byte1)
825                 {
826                     for (int byte2 = 0x80; byte2 <= 0xBF; ++byte2)
827                     {
828                         for (int byte3 = 0x00; byte3 <= 0xFF; ++byte3)
829                         {
830                             // skip correct third byte
831                             if (0x80 <= byte3 && byte3 <= 0xBF)
832                             {
833                                 continue;
834                             }
835 
836                             for (int byte4 = 0x80; byte4 <= 0xBF; ++byte4)
837                             {
838                                 check_utf8string(false, byte1, byte2, byte3, byte4);
839                                 check_utf8dump(false, byte1, byte2, byte3, byte4);
840                             }
841                         }
842                     }
843                 }
844             }
845 
846             SECTION("ill-formed: wrong fourth byte")
847             {
848                 for (int byte1 = 0xF1; byte1 <= 0xF3; ++byte1)
849                 {
850                     for (int byte2 = 0x80; byte2 <= 0xBF; ++byte2)
851                     {
852                         for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
853                         {
854                             for (int byte4 = 0x00; byte4 <= 0xFF; ++byte4)
855                             {
856                                 // skip correct fourth byte
857                                 if (0x80 <= byte3 && byte3 <= 0xBF)
858                                 {
859                                     continue;
860                                 }
861 
862                                 check_utf8string(false, byte1, byte2, byte3, byte4);
863                                 check_utf8dump(false, byte1, byte2, byte3, byte4);
864                             }
865                         }
866                     }
867                 }
868             }
869         }
870 
871         SECTION("UTF8-4 (xF4 x80-8F UTF8-tail UTF8-tail)")
872         {
873             SECTION("well-formed")
874             {
875                 for (int byte1 = 0xF4; byte1 <= 0xF4; ++byte1)
876                 {
877                     for (int byte2 = 0x80; byte2 <= 0x8F; ++byte2)
878                     {
879                         for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
880                         {
881                             for (int byte4 = 0x80; byte4 <= 0xBF; ++byte4)
882                             {
883                                 check_utf8string(true, byte1, byte2, byte3, byte4);
884                                 check_utf8dump(true, byte1, byte2, byte3, byte4);
885                             }
886                         }
887                     }
888                 }
889             }
890 
891             SECTION("ill-formed: missing second byte")
892             {
893                 for (int byte1 = 0xF4; byte1 <= 0xF4; ++byte1)
894                 {
895                     check_utf8string(false, byte1);
896                     check_utf8dump(false, byte1);
897                 }
898             }
899 
900             SECTION("ill-formed: missing third byte")
901             {
902                 for (int byte1 = 0xF4; byte1 <= 0xF4; ++byte1)
903                 {
904                     for (int byte2 = 0x80; byte2 <= 0x8F; ++byte2)
905                     {
906                         check_utf8string(false, byte1, byte2);
907                         check_utf8dump(false, byte1, byte2);
908                     }
909                 }
910             }
911 
912             SECTION("ill-formed: missing fourth byte")
913             {
914                 for (int byte1 = 0xF4; byte1 <= 0xF4; ++byte1)
915                 {
916                     for (int byte2 = 0x80; byte2 <= 0x8F; ++byte2)
917                     {
918                         for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
919                         {
920                             check_utf8string(false, byte1, byte2, byte3);
921                             check_utf8dump(false, byte1, byte2, byte3);
922                         }
923                     }
924                 }
925             }
926 
927             SECTION("ill-formed: wrong second byte")
928             {
929                 for (int byte1 = 0xF4; byte1 <= 0xF4; ++byte1)
930                 {
931                     for (int byte2 = 0x00; byte2 <= 0xFF; ++byte2)
932                     {
933                         // skip correct second byte
934                         if (0x80 <= byte2 && byte2 <= 0x8F)
935                         {
936                             continue;
937                         }
938 
939                         for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
940                         {
941                             for (int byte4 = 0x80; byte4 <= 0xBF; ++byte4)
942                             {
943                                 check_utf8string(false, byte1, byte2, byte3, byte4);
944                                 check_utf8dump(false, byte1, byte2, byte3, byte4);
945                             }
946                         }
947                     }
948                 }
949             }
950 
951             SECTION("ill-formed: wrong third byte")
952             {
953                 for (int byte1 = 0xF4; byte1 <= 0xF4; ++byte1)
954                 {
955                     for (int byte2 = 0x80; byte2 <= 0x8F; ++byte2)
956                     {
957                         for (int byte3 = 0x00; byte3 <= 0xFF; ++byte3)
958                         {
959                             // skip correct third byte
960                             if (0x80 <= byte3 && byte3 <= 0xBF)
961                             {
962                                 continue;
963                             }
964 
965                             for (int byte4 = 0x80; byte4 <= 0xBF; ++byte4)
966                             {
967                                 check_utf8string(false, byte1, byte2, byte3, byte4);
968                                 check_utf8dump(false, byte1, byte2, byte3, byte4);
969                             }
970                         }
971                     }
972                 }
973             }
974 
975             SECTION("ill-formed: wrong fourth byte")
976             {
977                 for (int byte1 = 0xF4; byte1 <= 0xF4; ++byte1)
978                 {
979                     for (int byte2 = 0x80; byte2 <= 0x8F; ++byte2)
980                     {
981                         for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
982                         {
983                             for (int byte4 = 0x00; byte4 <= 0xFF; ++byte4)
984                             {
985                                 // skip correct fourth byte
986                                 if (0x80 <= byte3 && byte3 <= 0xBF)
987                                 {
988                                     continue;
989                                 }
990 
991                                 check_utf8string(false, byte1, byte2, byte3, byte4);
992                                 check_utf8dump(false, byte1, byte2, byte3, byte4);
993                             }
994                         }
995                     }
996                 }
997             }
998         }
999     }
1000 
1001     SECTION("\\uxxxx sequences")
1002     {
1003         // create an escaped string from a code point
1004         const auto codepoint_to_unicode = [](std::size_t cp)
1005         {
1006             // code points are represented as a six-character sequence: a
1007             // reverse solidus, followed by the lowercase letter u, followed
1008             // by four hexadecimal digits that encode the character's code
1009             // point
1010             std::stringstream ss;
1011             ss << "\\u" << std::setw(4) << std::setfill('0') << std::hex << cp;
1012             return ss.str();
1013         };
1014 
1015         SECTION("correct sequences")
1016         {
            // generate all Unicode code points; in total, 1112064 code points
            // are generated: 0x110000 code points (0x0000..0x10FFFF) - 2048
            // invalid surrogate values between 0xD800 and 0xDFFF.
1020             for (std::size_t cp = 0; cp <= 0x10FFFFu; ++cp)
1021             {
1022                 // string to store the code point as in \uxxxx format
1023                 std::string json_text = "\"";
1024 
1025                 // decide whether to use one or two \uxxxx sequences
1026                 if (cp < 0x10000u)
1027                 {
1028                     // The Unicode standard permanently reserves these code point
1029                     // values for UTF-16 encoding of the high and low surrogates, and
1030                     // they will never be assigned a character, so there should be no
1031                     // reason to encode them. The official Unicode standard says that
1032                     // no UTF forms, including UTF-16, can encode these code points.
1033                     if (cp >= 0xD800u && cp <= 0xDFFFu)
1034                     {
1035                         // if we would not skip these code points, we would get a
1036                         // "missing low surrogate" exception
1037                         continue;
1038                     }
1039 
1040                     // code points in the Basic Multilingual Plane can be
1041                     // represented with one \uxxxx sequence
1042                     json_text += codepoint_to_unicode(cp);
1043                 }
1044                 else
1045                 {
1046                     // To escape an extended character that is not in the Basic
1047                     // Multilingual Plane, the character is represented as a
1048                     // 12-character sequence, encoding the UTF-16 surrogate pair
1049                     const auto codepoint1 = 0xd800u + (((cp - 0x10000u) >> 10) & 0x3ffu);
1050                     const auto codepoint2 = 0xdc00u + ((cp - 0x10000u) & 0x3ffu);
1051                     json_text += codepoint_to_unicode(codepoint1) + codepoint_to_unicode(codepoint2);
1052                 }
1053 
1054                 json_text += "\"";
1055                 CAPTURE(json_text)
1056                 json _;
1057                 CHECK_NOTHROW(_ = json::parse(json_text));
1058             }
1059         }
1060 
1061         SECTION("incorrect sequences")
1062         {
1063             SECTION("incorrect surrogate values")
1064             {
1065                 json _;
1066 
1067                 CHECK_THROWS_AS(_ = json::parse("\"\\uDC00\\uDC00\""), json::parse_error&);
1068                 CHECK_THROWS_WITH(_ = json::parse("\"\\uDC00\\uDC00\""),
1069                                   "[json.exception.parse_error.101] parse error at line 1, column 7: syntax error while parsing value - invalid string: surrogate U+DC00..U+DFFF must follow U+D800..U+DBFF; last read: '\"\\uDC00'");
1070 
1071                 CHECK_THROWS_AS(_ = json::parse("\"\\uD7FF\\uDC00\""), json::parse_error&);
1072                 CHECK_THROWS_WITH(_ = json::parse("\"\\uD7FF\\uDC00\""),
1073                                   "[json.exception.parse_error.101] parse error at line 1, column 13: syntax error while parsing value - invalid string: surrogate U+DC00..U+DFFF must follow U+D800..U+DBFF; last read: '\"\\uD7FF\\uDC00'");
1074 
1075                 CHECK_THROWS_AS(_ = json::parse("\"\\uD800]\""), json::parse_error&);
1076                 CHECK_THROWS_WITH(_ = json::parse("\"\\uD800]\""),
1077                                   "[json.exception.parse_error.101] parse error at line 1, column 8: syntax error while parsing value - invalid string: surrogate U+D800..U+DBFF must be followed by U+DC00..U+DFFF; last read: '\"\\uD800]'");
1078 
1079                 CHECK_THROWS_AS(_ = json::parse("\"\\uD800\\v\""), json::parse_error&);
1080                 CHECK_THROWS_WITH(_ = json::parse("\"\\uD800\\v\""),
1081                                   "[json.exception.parse_error.101] parse error at line 1, column 9: syntax error while parsing value - invalid string: surrogate U+D800..U+DBFF must be followed by U+DC00..U+DFFF; last read: '\"\\uD800\\v'");
1082 
1083                 CHECK_THROWS_AS(_ = json::parse("\"\\uD800\\u123\""), json::parse_error&);
1084                 CHECK_THROWS_WITH(_ = json::parse("\"\\uD800\\u123\""),
1085                                   "[json.exception.parse_error.101] parse error at line 1, column 13: syntax error while parsing value - invalid string: '\\u' must be followed by 4 hex digits; last read: '\"\\uD800\\u123\"'");
1086 
1087                 CHECK_THROWS_AS(_ = json::parse("\"\\uD800\\uDBFF\""), json::parse_error&);
1088                 CHECK_THROWS_WITH(_ = json::parse("\"\\uD800\\uDBFF\""),
1089                                   "[json.exception.parse_error.101] parse error at line 1, column 13: syntax error while parsing value - invalid string: surrogate U+D800..U+DBFF must be followed by U+DC00..U+DFFF; last read: '\"\\uD800\\uDBFF'");
1090 
1091                 CHECK_THROWS_AS(_ = json::parse("\"\\uD800\\uE000\""), json::parse_error&);
1092                 CHECK_THROWS_WITH(_ = json::parse("\"\\uD800\\uE000\""),
1093                                   "[json.exception.parse_error.101] parse error at line 1, column 13: syntax error while parsing value - invalid string: surrogate U+D800..U+DBFF must be followed by U+DC00..U+DFFF; last read: '\"\\uD800\\uE000'");
1094             }
1095         }
1096 
1097 #if 0
1098         SECTION("incorrect sequences")
1099         {
1100             SECTION("high surrogate without low surrogate")
1101             {
1102                 // D800..DBFF are high surrogates and must be followed by low
1103                 // surrogates DC00..DFFF; here, nothing follows
1104                 for (std::size_t cp = 0xD800u; cp <= 0xDBFFu; ++cp)
1105                 {
1106                     std::string json_text = "\"" + codepoint_to_unicode(cp) + "\"";
1107                     CAPTURE(json_text)
1108                     CHECK_THROWS_AS(json::parse(json_text), json::parse_error&);
1109                 }
1110             }
1111 
1112             SECTION("high surrogate with wrong low surrogate")
1113             {
1114                 // D800..DBFF are high surrogates and must be followed by low
1115                 // surrogates DC00..DFFF; here a different sequence follows
1116                 for (std::size_t cp1 = 0xD800u; cp1 <= 0xDBFFu; ++cp1)
1117                 {
1118                     for (std::size_t cp2 = 0x0000u; cp2 <= 0xFFFFu; ++cp2)
1119                     {
1120                         if (0xDC00u <= cp2 && cp2 <= 0xDFFFu)
1121                         {
1122                             continue;
1123                         }
1124 
1125                         std::string json_text = "\"" + codepoint_to_unicode(cp1) + codepoint_to_unicode(cp2) + "\"";
1126                         CAPTURE(json_text)
1127                         CHECK_THROWS_AS(json::parse(json_text), json::parse_error&);
1128                     }
1129                 }
1130             }
1131 
1132             SECTION("low surrogate without high surrogate")
1133             {
1134                 // low surrogates DC00..DFFF must follow high surrogates; here,
1135                 // they occur alone
1136                 for (std::size_t cp = 0xDC00u; cp <= 0xDFFFu; ++cp)
1137                 {
1138                     std::string json_text = "\"" + codepoint_to_unicode(cp) + "\"";
1139                     CAPTURE(json_text)
1140                     CHECK_THROWS_AS(json::parse(json_text), json::parse_error&);
1141                 }
1142             }
1143 
1144         }
1145 #endif
1146     }
1147 
1148     SECTION("read all unicode characters")
1149     {
1150         // read a file with all unicode characters stored as single-character
1151         // strings in a JSON array
1152         std::ifstream f(TEST_DATA_DIRECTORY "/json_nlohmann_tests/all_unicode.json");
1153         json j;
1154         CHECK_NOTHROW(f >> j);
1155 
        // the array has 1112064 + 1 elements (a terminating "null" value)
        // Note: 1112064 = 0x110000 code points (0x0000..0x10FFFF) - 2048
        // invalid surrogate values between 0xD800 and 0xDFFF.
1159         CHECK(j.size() == 1112065);
1160 
1161         SECTION("check JSON Pointers")
1162         {
1163             for (auto s : j)
1164             {
1165                 // skip non-string JSON values
1166                 if (!s.is_string())
1167                 {
1168                     continue;
1169                 }
1170 
1171                 auto ptr = s.get<std::string>();
1172 
1173                 // tilde must be followed by 0 or 1
1174                 if (ptr == "~")
1175                 {
1176                     ptr += "0";
1177                 }
1178 
1179                 // JSON Pointers must begin with "/"
1180                 ptr = "/" + ptr;
1181 
1182                 CHECK_NOTHROW(json::json_pointer("/" + ptr));
1183 
1184                 // check escape/unescape roundtrip
1185                 auto escaped = json::json_pointer::escape(ptr);
1186                 json::json_pointer::unescape(escaped);
1187                 CHECK(escaped == ptr);
1188             }
1189         }
1190     }
1191 
1192     SECTION("ignore byte-order-mark")
1193     {
1194         SECTION("in a stream")
1195         {
1196             // read a file with a UTF-8 BOM
1197             std::ifstream f(TEST_DATA_DIRECTORY "/json_nlohmann_tests/bom.json");
1198             json j;
1199             CHECK_NOTHROW(f >> j);
1200         }
1201 
1202         SECTION("with an iterator")
1203         {
1204             std::string i = "\xef\xbb\xbf{\n   \"foo\": true\n}";
1205             CHECK_NOTHROW(json::parse(i.begin(), i.end()));
1206         }
1207     }
1208 
1209     SECTION("error for incomplete/wrong BOM")
1210     {
1211         json _;
1212         CHECK_THROWS_AS(_ = json::parse("\xef\xbb"), json::parse_error&);
1213         CHECK_THROWS_AS(_ = json::parse("\xef\xbb\xbb"), json::parse_error&);
1214     }
1215 }
1216 
1217 namespace
1218 {
1219 void roundtrip(bool success_expected, const std::string& s);
1220 
roundtrip(bool success_expected,const std::string & s)1221 void roundtrip(bool success_expected, const std::string& s)
1222 {
1223     CAPTURE(s)
1224     json _;
1225 
1226     // create JSON string value
1227     json j = s;
1228     // create JSON text
1229     std::string ps = std::string("\"") + s + "\"";
1230 
1231     if (success_expected)
1232     {
1233         // serialization succeeds
1234         CHECK_NOTHROW(j.dump());
1235 
1236         // exclude parse test for U+0000
1237         if (s[0] != '\0')
1238         {
1239             // parsing JSON text succeeds
1240             CHECK_NOTHROW(_ = json::parse(ps));
1241         }
1242 
1243         // roundtrip succeeds
1244         CHECK_NOTHROW(_ = json::parse(j.dump()));
1245 
1246         // after roundtrip, the same string is stored
1247         json jr = json::parse(j.dump());
1248         CHECK(jr.get<std::string>() == s);
1249     }
1250     else
1251     {
1252         // serialization fails
1253         CHECK_THROWS_AS(j.dump(), json::type_error&);
1254 
1255         // parsing JSON text fails
1256         CHECK_THROWS_AS(_ = json::parse(ps), json::parse_error&);
1257     }
1258 }
1259 }
1260 
1261 TEST_CASE("Markus Kuhn's UTF-8 decoder capability and stress test")
1262 {
1263     // Markus Kuhn <http://www.cl.cam.ac.uk/~mgk25/> - 2015-08-28 - CC BY 4.0
1264     // http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
1265 
1266     SECTION("1  Some correct UTF-8 text")
1267     {
1268         roundtrip(true, "κόσμε");
1269     }
1270 
1271     SECTION("2  Boundary condition test cases")
1272     {
1273         SECTION("2.1  First possible sequence of a certain length")
1274         {
1275             // 2.1.1  1 byte  (U-00000000)
1276             roundtrip(true, std::string("\0", 1));
1277             // 2.1.2  2 bytes (U-00000080)
1278             roundtrip(true, "\xc2\x80");
1279             // 2.1.3  3 bytes (U-00000800)
1280             roundtrip(true, "\xe0\xa0\x80");
1281             // 2.1.4  4 bytes (U-00010000)
1282             roundtrip(true, "\xf0\x90\x80\x80");
1283 
1284             // 2.1.5  5 bytes (U-00200000)
1285             roundtrip(false, "\xF8\x88\x80\x80\x80");
1286             // 2.1.6  6 bytes (U-04000000)
1287             roundtrip(false, "\xFC\x84\x80\x80\x80\x80");
1288         }
1289 
1290         SECTION("2.2  Last possible sequence of a certain length")
1291         {
1292             // 2.2.1  1 byte  (U-0000007F)
1293             roundtrip(true, "\x7f");
1294             // 2.2.2  2 bytes (U-000007FF)
1295             roundtrip(true, "\xdf\xbf");
1296             // 2.2.3  3 bytes (U-0000FFFF)
1297             roundtrip(true, "\xef\xbf\xbf");
1298 
1299             // 2.2.4  4 bytes (U-001FFFFF)
1300             roundtrip(false, "\xF7\xBF\xBF\xBF");
1301             // 2.2.5  5 bytes (U-03FFFFFF)
1302             roundtrip(false, "\xFB\xBF\xBF\xBF\xBF");
1303             // 2.2.6  6 bytes (U-7FFFFFFF)
1304             roundtrip(false, "\xFD\xBF\xBF\xBF\xBF\xBF");
1305         }
1306 
1307         SECTION("2.3  Other boundary conditions")
1308         {
1309             // 2.3.1  U-0000D7FF = ed 9f bf
1310             roundtrip(true, "\xed\x9f\xbf");
1311             // 2.3.2  U-0000E000 = ee 80 80
1312             roundtrip(true, "\xee\x80\x80");
1313             // 2.3.3  U-0000FFFD = ef bf bd
1314             roundtrip(true, "\xef\xbf\xbd");
1315             // 2.3.4  U-0010FFFF = f4 8f bf bf
1316             roundtrip(true, "\xf4\x8f\xbf\xbf");
1317 
1318             // 2.3.5  U-00110000 = f4 90 80 80
1319             roundtrip(false, "\xf4\x90\x80\x80");
1320         }
1321     }
1322 
1323     SECTION("3  Malformed sequences")
1324     {
1325         SECTION("3.1  Unexpected continuation bytes")
1326         {
1327             // Each unexpected continuation byte should be separately signalled as a
1328             // malformed sequence of its own.
1329 
1330             // 3.1.1  First continuation byte 0x80
1331             roundtrip(false, "\x80");
1332             // 3.1.2  Last  continuation byte 0xbf
1333             roundtrip(false, "\xbf");
1334 
1335             // 3.1.3  2 continuation bytes
1336             roundtrip(false, "\x80\xbf");
1337             // 3.1.4  3 continuation bytes
1338             roundtrip(false, "\x80\xbf\x80");
1339             // 3.1.5  4 continuation bytes
1340             roundtrip(false, "\x80\xbf\x80\xbf");
1341             // 3.1.6  5 continuation bytes
1342             roundtrip(false, "\x80\xbf\x80\xbf\x80");
1343             // 3.1.7  6 continuation bytes
1344             roundtrip(false, "\x80\xbf\x80\xbf\x80\xbf");
1345             // 3.1.8  7 continuation bytes
1346             roundtrip(false, "\x80\xbf\x80\xbf\x80\xbf\x80");
1347 
1348             // 3.1.9  Sequence of all 64 possible continuation bytes (0x80-0xbf)
1349             roundtrip(false, "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf");
1350         }
1351 
1352         SECTION("3.2  Lonely start characters")
1353         {
1354             // 3.2.1  All 32 first bytes of 2-byte sequences (0xc0-0xdf)
1355             roundtrip(false, "\xc0 \xc1 \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf");
1356             // 3.2.2  All 16 first bytes of 3-byte sequences (0xe0-0xef)
1357             roundtrip(false, "\xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee \xef");
1358             // 3.2.3  All 8 first bytes of 4-byte sequences (0xf0-0xf7)
1359             roundtrip(false, "\xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7");
1360             // 3.2.4  All 4 first bytes of 5-byte sequences (0xf8-0xfb)
1361             roundtrip(false, "\xf8 \xf9 \xfa \xfb");
1362             // 3.2.5  All 2 first bytes of 6-byte sequences (0xfc-0xfd)
1363             roundtrip(false, "\xfc \xfd");
1364         }
1365 
1366         SECTION("3.3  Sequences with last continuation byte missing")
1367         {
1368             // All bytes of an incomplete sequence should be signalled as a single
1369             // malformed sequence, i.e., you should see only a single replacement
1370             // character in each of the next 10 tests. (Characters as in section 2)
1371 
1372             // 3.3.1  2-byte sequence with last byte missing (U+0000)
1373             roundtrip(false, "\xc0");
1374             // 3.3.2  3-byte sequence with last byte missing (U+0000)
1375             roundtrip(false, "\xe0\x80");
1376             // 3.3.3  4-byte sequence with last byte missing (U+0000)
1377             roundtrip(false, "\xf0\x80\x80");
1378             // 3.3.4  5-byte sequence with last byte missing (U+0000)
1379             roundtrip(false, "\xf8\x80\x80\x80");
1380             // 3.3.5  6-byte sequence with last byte missing (U+0000)
1381             roundtrip(false, "\xfc\x80\x80\x80\x80");
1382             // 3.3.6  2-byte sequence with last byte missing (U-000007FF)
1383             roundtrip(false, "\xdf");
1384             // 3.3.7  3-byte sequence with last byte missing (U-0000FFFF)
1385             roundtrip(false, "\xef\xbf");
1386             // 3.3.8  4-byte sequence with last byte missing (U-001FFFFF)
1387             roundtrip(false, "\xf7\xbf\xbf");
1388             // 3.3.9  5-byte sequence with last byte missing (U-03FFFFFF)
1389             roundtrip(false, "\xfb\xbf\xbf\xbf");
1390             // 3.3.10 6-byte sequence with last byte missing (U-7FFFFFFF)
1391             roundtrip(false, "\xfd\xbf\xbf\xbf\xbf");
1392         }
1393 
1394         SECTION("3.4  Concatenation of incomplete sequences")
1395         {
1396             // All the 10 sequences of 3.3 concatenated, you should see 10 malformed
1397             // sequences being signalled:
1398             roundtrip(false, "\xc0\xe0\x80\xf0\x80\x80\xf8\x80\x80\x80\xfc\x80\x80\x80\x80\xdf\xef\xbf\xf7\xbf\xbf\xfb\xbf\xbf\xbf\xfd\xbf\xbf\xbf\xbf");
1399         }
1400 
1401         SECTION("3.5  Impossible bytes")
1402         {
1403             // The following two bytes cannot appear in a correct UTF-8 string
1404 
1405             // 3.5.1  fe
1406             roundtrip(false, "\xfe");
1407             // 3.5.2  ff
1408             roundtrip(false, "\xff");
1409             // 3.5.3  fe fe ff ff
1410             roundtrip(false, "\xfe\xfe\xff\xff");
1411         }
1412     }
1413 
1414     SECTION("4  Overlong sequences")
1415     {
1416         // The following sequences are not malformed according to the letter of
1417         // the Unicode 2.0 standard. However, they are longer then necessary and
1418         // a correct UTF-8 encoder is not allowed to produce them. A "safe UTF-8
1419         // decoder" should reject them just like malformed sequences for two
1420         // reasons: (1) It helps to debug applications if overlong sequences are
1421         // not treated as valid representations of characters, because this helps
1422         // to spot problems more quickly. (2) Overlong sequences provide
1423         // alternative representations of characters, that could maliciously be
1424         // used to bypass filters that check only for ASCII characters. For
1425         // instance, a 2-byte encoded line feed (LF) would not be caught by a
1426         // line counter that counts only 0x0a bytes, but it would still be
1427         // processed as a line feed by an unsafe UTF-8 decoder later in the
1428         // pipeline. From a security point of view, ASCII compatibility of UTF-8
1429         // sequences means also, that ASCII characters are *only* allowed to be
1430         // represented by ASCII bytes in the range 0x00-0x7f. To ensure this
1431         // aspect of ASCII compatibility, use only "safe UTF-8 decoders" that
1432         // reject overlong UTF-8 sequences for which a shorter encoding exists.
1433 
1434         SECTION("4.1  Examples of an overlong ASCII character")
1435         {
1436             // With a safe UTF-8 decoder, all of the following five overlong
1437             // representations of the ASCII character slash ("/") should be rejected
1438             // like a malformed UTF-8 sequence, for instance by substituting it with
1439             // a replacement character. If you see a slash below, you do not have a
1440             // safe UTF-8 decoder!
1441 
1442             // 4.1.1 U+002F = c0 af
1443             roundtrip(false, "\xc0\xaf");
1444             // 4.1.2 U+002F = e0 80 af
1445             roundtrip(false, "\xe0\x80\xaf");
1446             // 4.1.3 U+002F = f0 80 80 af
1447             roundtrip(false, "\xf0\x80\x80\xaf");
1448             // 4.1.4 U+002F = f8 80 80 80 af
1449             roundtrip(false, "\xf8\x80\x80\x80\xaf");
1450             // 4.1.5 U+002F = fc 80 80 80 80 af
1451             roundtrip(false, "\xfc\x80\x80\x80\x80\xaf");
1452         }
1453 
1454         SECTION("4.2  Maximum overlong sequences")
1455         {
1456             // Below you see the highest Unicode value that is still resulting in an
1457             // overlong sequence if represented with the given number of bytes. This
1458             // is a boundary test for safe UTF-8 decoders. All five characters should
1459             // be rejected like malformed UTF-8 sequences.
1460 
1461             // 4.2.1  U-0000007F = c1 bf
1462             roundtrip(false, "\xc1\xbf");
1463             // 4.2.2  U-000007FF = e0 9f bf
1464             roundtrip(false, "\xe0\x9f\xbf");
1465             // 4.2.3  U-0000FFFF = f0 8f bf bf
1466             roundtrip(false, "\xf0\x8f\xbf\xbf");
1467             // 4.2.4  U-001FFFFF = f8 87 bf bf bf
1468             roundtrip(false, "\xf8\x87\xbf\xbf\xbf");
1469             // 4.2.5  U-03FFFFFF = fc 83 bf bf bf bf
1470             roundtrip(false, "\xfc\x83\xbf\xbf\xbf\xbf");
1471         }
1472 
1473         SECTION("4.3  Overlong representation of the NUL character")
1474         {
1475             // The following five sequences should also be rejected like malformed
1476             // UTF-8 sequences and should not be treated like the ASCII NUL
1477             // character.
1478 
1479             // 4.3.1  U+0000 = c0 80
1480             roundtrip(false, "\xc0\x80");
1481             // 4.3.2  U+0000 = e0 80 80
1482             roundtrip(false, "\xe0\x80\x80");
1483             // 4.3.3  U+0000 = f0 80 80 80
1484             roundtrip(false, "\xf0\x80\x80\x80");
1485             // 4.3.4  U+0000 = f8 80 80 80 80
1486             roundtrip(false, "\xf8\x80\x80\x80\x80");
1487             // 4.3.5  U+0000 = fc 80 80 80 80 80
1488             roundtrip(false, "\xfc\x80\x80\x80\x80\x80");
1489         }
1490     }
1491 
1492     SECTION("5  Illegal code positions")
1493     {
1494         // The following UTF-8 sequences should be rejected like malformed
1495         // sequences, because they never represent valid ISO 10646 characters and
1496         // a UTF-8 decoder that accepts them might introduce security problems
1497         // comparable to overlong UTF-8 sequences.
1498 
1499         SECTION("5.1 Single UTF-16 surrogates")
1500         {
1501             // 5.1.1  U+D800 = ed a0 80
1502             roundtrip(false, "\xed\xa0\x80");
1503             // 5.1.2  U+DB7F = ed ad bf
1504             roundtrip(false, "\xed\xad\xbf");
1505             // 5.1.3  U+DB80 = ed ae 80
1506             roundtrip(false, "\xed\xae\x80");
1507             // 5.1.4  U+DBFF = ed af bf
1508             roundtrip(false, "\xed\xaf\xbf");
1509             // 5.1.5  U+DC00 = ed b0 80
1510             roundtrip(false, "\xed\xb0\x80");
1511             // 5.1.6  U+DF80 = ed be 80
1512             roundtrip(false, "\xed\xbe\x80");
1513             // 5.1.7  U+DFFF = ed bf bf
1514             roundtrip(false, "\xed\xbf\xbf");
1515         }
1516 
1517         SECTION("5.2 Paired UTF-16 surrogates")
1518         {
1519             // 5.2.1  U+D800 U+DC00 = ed a0 80 ed b0 80
1520             roundtrip(false, "\xed\xa0\x80\xed\xb0\x80");
1521             // 5.2.2  U+D800 U+DFFF = ed a0 80 ed bf bf
1522             roundtrip(false, "\xed\xa0\x80\xed\xbf\xbf");
1523             // 5.2.3  U+DB7F U+DC00 = ed ad bf ed b0 80
1524             roundtrip(false, "\xed\xad\xbf\xed\xb0\x80");
1525             // 5.2.4  U+DB7F U+DFFF = ed ad bf ed bf bf
1526             roundtrip(false, "\xed\xad\xbf\xed\xbf\xbf");
1527             // 5.2.5  U+DB80 U+DC00 = ed ae 80 ed b0 80
1528             roundtrip(false, "\xed\xae\x80\xed\xb0\x80");
1529             // 5.2.6  U+DB80 U+DFFF = ed ae 80 ed bf bf
1530             roundtrip(false, "\xed\xae\x80\xed\xbf\xbf");
1531             // 5.2.7  U+DBFF U+DC00 = ed af bf ed b0 80
1532             roundtrip(false, "\xed\xaf\xbf\xed\xb0\x80");
1533             // 5.2.8  U+DBFF U+DFFF = ed af bf ed bf bf
1534             roundtrip(false, "\xed\xaf\xbf\xed\xbf\xbf");
1535         }
1536 
1537         SECTION("5.3 Noncharacter code positions")
1538         {
1539             // The following "noncharacters" are "reserved for internal use" by
1540             // applications, and according to older versions of the Unicode Standard
1541             // "should never be interchanged". Unicode Corrigendum #9 dropped the
1542             // latter restriction. Nevertheless, their presence in incoming UTF-8 data
1543             // can remain a potential security risk, depending on what use is made of
1544             // these codes subsequently. Examples of such internal use:
1545             //
1546             //  - Some file APIs with 16-bit characters may use the integer value -1
1547             //    = U+FFFF to signal an end-of-file (EOF) or error condition.
1548             //
1549             //  - In some UTF-16 receivers, code point U+FFFE might trigger a
1550             //    byte-swap operation (to convert between UTF-16LE and UTF-16BE).
1551             //
1552             // With such internal use of noncharacters, it may be desirable and safer
1553             // to block those code points in UTF-8 decoders, as they should never
1554             // occur legitimately in incoming UTF-8 data, and could trigger unsafe
1555             // behaviour in subsequent processing.
1556 
1557             // Particularly problematic noncharacters in 16-bit applications:
1558 
1559             // 5.3.1  U+FFFE = ef bf be
1560             roundtrip(true, "\xef\xbf\xbe");
1561             // 5.3.2  U+FFFF = ef bf bf
1562             roundtrip(true, "\xef\xbf\xbf");
1563 
1564             // 5.3.3  U+FDD0 .. U+FDEF
1565             roundtrip(true, "\xEF\xB7\x90");
1566             roundtrip(true, "\xEF\xB7\x91");
1567             roundtrip(true, "\xEF\xB7\x92");
1568             roundtrip(true, "\xEF\xB7\x93");
1569             roundtrip(true, "\xEF\xB7\x94");
1570             roundtrip(true, "\xEF\xB7\x95");
1571             roundtrip(true, "\xEF\xB7\x96");
1572             roundtrip(true, "\xEF\xB7\x97");
1573             roundtrip(true, "\xEF\xB7\x98");
1574             roundtrip(true, "\xEF\xB7\x99");
1575             roundtrip(true, "\xEF\xB7\x9A");
1576             roundtrip(true, "\xEF\xB7\x9B");
1577             roundtrip(true, "\xEF\xB7\x9C");
1578             roundtrip(true, "\xEF\xB7\x9D");
1579             roundtrip(true, "\xEF\xB7\x9E");
1580             roundtrip(true, "\xEF\xB7\x9F");
1581             roundtrip(true, "\xEF\xB7\xA0");
1582             roundtrip(true, "\xEF\xB7\xA1");
1583             roundtrip(true, "\xEF\xB7\xA2");
1584             roundtrip(true, "\xEF\xB7\xA3");
1585             roundtrip(true, "\xEF\xB7\xA4");
1586             roundtrip(true, "\xEF\xB7\xA5");
1587             roundtrip(true, "\xEF\xB7\xA6");
1588             roundtrip(true, "\xEF\xB7\xA7");
1589             roundtrip(true, "\xEF\xB7\xA8");
1590             roundtrip(true, "\xEF\xB7\xA9");
1591             roundtrip(true, "\xEF\xB7\xAA");
1592             roundtrip(true, "\xEF\xB7\xAB");
1593             roundtrip(true, "\xEF\xB7\xAC");
1594             roundtrip(true, "\xEF\xB7\xAD");
1595             roundtrip(true, "\xEF\xB7\xAE");
1596             roundtrip(true, "\xEF\xB7\xAF");
1597 
1598             // 5.3.4  U+nFFFE U+nFFFF (for n = 1..10)
1599             roundtrip(true, "\xF0\x9F\xBF\xBF");
1600             roundtrip(true, "\xF0\xAF\xBF\xBF");
1601             roundtrip(true, "\xF0\xBF\xBF\xBF");
1602             roundtrip(true, "\xF1\x8F\xBF\xBF");
1603             roundtrip(true, "\xF1\x9F\xBF\xBF");
1604             roundtrip(true, "\xF1\xAF\xBF\xBF");
1605             roundtrip(true, "\xF1\xBF\xBF\xBF");
1606             roundtrip(true, "\xF2\x8F\xBF\xBF");
1607             roundtrip(true, "\xF2\x9F\xBF\xBF");
1608             roundtrip(true, "\xF2\xAF\xBF\xBF");
1609         }
1610     }
1611 }
1612