//     __ _____ _____ _____
//  __|  |   __|     |   | |  JSON for Modern C++ (supporting code)
// |  |  |__   |  |  | | | |  version 3.11.2
// |_____|_____|_____|_|___|  https://github.com/nlohmann/json
//
// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann <https://nlohmann.me>
// SPDX-License-Identifier: MIT
8 
9 #include "doctest_compatibility.h"
10 
11 // for some reason including this after the json header leads to linker errors with VS 2017...
12 #include <locale>
13 #include <nlohmann/json.hpp>
14 using nlohmann::json;
15 
16 #include <fstream>
17 #include <sstream>
18 #include <iomanip>
19 #include "make_test_data_available.hpp"
20 
skip()21 TEST_CASE("Unicode (1/5)" * doctest::skip())
22 {
23     SECTION("\\uxxxx sequences")
24     {
25         // create an escaped string from a code point
26         const auto codepoint_to_unicode = [](std::size_t cp)
27         {
28             // code points are represented as a six-character sequence: a
29             // reverse solidus, followed by the lowercase letter u, followed
30             // by four hexadecimal digits that encode the character's code
31             // point
32             std::stringstream ss;
33             ss << "\\u" << std::setw(4) << std::setfill('0') << std::hex << cp;
34             return ss.str();
35         };
36 
37         SECTION("correct sequences")
38         {
39             // generate all UTF-8 code points; in total, 1112064 code points are
40             // generated: 0x1FFFFF code points - 2048 invalid values between
41             // 0xD800 and 0xDFFF.
42             for (std::size_t cp = 0; cp <= 0x10FFFFu; ++cp)
43             {
44                 // string to store the code point as in \uxxxx format
45                 std::string json_text = "\"";
46 
47                 // decide whether to use one or two \uxxxx sequences
48                 if (cp < 0x10000u)
49                 {
50                     // The Unicode standard permanently reserves these code point
51                     // values for UTF-16 encoding of the high and low surrogates, and
52                     // they will never be assigned a character, so there should be no
53                     // reason to encode them. The official Unicode standard says that
54                     // no UTF forms, including UTF-16, can encode these code points.
55                     if (cp >= 0xD800u && cp <= 0xDFFFu)
56                     {
57                         // if we would not skip these code points, we would get a
58                         // "missing low surrogate" exception
59                         continue;
60                     }
61 
62                     // code points in the Basic Multilingual Plane can be
63                     // represented with one \uxxxx sequence
64                     json_text += codepoint_to_unicode(cp);
65                 }
66                 else
67                 {
68                     // To escape an extended character that is not in the Basic
69                     // Multilingual Plane, the character is represented as a
70                     // 12-character sequence, encoding the UTF-16 surrogate pair
71                     const auto codepoint1 = 0xd800u + (((cp - 0x10000u) >> 10) & 0x3ffu);
72                     const auto codepoint2 = 0xdc00u + ((cp - 0x10000u) & 0x3ffu);
73                     json_text += codepoint_to_unicode(codepoint1) + codepoint_to_unicode(codepoint2);
74                 }
75 
76                 json_text += "\"";
77                 CAPTURE(json_text)
78                 json _;
79                 CHECK_NOTHROW(_ = json::parse(json_text));
80             }
81         }
82 
83         SECTION("incorrect sequences")
84         {
85             SECTION("incorrect surrogate values")
86             {
87                 json _;
88 
89                 CHECK_THROWS_WITH_AS(_ = json::parse("\"\\uDC00\\uDC00\""), "[json.exception.parse_error.101] parse error at line 1, column 7: syntax error while parsing value - invalid string: surrogate U+DC00..U+DFFF must follow U+D800..U+DBFF; last read: '\"\\uDC00'", json::parse_error&);
90 
91                 CHECK_THROWS_WITH_AS(_ = json::parse("\"\\uD7FF\\uDC00\""), "[json.exception.parse_error.101] parse error at line 1, column 13: syntax error while parsing value - invalid string: surrogate U+DC00..U+DFFF must follow U+D800..U+DBFF; last read: '\"\\uD7FF\\uDC00'", json::parse_error&);
92 
93                 CHECK_THROWS_WITH_AS(_ = json::parse("\"\\uD800]\""), "[json.exception.parse_error.101] parse error at line 1, column 8: syntax error while parsing value - invalid string: surrogate U+D800..U+DBFF must be followed by U+DC00..U+DFFF; last read: '\"\\uD800]'", json::parse_error&);
94 
95                 CHECK_THROWS_WITH_AS(_ = json::parse("\"\\uD800\\v\""), "[json.exception.parse_error.101] parse error at line 1, column 9: syntax error while parsing value - invalid string: surrogate U+D800..U+DBFF must be followed by U+DC00..U+DFFF; last read: '\"\\uD800\\v'", json::parse_error&);
96 
97                 CHECK_THROWS_WITH_AS(_ = json::parse("\"\\uD800\\u123\""), "[json.exception.parse_error.101] parse error at line 1, column 13: syntax error while parsing value - invalid string: '\\u' must be followed by 4 hex digits; last read: '\"\\uD800\\u123\"'", json::parse_error&);
98 
99                 CHECK_THROWS_WITH_AS(_ = json::parse("\"\\uD800\\uDBFF\""), "[json.exception.parse_error.101] parse error at line 1, column 13: syntax error while parsing value - invalid string: surrogate U+D800..U+DBFF must be followed by U+DC00..U+DFFF; last read: '\"\\uD800\\uDBFF'", json::parse_error&);
100 
101                 CHECK_THROWS_WITH_AS(_ = json::parse("\"\\uD800\\uE000\""), "[json.exception.parse_error.101] parse error at line 1, column 13: syntax error while parsing value - invalid string: surrogate U+D800..U+DBFF must be followed by U+DC00..U+DFFF; last read: '\"\\uD800\\uE000'", json::parse_error&);
102             }
103         }
104 
105 #if 0
106         SECTION("incorrect sequences")
107         {
108             SECTION("high surrogate without low surrogate")
109             {
110                 // D800..DBFF are high surrogates and must be followed by low
111                 // surrogates DC00..DFFF; here, nothing follows
112                 for (std::size_t cp = 0xD800u; cp <= 0xDBFFu; ++cp)
113                 {
114                     std::string json_text = "\"" + codepoint_to_unicode(cp) + "\"";
115                     CAPTURE(json_text)
116                     CHECK_THROWS_AS(json::parse(json_text), json::parse_error&);
117                 }
118             }
119 
120             SECTION("high surrogate with wrong low surrogate")
121             {
122                 // D800..DBFF are high surrogates and must be followed by low
123                 // surrogates DC00..DFFF; here a different sequence follows
124                 for (std::size_t cp1 = 0xD800u; cp1 <= 0xDBFFu; ++cp1)
125                 {
126                     for (std::size_t cp2 = 0x0000u; cp2 <= 0xFFFFu; ++cp2)
127                     {
128                         if (0xDC00u <= cp2 && cp2 <= 0xDFFFu)
129                         {
130                             continue;
131                         }
132 
133                         std::string json_text = "\"" + codepoint_to_unicode(cp1) + codepoint_to_unicode(cp2) + "\"";
134                         CAPTURE(json_text)
135                         CHECK_THROWS_AS(json::parse(json_text), json::parse_error&);
136                     }
137                 }
138             }
139 
140             SECTION("low surrogate without high surrogate")
141             {
142                 // low surrogates DC00..DFFF must follow high surrogates; here,
143                 // they occur alone
144                 for (std::size_t cp = 0xDC00u; cp <= 0xDFFFu; ++cp)
145                 {
146                     std::string json_text = "\"" + codepoint_to_unicode(cp) + "\"";
147                     CAPTURE(json_text)
148                     CHECK_THROWS_AS(json::parse(json_text), json::parse_error&);
149                 }
150             }
151 
152         }
153 #endif
154     }
155 
156     SECTION("read all unicode characters")
157     {
158         // read a file with all unicode characters stored as single-character
159         // strings in a JSON array
160         std::ifstream f(TEST_DATA_DIRECTORY "/json_nlohmann_tests/all_unicode.json");
161         json j;
162         CHECK_NOTHROW(f >> j);
163 
164         // the array has 1112064 + 1 elements (a terminating "null" value)
165         // Note: 1112064 = 0x1FFFFF code points - 2048 invalid values between
166         // 0xD800 and 0xDFFF.
167         CHECK(j.size() == 1112065);
168 
169         SECTION("check JSON Pointers")
170         {
171             for (const auto& s : j)
172             {
173                 // skip non-string JSON values
174                 if (!s.is_string())
175                 {
176                     continue;
177                 }
178 
179                 auto ptr = s.get<std::string>();
180 
181                 // tilde must be followed by 0 or 1
182                 if (ptr == "~")
183                 {
184                     ptr += "0";
185                 }
186 
187                 // JSON Pointers must begin with "/"
188                 ptr.insert(0, "/");
189 
190                 CHECK_NOTHROW(json::json_pointer("/" + ptr));
191 
192                 // check escape/unescape roundtrip
193                 auto escaped = nlohmann::detail::escape(ptr);
194                 nlohmann::detail::unescape(escaped);
195                 CHECK(escaped == ptr);
196             }
197         }
198     }
199 
200     SECTION("ignore byte-order-mark")
201     {
202         SECTION("in a stream")
203         {
204             // read a file with a UTF-8 BOM
205             std::ifstream f(TEST_DATA_DIRECTORY "/json_nlohmann_tests/bom.json");
206             json j;
207             CHECK_NOTHROW(f >> j);
208         }
209 
210         SECTION("with an iterator")
211         {
212             std::string i = "\xef\xbb\xbf{\n   \"foo\": true\n}";
213             json _;
214             CHECK_NOTHROW(_ = json::parse(i.begin(), i.end()));
215         }
216     }
217 
218     SECTION("error for incomplete/wrong BOM")
219     {
220         json _;
221         CHECK_THROWS_AS(_ = json::parse("\xef\xbb"), json::parse_error&);
222         CHECK_THROWS_AS(_ = json::parse("\xef\xbb\xbb"), json::parse_error&);
223     }
224 }
225 
226 namespace
227 {
228 void roundtrip(bool success_expected, const std::string& s);
229 
roundtrip(bool success_expected,const std::string & s)230 void roundtrip(bool success_expected, const std::string& s)
231 {
232     CAPTURE(s)
233     json _;
234 
235     // create JSON string value
236     json j = s;
237     // create JSON text
238     std::string ps = std::string("\"") + s + "\"";
239 
240     if (success_expected)
241     {
242         // serialization succeeds
243         CHECK_NOTHROW(j.dump());
244 
245         // exclude parse test for U+0000
246         if (s[0] != '\0')
247         {
248             // parsing JSON text succeeds
249             CHECK_NOTHROW(_ = json::parse(ps));
250         }
251 
252         // roundtrip succeeds
253         CHECK_NOTHROW(_ = json::parse(j.dump()));
254 
255         // after roundtrip, the same string is stored
256         json jr = json::parse(j.dump());
257         CHECK(jr.get<std::string>() == s);
258     }
259     else
260     {
261         // serialization fails
262         CHECK_THROWS_AS(j.dump(), json::type_error&);
263 
264         // parsing JSON text fails
265         CHECK_THROWS_AS(_ = json::parse(ps), json::parse_error&);
266     }
267 }
268 } // namespace
269 
270 TEST_CASE("Markus Kuhn's UTF-8 decoder capability and stress test")
271 {
272     // Markus Kuhn <http://www.cl.cam.ac.uk/~mgk25/> - 2015-08-28 - CC BY 4.0
273     // http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
274 
275     SECTION("1  Some correct UTF-8 text")
276     {
277         roundtrip(true, "κόσμε");
278     }
279 
280     SECTION("2  Boundary condition test cases")
281     {
282         SECTION("2.1  First possible sequence of a certain length")
283         {
284             // 2.1.1  1 byte  (U-00000000)
285             roundtrip(true, std::string("\0", 1));
286             // 2.1.2  2 bytes (U-00000080)
287             roundtrip(true, "\xc2\x80");
288             // 2.1.3  3 bytes (U-00000800)
289             roundtrip(true, "\xe0\xa0\x80");
290             // 2.1.4  4 bytes (U-00010000)
291             roundtrip(true, "\xf0\x90\x80\x80");
292 
293             // 2.1.5  5 bytes (U-00200000)
294             roundtrip(false, "\xF8\x88\x80\x80\x80");
295             // 2.1.6  6 bytes (U-04000000)
296             roundtrip(false, "\xFC\x84\x80\x80\x80\x80");
297         }
298 
299         SECTION("2.2  Last possible sequence of a certain length")
300         {
301             // 2.2.1  1 byte  (U-0000007F)
302             roundtrip(true, "\x7f");
303             // 2.2.2  2 bytes (U-000007FF)
304             roundtrip(true, "\xdf\xbf");
305             // 2.2.3  3 bytes (U-0000FFFF)
306             roundtrip(true, "\xef\xbf\xbf");
307 
308             // 2.2.4  4 bytes (U-001FFFFF)
309             roundtrip(false, "\xF7\xBF\xBF\xBF");
310             // 2.2.5  5 bytes (U-03FFFFFF)
311             roundtrip(false, "\xFB\xBF\xBF\xBF\xBF");
312             // 2.2.6  6 bytes (U-7FFFFFFF)
313             roundtrip(false, "\xFD\xBF\xBF\xBF\xBF\xBF");
314         }
315 
316         SECTION("2.3  Other boundary conditions")
317         {
318             // 2.3.1  U-0000D7FF = ed 9f bf
319             roundtrip(true, "\xed\x9f\xbf");
320             // 2.3.2  U-0000E000 = ee 80 80
321             roundtrip(true, "\xee\x80\x80");
322             // 2.3.3  U-0000FFFD = ef bf bd
323             roundtrip(true, "\xef\xbf\xbd");
324             // 2.3.4  U-0010FFFF = f4 8f bf bf
325             roundtrip(true, "\xf4\x8f\xbf\xbf");
326 
327             // 2.3.5  U-00110000 = f4 90 80 80
328             roundtrip(false, "\xf4\x90\x80\x80");
329         }
330     }
331 
332     SECTION("3  Malformed sequences")
333     {
334         SECTION("3.1  Unexpected continuation bytes")
335         {
336             // Each unexpected continuation byte should be separately signalled as a
337             // malformed sequence of its own.
338 
339             // 3.1.1  First continuation byte 0x80
340             roundtrip(false, "\x80");
341             // 3.1.2  Last  continuation byte 0xbf
342             roundtrip(false, "\xbf");
343 
344             // 3.1.3  2 continuation bytes
345             roundtrip(false, "\x80\xbf");
346             // 3.1.4  3 continuation bytes
347             roundtrip(false, "\x80\xbf\x80");
348             // 3.1.5  4 continuation bytes
349             roundtrip(false, "\x80\xbf\x80\xbf");
350             // 3.1.6  5 continuation bytes
351             roundtrip(false, "\x80\xbf\x80\xbf\x80");
352             // 3.1.7  6 continuation bytes
353             roundtrip(false, "\x80\xbf\x80\xbf\x80\xbf");
354             // 3.1.8  7 continuation bytes
355             roundtrip(false, "\x80\xbf\x80\xbf\x80\xbf\x80");
356 
357             // 3.1.9  Sequence of all 64 possible continuation bytes (0x80-0xbf)
358             roundtrip(false, "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf");
359         }
360 
361         SECTION("3.2  Lonely start characters")
362         {
363             // 3.2.1  All 32 first bytes of 2-byte sequences (0xc0-0xdf)
364             roundtrip(false, "\xc0 \xc1 \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf");
365             // 3.2.2  All 16 first bytes of 3-byte sequences (0xe0-0xef)
366             roundtrip(false, "\xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee \xef");
367             // 3.2.3  All 8 first bytes of 4-byte sequences (0xf0-0xf7)
368             roundtrip(false, "\xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7");
369             // 3.2.4  All 4 first bytes of 5-byte sequences (0xf8-0xfb)
370             roundtrip(false, "\xf8 \xf9 \xfa \xfb");
371             // 3.2.5  All 2 first bytes of 6-byte sequences (0xfc-0xfd)
372             roundtrip(false, "\xfc \xfd");
373         }
374 
375         SECTION("3.3  Sequences with last continuation byte missing")
376         {
377             // All bytes of an incomplete sequence should be signalled as a single
378             // malformed sequence, i.e., you should see only a single replacement
379             // character in each of the next 10 tests. (Characters as in section 2)
380 
381             // 3.3.1  2-byte sequence with last byte missing (U+0000)
382             roundtrip(false, "\xc0");
383             // 3.3.2  3-byte sequence with last byte missing (U+0000)
384             roundtrip(false, "\xe0\x80");
385             // 3.3.3  4-byte sequence with last byte missing (U+0000)
386             roundtrip(false, "\xf0\x80\x80");
387             // 3.3.4  5-byte sequence with last byte missing (U+0000)
388             roundtrip(false, "\xf8\x80\x80\x80");
389             // 3.3.5  6-byte sequence with last byte missing (U+0000)
390             roundtrip(false, "\xfc\x80\x80\x80\x80");
391             // 3.3.6  2-byte sequence with last byte missing (U-000007FF)
392             roundtrip(false, "\xdf");
393             // 3.3.7  3-byte sequence with last byte missing (U-0000FFFF)
394             roundtrip(false, "\xef\xbf");
395             // 3.3.8  4-byte sequence with last byte missing (U-001FFFFF)
396             roundtrip(false, "\xf7\xbf\xbf");
397             // 3.3.9  5-byte sequence with last byte missing (U-03FFFFFF)
398             roundtrip(false, "\xfb\xbf\xbf\xbf");
399             // 3.3.10 6-byte sequence with last byte missing (U-7FFFFFFF)
400             roundtrip(false, "\xfd\xbf\xbf\xbf\xbf");
401         }
402 
403         SECTION("3.4  Concatenation of incomplete sequences")
404         {
405             // All the 10 sequences of 3.3 concatenated, you should see 10 malformed
406             // sequences being signalled:
407             roundtrip(false, "\xc0\xe0\x80\xf0\x80\x80\xf8\x80\x80\x80\xfc\x80\x80\x80\x80\xdf\xef\xbf\xf7\xbf\xbf\xfb\xbf\xbf\xbf\xfd\xbf\xbf\xbf\xbf");
408         }
409 
410         SECTION("3.5  Impossible bytes")
411         {
412             // The following two bytes cannot appear in a correct UTF-8 string
413 
414             // 3.5.1  fe
415             roundtrip(false, "\xfe");
416             // 3.5.2  ff
417             roundtrip(false, "\xff");
418             // 3.5.3  fe fe ff ff
419             roundtrip(false, "\xfe\xfe\xff\xff");
420         }
421     }
422 
423     SECTION("4  Overlong sequences")
424     {
425         // The following sequences are not malformed according to the letter of
426         // the Unicode 2.0 standard. However, they are longer then necessary and
427         // a correct UTF-8 encoder is not allowed to produce them. A "safe UTF-8
428         // decoder" should reject them just like malformed sequences for two
429         // reasons: (1) It helps to debug applications if overlong sequences are
430         // not treated as valid representations of characters, because this helps
431         // to spot problems more quickly. (2) Overlong sequences provide
432         // alternative representations of characters, that could maliciously be
433         // used to bypass filters that check only for ASCII characters. For
434         // instance, a 2-byte encoded line feed (LF) would not be caught by a
435         // line counter that counts only 0x0a bytes, but it would still be
436         // processed as a line feed by an unsafe UTF-8 decoder later in the
437         // pipeline. From a security point of view, ASCII compatibility of UTF-8
438         // sequences means also, that ASCII characters are *only* allowed to be
439         // represented by ASCII bytes in the range 0x00-0x7f. To ensure this
440         // aspect of ASCII compatibility, use only "safe UTF-8 decoders" that
441         // reject overlong UTF-8 sequences for which a shorter encoding exists.
442 
443         SECTION("4.1  Examples of an overlong ASCII character")
444         {
445             // With a safe UTF-8 decoder, all of the following five overlong
446             // representations of the ASCII character slash ("/") should be rejected
447             // like a malformed UTF-8 sequence, for instance by substituting it with
448             // a replacement character. If you see a slash below, you do not have a
449             // safe UTF-8 decoder!
450 
451             // 4.1.1 U+002F = c0 af
452             roundtrip(false, "\xc0\xaf");
453             // 4.1.2 U+002F = e0 80 af
454             roundtrip(false, "\xe0\x80\xaf");
455             // 4.1.3 U+002F = f0 80 80 af
456             roundtrip(false, "\xf0\x80\x80\xaf");
457             // 4.1.4 U+002F = f8 80 80 80 af
458             roundtrip(false, "\xf8\x80\x80\x80\xaf");
459             // 4.1.5 U+002F = fc 80 80 80 80 af
460             roundtrip(false, "\xfc\x80\x80\x80\x80\xaf");
461         }
462 
463         SECTION("4.2  Maximum overlong sequences")
464         {
465             // Below you see the highest Unicode value that is still resulting in an
466             // overlong sequence if represented with the given number of bytes. This
467             // is a boundary test for safe UTF-8 decoders. All five characters should
468             // be rejected like malformed UTF-8 sequences.
469 
470             // 4.2.1  U-0000007F = c1 bf
471             roundtrip(false, "\xc1\xbf");
472             // 4.2.2  U-000007FF = e0 9f bf
473             roundtrip(false, "\xe0\x9f\xbf");
474             // 4.2.3  U-0000FFFF = f0 8f bf bf
475             roundtrip(false, "\xf0\x8f\xbf\xbf");
476             // 4.2.4  U-001FFFFF = f8 87 bf bf bf
477             roundtrip(false, "\xf8\x87\xbf\xbf\xbf");
478             // 4.2.5  U-03FFFFFF = fc 83 bf bf bf bf
479             roundtrip(false, "\xfc\x83\xbf\xbf\xbf\xbf");
480         }
481 
482         SECTION("4.3  Overlong representation of the NUL character")
483         {
484             // The following five sequences should also be rejected like malformed
485             // UTF-8 sequences and should not be treated like the ASCII NUL
486             // character.
487 
488             // 4.3.1  U+0000 = c0 80
489             roundtrip(false, "\xc0\x80");
490             // 4.3.2  U+0000 = e0 80 80
491             roundtrip(false, "\xe0\x80\x80");
492             // 4.3.3  U+0000 = f0 80 80 80
493             roundtrip(false, "\xf0\x80\x80\x80");
494             // 4.3.4  U+0000 = f8 80 80 80 80
495             roundtrip(false, "\xf8\x80\x80\x80\x80");
496             // 4.3.5  U+0000 = fc 80 80 80 80 80
497             roundtrip(false, "\xfc\x80\x80\x80\x80\x80");
498         }
499     }
500 
501     SECTION("5  Illegal code positions")
502     {
503         // The following UTF-8 sequences should be rejected like malformed
504         // sequences, because they never represent valid ISO 10646 characters and
505         // a UTF-8 decoder that accepts them might introduce security problems
506         // comparable to overlong UTF-8 sequences.
507 
508         SECTION("5.1 Single UTF-16 surrogates")
509         {
510             // 5.1.1  U+D800 = ed a0 80
511             roundtrip(false, "\xed\xa0\x80");
512             // 5.1.2  U+DB7F = ed ad bf
513             roundtrip(false, "\xed\xad\xbf");
514             // 5.1.3  U+DB80 = ed ae 80
515             roundtrip(false, "\xed\xae\x80");
516             // 5.1.4  U+DBFF = ed af bf
517             roundtrip(false, "\xed\xaf\xbf");
518             // 5.1.5  U+DC00 = ed b0 80
519             roundtrip(false, "\xed\xb0\x80");
520             // 5.1.6  U+DF80 = ed be 80
521             roundtrip(false, "\xed\xbe\x80");
522             // 5.1.7  U+DFFF = ed bf bf
523             roundtrip(false, "\xed\xbf\xbf");
524         }
525 
526         SECTION("5.2 Paired UTF-16 surrogates")
527         {
528             // 5.2.1  U+D800 U+DC00 = ed a0 80 ed b0 80
529             roundtrip(false, "\xed\xa0\x80\xed\xb0\x80");
530             // 5.2.2  U+D800 U+DFFF = ed a0 80 ed bf bf
531             roundtrip(false, "\xed\xa0\x80\xed\xbf\xbf");
532             // 5.2.3  U+DB7F U+DC00 = ed ad bf ed b0 80
533             roundtrip(false, "\xed\xad\xbf\xed\xb0\x80");
534             // 5.2.4  U+DB7F U+DFFF = ed ad bf ed bf bf
535             roundtrip(false, "\xed\xad\xbf\xed\xbf\xbf");
536             // 5.2.5  U+DB80 U+DC00 = ed ae 80 ed b0 80
537             roundtrip(false, "\xed\xae\x80\xed\xb0\x80");
538             // 5.2.6  U+DB80 U+DFFF = ed ae 80 ed bf bf
539             roundtrip(false, "\xed\xae\x80\xed\xbf\xbf");
540             // 5.2.7  U+DBFF U+DC00 = ed af bf ed b0 80
541             roundtrip(false, "\xed\xaf\xbf\xed\xb0\x80");
542             // 5.2.8  U+DBFF U+DFFF = ed af bf ed bf bf
543             roundtrip(false, "\xed\xaf\xbf\xed\xbf\xbf");
544         }
545 
546         SECTION("5.3 Noncharacter code positions")
547         {
548             // The following "noncharacters" are "reserved for internal use" by
549             // applications, and according to older versions of the Unicode Standard
550             // "should never be interchanged". Unicode Corrigendum #9 dropped the
551             // latter restriction. Nevertheless, their presence in incoming UTF-8 data
552             // can remain a potential security risk, depending on what use is made of
553             // these codes subsequently. Examples of such internal use:
554             //
555             //  - Some file APIs with 16-bit characters may use the integer value -1
556             //    = U+FFFF to signal an end-of-file (EOF) or error condition.
557             //
558             //  - In some UTF-16 receivers, code point U+FFFE might trigger a
559             //    byte-swap operation (to convert between UTF-16LE and UTF-16BE).
560             //
561             // With such internal use of noncharacters, it may be desirable and safer
562             // to block those code points in UTF-8 decoders, as they should never
563             // occur legitimately in incoming UTF-8 data, and could trigger unsafe
564             // behaviour in subsequent processing.
565 
566             // Particularly problematic noncharacters in 16-bit applications:
567 
568             // 5.3.1  U+FFFE = ef bf be
569             roundtrip(true, "\xef\xbf\xbe");
570             // 5.3.2  U+FFFF = ef bf bf
571             roundtrip(true, "\xef\xbf\xbf");
572 
573             // 5.3.3  U+FDD0 .. U+FDEF
574             roundtrip(true, "\xEF\xB7\x90");
575             roundtrip(true, "\xEF\xB7\x91");
576             roundtrip(true, "\xEF\xB7\x92");
577             roundtrip(true, "\xEF\xB7\x93");
578             roundtrip(true, "\xEF\xB7\x94");
579             roundtrip(true, "\xEF\xB7\x95");
580             roundtrip(true, "\xEF\xB7\x96");
581             roundtrip(true, "\xEF\xB7\x97");
582             roundtrip(true, "\xEF\xB7\x98");
583             roundtrip(true, "\xEF\xB7\x99");
584             roundtrip(true, "\xEF\xB7\x9A");
585             roundtrip(true, "\xEF\xB7\x9B");
586             roundtrip(true, "\xEF\xB7\x9C");
587             roundtrip(true, "\xEF\xB7\x9D");
588             roundtrip(true, "\xEF\xB7\x9E");
589             roundtrip(true, "\xEF\xB7\x9F");
590             roundtrip(true, "\xEF\xB7\xA0");
591             roundtrip(true, "\xEF\xB7\xA1");
592             roundtrip(true, "\xEF\xB7\xA2");
593             roundtrip(true, "\xEF\xB7\xA3");
594             roundtrip(true, "\xEF\xB7\xA4");
595             roundtrip(true, "\xEF\xB7\xA5");
596             roundtrip(true, "\xEF\xB7\xA6");
597             roundtrip(true, "\xEF\xB7\xA7");
598             roundtrip(true, "\xEF\xB7\xA8");
599             roundtrip(true, "\xEF\xB7\xA9");
600             roundtrip(true, "\xEF\xB7\xAA");
601             roundtrip(true, "\xEF\xB7\xAB");
602             roundtrip(true, "\xEF\xB7\xAC");
603             roundtrip(true, "\xEF\xB7\xAD");
604             roundtrip(true, "\xEF\xB7\xAE");
605             roundtrip(true, "\xEF\xB7\xAF");
606 
607             // 5.3.4  U+nFFFE U+nFFFF (for n = 1..10)
608             roundtrip(true, "\xF0\x9F\xBF\xBF");
609             roundtrip(true, "\xF0\xAF\xBF\xBF");
610             roundtrip(true, "\xF0\xBF\xBF\xBF");
611             roundtrip(true, "\xF1\x8F\xBF\xBF");
612             roundtrip(true, "\xF1\x9F\xBF\xBF");
613             roundtrip(true, "\xF1\xAF\xBF\xBF");
614             roundtrip(true, "\xF1\xBF\xBF\xBF");
615             roundtrip(true, "\xF2\x8F\xBF\xBF");
616             roundtrip(true, "\xF2\x9F\xBF\xBF");
617             roundtrip(true, "\xF2\xAF\xBF\xBF");
618         }
619     }
620 }
621