//     __ _____ _____ _____
//  __|  |   __|     |   | |  JSON for Modern C++ (supporting code)
// |  |  |__   |  |  | | | |  version 3.11.2
// |_____|_____|_____|_|___|  https://github.com/nlohmann/json
5 //
6 // SPDX-FileCopyrightText: 2013-2022 Niels Lohmann <https://nlohmann.me>
7 // SPDX-License-Identifier: MIT
8
9 #include "doctest_compatibility.h"
10
11 // for some reason including this after the json header leads to linker errors with VS 2017...
12 #include <locale>
13 #include <nlohmann/json.hpp>
14 using nlohmann::json;
15
16 #include <fstream>
17 #include <sstream>
18 #include <iomanip>
19 #include "make_test_data_available.hpp"
20
skip()21 TEST_CASE("Unicode (1/5)" * doctest::skip())
22 {
23 SECTION("\\uxxxx sequences")
24 {
25 // create an escaped string from a code point
26 const auto codepoint_to_unicode = [](std::size_t cp)
27 {
28 // code points are represented as a six-character sequence: a
29 // reverse solidus, followed by the lowercase letter u, followed
30 // by four hexadecimal digits that encode the character's code
31 // point
32 std::stringstream ss;
33 ss << "\\u" << std::setw(4) << std::setfill('0') << std::hex << cp;
34 return ss.str();
35 };
36
37 SECTION("correct sequences")
38 {
39 // generate all UTF-8 code points; in total, 1112064 code points are
40 // generated: 0x1FFFFF code points - 2048 invalid values between
41 // 0xD800 and 0xDFFF.
42 for (std::size_t cp = 0; cp <= 0x10FFFFu; ++cp)
43 {
44 // string to store the code point as in \uxxxx format
45 std::string json_text = "\"";
46
47 // decide whether to use one or two \uxxxx sequences
48 if (cp < 0x10000u)
49 {
50 // The Unicode standard permanently reserves these code point
51 // values for UTF-16 encoding of the high and low surrogates, and
52 // they will never be assigned a character, so there should be no
53 // reason to encode them. The official Unicode standard says that
54 // no UTF forms, including UTF-16, can encode these code points.
55 if (cp >= 0xD800u && cp <= 0xDFFFu)
56 {
57 // if we would not skip these code points, we would get a
58 // "missing low surrogate" exception
59 continue;
60 }
61
62 // code points in the Basic Multilingual Plane can be
63 // represented with one \uxxxx sequence
64 json_text += codepoint_to_unicode(cp);
65 }
66 else
67 {
68 // To escape an extended character that is not in the Basic
69 // Multilingual Plane, the character is represented as a
70 // 12-character sequence, encoding the UTF-16 surrogate pair
71 const auto codepoint1 = 0xd800u + (((cp - 0x10000u) >> 10) & 0x3ffu);
72 const auto codepoint2 = 0xdc00u + ((cp - 0x10000u) & 0x3ffu);
73 json_text += codepoint_to_unicode(codepoint1) + codepoint_to_unicode(codepoint2);
74 }
75
76 json_text += "\"";
77 CAPTURE(json_text)
78 json _;
79 CHECK_NOTHROW(_ = json::parse(json_text));
80 }
81 }
82
83 SECTION("incorrect sequences")
84 {
85 SECTION("incorrect surrogate values")
86 {
87 json _;
88
89 CHECK_THROWS_WITH_AS(_ = json::parse("\"\\uDC00\\uDC00\""), "[json.exception.parse_error.101] parse error at line 1, column 7: syntax error while parsing value - invalid string: surrogate U+DC00..U+DFFF must follow U+D800..U+DBFF; last read: '\"\\uDC00'", json::parse_error&);
90
91 CHECK_THROWS_WITH_AS(_ = json::parse("\"\\uD7FF\\uDC00\""), "[json.exception.parse_error.101] parse error at line 1, column 13: syntax error while parsing value - invalid string: surrogate U+DC00..U+DFFF must follow U+D800..U+DBFF; last read: '\"\\uD7FF\\uDC00'", json::parse_error&);
92
93 CHECK_THROWS_WITH_AS(_ = json::parse("\"\\uD800]\""), "[json.exception.parse_error.101] parse error at line 1, column 8: syntax error while parsing value - invalid string: surrogate U+D800..U+DBFF must be followed by U+DC00..U+DFFF; last read: '\"\\uD800]'", json::parse_error&);
94
95 CHECK_THROWS_WITH_AS(_ = json::parse("\"\\uD800\\v\""), "[json.exception.parse_error.101] parse error at line 1, column 9: syntax error while parsing value - invalid string: surrogate U+D800..U+DBFF must be followed by U+DC00..U+DFFF; last read: '\"\\uD800\\v'", json::parse_error&);
96
97 CHECK_THROWS_WITH_AS(_ = json::parse("\"\\uD800\\u123\""), "[json.exception.parse_error.101] parse error at line 1, column 13: syntax error while parsing value - invalid string: '\\u' must be followed by 4 hex digits; last read: '\"\\uD800\\u123\"'", json::parse_error&);
98
99 CHECK_THROWS_WITH_AS(_ = json::parse("\"\\uD800\\uDBFF\""), "[json.exception.parse_error.101] parse error at line 1, column 13: syntax error while parsing value - invalid string: surrogate U+D800..U+DBFF must be followed by U+DC00..U+DFFF; last read: '\"\\uD800\\uDBFF'", json::parse_error&);
100
101 CHECK_THROWS_WITH_AS(_ = json::parse("\"\\uD800\\uE000\""), "[json.exception.parse_error.101] parse error at line 1, column 13: syntax error while parsing value - invalid string: surrogate U+D800..U+DBFF must be followed by U+DC00..U+DFFF; last read: '\"\\uD800\\uE000'", json::parse_error&);
102 }
103 }
104
105 #if 0
106 SECTION("incorrect sequences")
107 {
108 SECTION("high surrogate without low surrogate")
109 {
110 // D800..DBFF are high surrogates and must be followed by low
111 // surrogates DC00..DFFF; here, nothing follows
112 for (std::size_t cp = 0xD800u; cp <= 0xDBFFu; ++cp)
113 {
114 std::string json_text = "\"" + codepoint_to_unicode(cp) + "\"";
115 CAPTURE(json_text)
116 CHECK_THROWS_AS(json::parse(json_text), json::parse_error&);
117 }
118 }
119
120 SECTION("high surrogate with wrong low surrogate")
121 {
122 // D800..DBFF are high surrogates and must be followed by low
123 // surrogates DC00..DFFF; here a different sequence follows
124 for (std::size_t cp1 = 0xD800u; cp1 <= 0xDBFFu; ++cp1)
125 {
126 for (std::size_t cp2 = 0x0000u; cp2 <= 0xFFFFu; ++cp2)
127 {
128 if (0xDC00u <= cp2 && cp2 <= 0xDFFFu)
129 {
130 continue;
131 }
132
133 std::string json_text = "\"" + codepoint_to_unicode(cp1) + codepoint_to_unicode(cp2) + "\"";
134 CAPTURE(json_text)
135 CHECK_THROWS_AS(json::parse(json_text), json::parse_error&);
136 }
137 }
138 }
139
140 SECTION("low surrogate without high surrogate")
141 {
142 // low surrogates DC00..DFFF must follow high surrogates; here,
143 // they occur alone
144 for (std::size_t cp = 0xDC00u; cp <= 0xDFFFu; ++cp)
145 {
146 std::string json_text = "\"" + codepoint_to_unicode(cp) + "\"";
147 CAPTURE(json_text)
148 CHECK_THROWS_AS(json::parse(json_text), json::parse_error&);
149 }
150 }
151
152 }
153 #endif
154 }
155
156 SECTION("read all unicode characters")
157 {
158 // read a file with all unicode characters stored as single-character
159 // strings in a JSON array
160 std::ifstream f(TEST_DATA_DIRECTORY "/json_nlohmann_tests/all_unicode.json");
161 json j;
162 CHECK_NOTHROW(f >> j);
163
164 // the array has 1112064 + 1 elements (a terminating "null" value)
165 // Note: 1112064 = 0x1FFFFF code points - 2048 invalid values between
166 // 0xD800 and 0xDFFF.
167 CHECK(j.size() == 1112065);
168
169 SECTION("check JSON Pointers")
170 {
171 for (const auto& s : j)
172 {
173 // skip non-string JSON values
174 if (!s.is_string())
175 {
176 continue;
177 }
178
179 auto ptr = s.get<std::string>();
180
181 // tilde must be followed by 0 or 1
182 if (ptr == "~")
183 {
184 ptr += "0";
185 }
186
187 // JSON Pointers must begin with "/"
188 ptr.insert(0, "/");
189
190 CHECK_NOTHROW(json::json_pointer("/" + ptr));
191
192 // check escape/unescape roundtrip
193 auto escaped = nlohmann::detail::escape(ptr);
194 nlohmann::detail::unescape(escaped);
195 CHECK(escaped == ptr);
196 }
197 }
198 }
199
200 SECTION("ignore byte-order-mark")
201 {
202 SECTION("in a stream")
203 {
204 // read a file with a UTF-8 BOM
205 std::ifstream f(TEST_DATA_DIRECTORY "/json_nlohmann_tests/bom.json");
206 json j;
207 CHECK_NOTHROW(f >> j);
208 }
209
210 SECTION("with an iterator")
211 {
212 std::string i = "\xef\xbb\xbf{\n \"foo\": true\n}";
213 json _;
214 CHECK_NOTHROW(_ = json::parse(i.begin(), i.end()));
215 }
216 }
217
218 SECTION("error for incomplete/wrong BOM")
219 {
220 json _;
221 CHECK_THROWS_AS(_ = json::parse("\xef\xbb"), json::parse_error&);
222 CHECK_THROWS_AS(_ = json::parse("\xef\xbb\xbb"), json::parse_error&);
223 }
224 }
225
226 namespace
227 {
228 void roundtrip(bool success_expected, const std::string& s);
229
roundtrip(bool success_expected,const std::string & s)230 void roundtrip(bool success_expected, const std::string& s)
231 {
232 CAPTURE(s)
233 json _;
234
235 // create JSON string value
236 json j = s;
237 // create JSON text
238 std::string ps = std::string("\"") + s + "\"";
239
240 if (success_expected)
241 {
242 // serialization succeeds
243 CHECK_NOTHROW(j.dump());
244
245 // exclude parse test for U+0000
246 if (s[0] != '\0')
247 {
248 // parsing JSON text succeeds
249 CHECK_NOTHROW(_ = json::parse(ps));
250 }
251
252 // roundtrip succeeds
253 CHECK_NOTHROW(_ = json::parse(j.dump()));
254
255 // after roundtrip, the same string is stored
256 json jr = json::parse(j.dump());
257 CHECK(jr.get<std::string>() == s);
258 }
259 else
260 {
261 // serialization fails
262 CHECK_THROWS_AS(j.dump(), json::type_error&);
263
264 // parsing JSON text fails
265 CHECK_THROWS_AS(_ = json::parse(ps), json::parse_error&);
266 }
267 }
268 } // namespace
269
270 TEST_CASE("Markus Kuhn's UTF-8 decoder capability and stress test")
271 {
272 // Markus Kuhn <http://www.cl.cam.ac.uk/~mgk25/> - 2015-08-28 - CC BY 4.0
273 // http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
274
275 SECTION("1 Some correct UTF-8 text")
276 {
277 roundtrip(true, "κόσμε");
278 }
279
280 SECTION("2 Boundary condition test cases")
281 {
282 SECTION("2.1 First possible sequence of a certain length")
283 {
284 // 2.1.1 1 byte (U-00000000)
285 roundtrip(true, std::string("\0", 1));
286 // 2.1.2 2 bytes (U-00000080)
287 roundtrip(true, "\xc2\x80");
288 // 2.1.3 3 bytes (U-00000800)
289 roundtrip(true, "\xe0\xa0\x80");
290 // 2.1.4 4 bytes (U-00010000)
291 roundtrip(true, "\xf0\x90\x80\x80");
292
293 // 2.1.5 5 bytes (U-00200000)
294 roundtrip(false, "\xF8\x88\x80\x80\x80");
295 // 2.1.6 6 bytes (U-04000000)
296 roundtrip(false, "\xFC\x84\x80\x80\x80\x80");
297 }
298
299 SECTION("2.2 Last possible sequence of a certain length")
300 {
301 // 2.2.1 1 byte (U-0000007F)
302 roundtrip(true, "\x7f");
303 // 2.2.2 2 bytes (U-000007FF)
304 roundtrip(true, "\xdf\xbf");
305 // 2.2.3 3 bytes (U-0000FFFF)
306 roundtrip(true, "\xef\xbf\xbf");
307
308 // 2.2.4 4 bytes (U-001FFFFF)
309 roundtrip(false, "\xF7\xBF\xBF\xBF");
310 // 2.2.5 5 bytes (U-03FFFFFF)
311 roundtrip(false, "\xFB\xBF\xBF\xBF\xBF");
312 // 2.2.6 6 bytes (U-7FFFFFFF)
313 roundtrip(false, "\xFD\xBF\xBF\xBF\xBF\xBF");
314 }
315
316 SECTION("2.3 Other boundary conditions")
317 {
318 // 2.3.1 U-0000D7FF = ed 9f bf
319 roundtrip(true, "\xed\x9f\xbf");
320 // 2.3.2 U-0000E000 = ee 80 80
321 roundtrip(true, "\xee\x80\x80");
322 // 2.3.3 U-0000FFFD = ef bf bd
323 roundtrip(true, "\xef\xbf\xbd");
324 // 2.3.4 U-0010FFFF = f4 8f bf bf
325 roundtrip(true, "\xf4\x8f\xbf\xbf");
326
327 // 2.3.5 U-00110000 = f4 90 80 80
328 roundtrip(false, "\xf4\x90\x80\x80");
329 }
330 }
331
332 SECTION("3 Malformed sequences")
333 {
334 SECTION("3.1 Unexpected continuation bytes")
335 {
336 // Each unexpected continuation byte should be separately signalled as a
337 // malformed sequence of its own.
338
339 // 3.1.1 First continuation byte 0x80
340 roundtrip(false, "\x80");
341 // 3.1.2 Last continuation byte 0xbf
342 roundtrip(false, "\xbf");
343
344 // 3.1.3 2 continuation bytes
345 roundtrip(false, "\x80\xbf");
346 // 3.1.4 3 continuation bytes
347 roundtrip(false, "\x80\xbf\x80");
348 // 3.1.5 4 continuation bytes
349 roundtrip(false, "\x80\xbf\x80\xbf");
350 // 3.1.6 5 continuation bytes
351 roundtrip(false, "\x80\xbf\x80\xbf\x80");
352 // 3.1.7 6 continuation bytes
353 roundtrip(false, "\x80\xbf\x80\xbf\x80\xbf");
354 // 3.1.8 7 continuation bytes
355 roundtrip(false, "\x80\xbf\x80\xbf\x80\xbf\x80");
356
357 // 3.1.9 Sequence of all 64 possible continuation bytes (0x80-0xbf)
358 roundtrip(false, "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf");
359 }
360
361 SECTION("3.2 Lonely start characters")
362 {
363 // 3.2.1 All 32 first bytes of 2-byte sequences (0xc0-0xdf)
364 roundtrip(false, "\xc0 \xc1 \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf");
365 // 3.2.2 All 16 first bytes of 3-byte sequences (0xe0-0xef)
366 roundtrip(false, "\xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee \xef");
367 // 3.2.3 All 8 first bytes of 4-byte sequences (0xf0-0xf7)
368 roundtrip(false, "\xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7");
369 // 3.2.4 All 4 first bytes of 5-byte sequences (0xf8-0xfb)
370 roundtrip(false, "\xf8 \xf9 \xfa \xfb");
371 // 3.2.5 All 2 first bytes of 6-byte sequences (0xfc-0xfd)
372 roundtrip(false, "\xfc \xfd");
373 }
374
375 SECTION("3.3 Sequences with last continuation byte missing")
376 {
377 // All bytes of an incomplete sequence should be signalled as a single
378 // malformed sequence, i.e., you should see only a single replacement
379 // character in each of the next 10 tests. (Characters as in section 2)
380
381 // 3.3.1 2-byte sequence with last byte missing (U+0000)
382 roundtrip(false, "\xc0");
383 // 3.3.2 3-byte sequence with last byte missing (U+0000)
384 roundtrip(false, "\xe0\x80");
385 // 3.3.3 4-byte sequence with last byte missing (U+0000)
386 roundtrip(false, "\xf0\x80\x80");
387 // 3.3.4 5-byte sequence with last byte missing (U+0000)
388 roundtrip(false, "\xf8\x80\x80\x80");
389 // 3.3.5 6-byte sequence with last byte missing (U+0000)
390 roundtrip(false, "\xfc\x80\x80\x80\x80");
391 // 3.3.6 2-byte sequence with last byte missing (U-000007FF)
392 roundtrip(false, "\xdf");
393 // 3.3.7 3-byte sequence with last byte missing (U-0000FFFF)
394 roundtrip(false, "\xef\xbf");
395 // 3.3.8 4-byte sequence with last byte missing (U-001FFFFF)
396 roundtrip(false, "\xf7\xbf\xbf");
397 // 3.3.9 5-byte sequence with last byte missing (U-03FFFFFF)
398 roundtrip(false, "\xfb\xbf\xbf\xbf");
399 // 3.3.10 6-byte sequence with last byte missing (U-7FFFFFFF)
400 roundtrip(false, "\xfd\xbf\xbf\xbf\xbf");
401 }
402
403 SECTION("3.4 Concatenation of incomplete sequences")
404 {
405 // All the 10 sequences of 3.3 concatenated, you should see 10 malformed
406 // sequences being signalled:
407 roundtrip(false, "\xc0\xe0\x80\xf0\x80\x80\xf8\x80\x80\x80\xfc\x80\x80\x80\x80\xdf\xef\xbf\xf7\xbf\xbf\xfb\xbf\xbf\xbf\xfd\xbf\xbf\xbf\xbf");
408 }
409
410 SECTION("3.5 Impossible bytes")
411 {
412 // The following two bytes cannot appear in a correct UTF-8 string
413
414 // 3.5.1 fe
415 roundtrip(false, "\xfe");
416 // 3.5.2 ff
417 roundtrip(false, "\xff");
418 // 3.5.3 fe fe ff ff
419 roundtrip(false, "\xfe\xfe\xff\xff");
420 }
421 }
422
423 SECTION("4 Overlong sequences")
424 {
425 // The following sequences are not malformed according to the letter of
426 // the Unicode 2.0 standard. However, they are longer then necessary and
427 // a correct UTF-8 encoder is not allowed to produce them. A "safe UTF-8
428 // decoder" should reject them just like malformed sequences for two
429 // reasons: (1) It helps to debug applications if overlong sequences are
430 // not treated as valid representations of characters, because this helps
431 // to spot problems more quickly. (2) Overlong sequences provide
432 // alternative representations of characters, that could maliciously be
433 // used to bypass filters that check only for ASCII characters. For
434 // instance, a 2-byte encoded line feed (LF) would not be caught by a
435 // line counter that counts only 0x0a bytes, but it would still be
436 // processed as a line feed by an unsafe UTF-8 decoder later in the
437 // pipeline. From a security point of view, ASCII compatibility of UTF-8
438 // sequences means also, that ASCII characters are *only* allowed to be
439 // represented by ASCII bytes in the range 0x00-0x7f. To ensure this
440 // aspect of ASCII compatibility, use only "safe UTF-8 decoders" that
441 // reject overlong UTF-8 sequences for which a shorter encoding exists.
442
443 SECTION("4.1 Examples of an overlong ASCII character")
444 {
445 // With a safe UTF-8 decoder, all of the following five overlong
446 // representations of the ASCII character slash ("/") should be rejected
447 // like a malformed UTF-8 sequence, for instance by substituting it with
448 // a replacement character. If you see a slash below, you do not have a
449 // safe UTF-8 decoder!
450
451 // 4.1.1 U+002F = c0 af
452 roundtrip(false, "\xc0\xaf");
453 // 4.1.2 U+002F = e0 80 af
454 roundtrip(false, "\xe0\x80\xaf");
455 // 4.1.3 U+002F = f0 80 80 af
456 roundtrip(false, "\xf0\x80\x80\xaf");
457 // 4.1.4 U+002F = f8 80 80 80 af
458 roundtrip(false, "\xf8\x80\x80\x80\xaf");
459 // 4.1.5 U+002F = fc 80 80 80 80 af
460 roundtrip(false, "\xfc\x80\x80\x80\x80\xaf");
461 }
462
463 SECTION("4.2 Maximum overlong sequences")
464 {
465 // Below you see the highest Unicode value that is still resulting in an
466 // overlong sequence if represented with the given number of bytes. This
467 // is a boundary test for safe UTF-8 decoders. All five characters should
468 // be rejected like malformed UTF-8 sequences.
469
470 // 4.2.1 U-0000007F = c1 bf
471 roundtrip(false, "\xc1\xbf");
472 // 4.2.2 U-000007FF = e0 9f bf
473 roundtrip(false, "\xe0\x9f\xbf");
474 // 4.2.3 U-0000FFFF = f0 8f bf bf
475 roundtrip(false, "\xf0\x8f\xbf\xbf");
476 // 4.2.4 U-001FFFFF = f8 87 bf bf bf
477 roundtrip(false, "\xf8\x87\xbf\xbf\xbf");
478 // 4.2.5 U-03FFFFFF = fc 83 bf bf bf bf
479 roundtrip(false, "\xfc\x83\xbf\xbf\xbf\xbf");
480 }
481
482 SECTION("4.3 Overlong representation of the NUL character")
483 {
484 // The following five sequences should also be rejected like malformed
485 // UTF-8 sequences and should not be treated like the ASCII NUL
486 // character.
487
488 // 4.3.1 U+0000 = c0 80
489 roundtrip(false, "\xc0\x80");
490 // 4.3.2 U+0000 = e0 80 80
491 roundtrip(false, "\xe0\x80\x80");
492 // 4.3.3 U+0000 = f0 80 80 80
493 roundtrip(false, "\xf0\x80\x80\x80");
494 // 4.3.4 U+0000 = f8 80 80 80 80
495 roundtrip(false, "\xf8\x80\x80\x80\x80");
496 // 4.3.5 U+0000 = fc 80 80 80 80 80
497 roundtrip(false, "\xfc\x80\x80\x80\x80\x80");
498 }
499 }
500
501 SECTION("5 Illegal code positions")
502 {
503 // The following UTF-8 sequences should be rejected like malformed
504 // sequences, because they never represent valid ISO 10646 characters and
505 // a UTF-8 decoder that accepts them might introduce security problems
506 // comparable to overlong UTF-8 sequences.
507
508 SECTION("5.1 Single UTF-16 surrogates")
509 {
510 // 5.1.1 U+D800 = ed a0 80
511 roundtrip(false, "\xed\xa0\x80");
512 // 5.1.2 U+DB7F = ed ad bf
513 roundtrip(false, "\xed\xad\xbf");
514 // 5.1.3 U+DB80 = ed ae 80
515 roundtrip(false, "\xed\xae\x80");
516 // 5.1.4 U+DBFF = ed af bf
517 roundtrip(false, "\xed\xaf\xbf");
518 // 5.1.5 U+DC00 = ed b0 80
519 roundtrip(false, "\xed\xb0\x80");
520 // 5.1.6 U+DF80 = ed be 80
521 roundtrip(false, "\xed\xbe\x80");
522 // 5.1.7 U+DFFF = ed bf bf
523 roundtrip(false, "\xed\xbf\xbf");
524 }
525
526 SECTION("5.2 Paired UTF-16 surrogates")
527 {
528 // 5.2.1 U+D800 U+DC00 = ed a0 80 ed b0 80
529 roundtrip(false, "\xed\xa0\x80\xed\xb0\x80");
530 // 5.2.2 U+D800 U+DFFF = ed a0 80 ed bf bf
531 roundtrip(false, "\xed\xa0\x80\xed\xbf\xbf");
532 // 5.2.3 U+DB7F U+DC00 = ed ad bf ed b0 80
533 roundtrip(false, "\xed\xad\xbf\xed\xb0\x80");
534 // 5.2.4 U+DB7F U+DFFF = ed ad bf ed bf bf
535 roundtrip(false, "\xed\xad\xbf\xed\xbf\xbf");
536 // 5.2.5 U+DB80 U+DC00 = ed ae 80 ed b0 80
537 roundtrip(false, "\xed\xae\x80\xed\xb0\x80");
538 // 5.2.6 U+DB80 U+DFFF = ed ae 80 ed bf bf
539 roundtrip(false, "\xed\xae\x80\xed\xbf\xbf");
540 // 5.2.7 U+DBFF U+DC00 = ed af bf ed b0 80
541 roundtrip(false, "\xed\xaf\xbf\xed\xb0\x80");
542 // 5.2.8 U+DBFF U+DFFF = ed af bf ed bf bf
543 roundtrip(false, "\xed\xaf\xbf\xed\xbf\xbf");
544 }
545
546 SECTION("5.3 Noncharacter code positions")
547 {
548 // The following "noncharacters" are "reserved for internal use" by
549 // applications, and according to older versions of the Unicode Standard
550 // "should never be interchanged". Unicode Corrigendum #9 dropped the
551 // latter restriction. Nevertheless, their presence in incoming UTF-8 data
552 // can remain a potential security risk, depending on what use is made of
553 // these codes subsequently. Examples of such internal use:
554 //
555 // - Some file APIs with 16-bit characters may use the integer value -1
556 // = U+FFFF to signal an end-of-file (EOF) or error condition.
557 //
558 // - In some UTF-16 receivers, code point U+FFFE might trigger a
559 // byte-swap operation (to convert between UTF-16LE and UTF-16BE).
560 //
561 // With such internal use of noncharacters, it may be desirable and safer
562 // to block those code points in UTF-8 decoders, as they should never
563 // occur legitimately in incoming UTF-8 data, and could trigger unsafe
564 // behaviour in subsequent processing.
565
566 // Particularly problematic noncharacters in 16-bit applications:
567
568 // 5.3.1 U+FFFE = ef bf be
569 roundtrip(true, "\xef\xbf\xbe");
570 // 5.3.2 U+FFFF = ef bf bf
571 roundtrip(true, "\xef\xbf\xbf");
572
573 // 5.3.3 U+FDD0 .. U+FDEF
574 roundtrip(true, "\xEF\xB7\x90");
575 roundtrip(true, "\xEF\xB7\x91");
576 roundtrip(true, "\xEF\xB7\x92");
577 roundtrip(true, "\xEF\xB7\x93");
578 roundtrip(true, "\xEF\xB7\x94");
579 roundtrip(true, "\xEF\xB7\x95");
580 roundtrip(true, "\xEF\xB7\x96");
581 roundtrip(true, "\xEF\xB7\x97");
582 roundtrip(true, "\xEF\xB7\x98");
583 roundtrip(true, "\xEF\xB7\x99");
584 roundtrip(true, "\xEF\xB7\x9A");
585 roundtrip(true, "\xEF\xB7\x9B");
586 roundtrip(true, "\xEF\xB7\x9C");
587 roundtrip(true, "\xEF\xB7\x9D");
588 roundtrip(true, "\xEF\xB7\x9E");
589 roundtrip(true, "\xEF\xB7\x9F");
590 roundtrip(true, "\xEF\xB7\xA0");
591 roundtrip(true, "\xEF\xB7\xA1");
592 roundtrip(true, "\xEF\xB7\xA2");
593 roundtrip(true, "\xEF\xB7\xA3");
594 roundtrip(true, "\xEF\xB7\xA4");
595 roundtrip(true, "\xEF\xB7\xA5");
596 roundtrip(true, "\xEF\xB7\xA6");
597 roundtrip(true, "\xEF\xB7\xA7");
598 roundtrip(true, "\xEF\xB7\xA8");
599 roundtrip(true, "\xEF\xB7\xA9");
600 roundtrip(true, "\xEF\xB7\xAA");
601 roundtrip(true, "\xEF\xB7\xAB");
602 roundtrip(true, "\xEF\xB7\xAC");
603 roundtrip(true, "\xEF\xB7\xAD");
604 roundtrip(true, "\xEF\xB7\xAE");
605 roundtrip(true, "\xEF\xB7\xAF");
606
607 // 5.3.4 U+nFFFE U+nFFFF (for n = 1..10)
608 roundtrip(true, "\xF0\x9F\xBF\xBF");
609 roundtrip(true, "\xF0\xAF\xBF\xBF");
610 roundtrip(true, "\xF0\xBF\xBF\xBF");
611 roundtrip(true, "\xF1\x8F\xBF\xBF");
612 roundtrip(true, "\xF1\x9F\xBF\xBF");
613 roundtrip(true, "\xF1\xAF\xBF\xBF");
614 roundtrip(true, "\xF1\xBF\xBF\xBF");
615 roundtrip(true, "\xF2\x8F\xBF\xBF");
616 roundtrip(true, "\xF2\x9F\xBF\xBF");
617 roundtrip(true, "\xF2\xAF\xBF\xBF");
618 }
619 }
620 }
621