1 //===----------------------------------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8
9 // ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS -D_LIBCPP_ENABLE_CXX26_REMOVED_CODECVT
10 // XFAIL: stdlib=apple-libc++ && target={{.+}}-apple-macosx{{10.9|10.10|10.11|10.12|10.13|10.14|10.15|11.0|12.0|13.0}}
11
12 #include <algorithm>
13 #include <cassert>
14 #include <codecvt>
15 #include <locale>
16
17 #include "test_macros.h"
18
// One test case for a conversion that is expected to succeed completely.
// The conversion is handed `in_size` source elements and `out_size`
// destination elements and must consume / produce exactly that many.
struct test_offsets_ok {
  std::size_t in_size;  // number of source elements made visible to the call
  std::size_t out_size; // number of destination elements made available
};
// One test case for a conversion that is expected to stop early.
// `in_size` / `out_size` describe the ranges handed to the conversion;
// `expected_in_next` / `expected_out_next` are the offsets at which the
// returned "next" pointers must end up.
struct test_offsets_partial {
  std::size_t in_size;           // number of source elements made visible
  std::size_t out_size;          // number of destination elements available
  std::size_t expected_in_next;  // expected offset of the source "next" pointer
  std::size_t expected_out_next; // expected offset of the destination "next" pointer
};
29
// One test case for a conversion that is expected to fail on malformed input.
// Before the conversion runs, the source element at `replace_pos` is
// overwritten with `replace_char`; the remaining fields have the same meaning
// as in test_offsets_partial.
template <typename CharT>
struct test_offsets_error {
  std::size_t in_size;           // number of source elements made visible
  std::size_t out_size;          // number of destination elements available
  std::size_t expected_in_next;  // expected offset of the source "next" pointer
  std::size_t expected_out_next; // expected offset of the destination "next" pointer
  CharT replace_char;            // value written into the source to corrupt it
  std::size_t replace_pos;       // source index that receives `replace_char`
};
39
// Number of elements in the built-in array `x`; usable in constant expressions.
#define array_size(x) (sizeof(x) / sizeof((x)[0]))
41
42 using std::begin;
43 using std::char_traits;
44 using std::codecvt_base;
45 using std::copy;
46 using std::end;
47
48 template <class InternT, class ExternT>
utf8_to_utf32_in_ok(const std::codecvt<InternT,ExternT,mbstate_t> & cvt)49 void utf8_to_utf32_in_ok(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
50 // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP
51 const unsigned char input[] = "b\u0448\uAAAA\U0010AAAA";
52 const char32_t expected[] = {'b', 0x0448, 0xAAAA, 0x10AAAA, 0};
53 static_assert(array_size(input) == 11, "");
54 static_assert(array_size(expected) == 5, "");
55
56 ExternT in[array_size(input)];
57 InternT exp[array_size(expected)];
58 copy(begin(input), end(input), begin(in));
59 copy(begin(expected), end(expected), begin(exp));
60 assert(char_traits<ExternT>::length(in) == 10);
61 assert(char_traits<InternT>::length(exp) == 4);
62 test_offsets_ok offsets[] = {{0, 0}, {1, 1}, {3, 2}, {6, 3}, {10, 4}};
63 for (test_offsets_ok* it = begin(offsets); it != end(offsets); ++it) {
64 test_offsets_ok t = *it;
65 InternT out[array_size(exp) - 1] = {};
66 assert(t.in_size <= array_size(in));
67 assert(t.out_size <= array_size(out));
68 mbstate_t state = {};
69 const ExternT* in_next = nullptr;
70 InternT* out_next = nullptr;
71 codecvt_base::result res = codecvt_base::ok;
72
73 res = cvt.in(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
74 assert(res == cvt.ok);
75 assert(in_next == in + t.in_size);
76 assert(out_next == out + t.out_size);
77 assert(char_traits<InternT>::compare(out, exp, t.out_size) == 0);
78 if (t.out_size < array_size(out))
79 assert(out[t.out_size] == 0);
80
81 state = mbstate_t();
82 int len = cvt.length(state, in, in + t.in_size, t.out_size);
83 assert(len >= 0);
84 assert(static_cast<size_t>(len) == t.in_size);
85 }
86
87 for (test_offsets_ok* it = begin(offsets); it != end(offsets); ++it) {
88 test_offsets_ok t = *it;
89 InternT out[array_size(exp)] = {};
90 assert(t.in_size <= array_size(in));
91 assert(t.out_size <= array_size(out));
92 mbstate_t state = {};
93 const ExternT* in_next = nullptr;
94 InternT* out_next = nullptr;
95 codecvt_base::result res = codecvt_base::ok;
96
97 res = cvt.in(state, in, in + t.in_size, in_next, out, end(out), out_next);
98 assert(res == cvt.ok);
99 assert(in_next == in + t.in_size);
100 assert(out_next == out + t.out_size);
101 assert(char_traits<InternT>::compare(out, exp, t.out_size) == 0);
102 if (t.out_size < array_size(out))
103 assert(out[t.out_size] == 0);
104
105 state = mbstate_t();
106 int len = cvt.length(state, in, in + t.in_size, array_size(out));
107 assert(len >= 0);
108 assert(static_cast<size_t>(len) == t.in_size);
109 }
110 }
111
112 template <class InternT, class ExternT>
utf8_to_utf32_in_partial(const std::codecvt<InternT,ExternT,mbstate_t> & cvt)113 void utf8_to_utf32_in_partial(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
114 // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP
115 const unsigned char input[] = "b\u0448\uAAAA\U0010AAAA";
116 const char32_t expected[] = {'b', 0x0448, 0xAAAA, 0x10AAAA, 0};
117 static_assert(array_size(input) == 11, "");
118 static_assert(array_size(expected) == 5, "");
119
120 ExternT in[array_size(input)];
121 InternT exp[array_size(expected)];
122 copy(begin(input), end(input), begin(in));
123 copy(begin(expected), end(expected), begin(exp));
124 assert(char_traits<ExternT>::length(in) == 10);
125 assert(char_traits<InternT>::length(exp) == 4);
126
127 test_offsets_partial offsets[] = {
128 {1, 0, 0, 0}, // no space for first CP
129
130 {3, 1, 1, 1}, // no space for second CP
131 {2, 2, 1, 1}, // incomplete second CP
132 {2, 1, 1, 1}, // incomplete second CP, and no space for it
133
134 {6, 2, 3, 2}, // no space for third CP
135 {4, 3, 3, 2}, // incomplete third CP
136 {5, 3, 3, 2}, // incomplete third CP
137 {4, 2, 3, 2}, // incomplete third CP, and no space for it
138 {5, 2, 3, 2}, // incomplete third CP, and no space for it
139
140 {10, 3, 6, 3}, // no space for fourth CP
141 {7, 4, 6, 3}, // incomplete fourth CP
142 {8, 4, 6, 3}, // incomplete fourth CP
143 {9, 4, 6, 3}, // incomplete fourth CP
144 {7, 3, 6, 3}, // incomplete fourth CP, and no space for it
145 {8, 3, 6, 3}, // incomplete fourth CP, and no space for it
146 {9, 3, 6, 3}, // incomplete fourth CP, and no space for it
147 };
148
149 for (test_offsets_partial* it = begin(offsets); it != end(offsets); ++it) {
150 test_offsets_partial t = *it;
151 InternT out[array_size(exp) - 1] = {};
152 assert(t.in_size <= array_size(in));
153 assert(t.out_size <= array_size(out));
154 assert(t.expected_in_next <= t.in_size);
155 assert(t.expected_out_next <= t.out_size);
156 mbstate_t state = {};
157 const ExternT* in_next = nullptr;
158 InternT* out_next = nullptr;
159 codecvt_base::result res = codecvt_base::ok;
160
161 res = cvt.in(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
162 assert(res == cvt.partial);
163 assert(in_next == in + t.expected_in_next);
164 assert(out_next == out + t.expected_out_next);
165 assert(char_traits<InternT>::compare(out, exp, t.expected_out_next) == 0);
166 if (t.expected_out_next < array_size(out))
167 assert(out[t.expected_out_next] == 0);
168
169 state = mbstate_t();
170 int len = cvt.length(state, in, in + t.in_size, t.out_size);
171 assert(len >= 0);
172 assert(static_cast<size_t>(len) == t.expected_in_next);
173 }
174 }
175
176 template <class InternT, class ExternT>
utf8_to_utf32_in_error(const std::codecvt<InternT,ExternT,mbstate_t> & cvt)177 void utf8_to_utf32_in_error(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
178 // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP, 4-byte CP
179 const unsigned char input[] = "b\u0448\uD700\U0010AAAA";
180 const char32_t expected[] = {'b', 0x0448, 0xD700, 0x10AAAA, 0};
181 static_assert(array_size(input) == 11, "");
182 static_assert(array_size(expected) == 5, "");
183
184 ExternT in[array_size(input)];
185 InternT exp[array_size(expected)];
186 copy(begin(input), end(input), begin(in));
187 copy(begin(expected), end(expected), begin(exp));
188 assert(char_traits<ExternT>::length(in) == 10);
189 assert(char_traits<InternT>::length(exp) == 4);
190
191 // There are 5 classes of errors in UTF-8 decoding
192 // 1. Missing leading byte
193 // 2. Missing trailing byte
194 // 3. Surrogate CP
195 // 4. Overlong sequence
196 // 5. CP out of Unicode range
197 test_offsets_error<unsigned char> offsets[] = {
198
199 // 1. Missing leading byte. We will replace the leading byte with
200 // non-leading byte, such as a byte that is always invalid or a trailing
201 // byte.
202
203 // replace leading byte with invalid byte
204 {1, 4, 0, 0, 0xFF, 0},
205 {3, 4, 1, 1, 0xFF, 1},
206 {6, 4, 3, 2, 0xFF, 3},
207 {10, 4, 6, 3, 0xFF, 6},
208
209 // replace leading byte with trailing byte
210 {1, 4, 0, 0, 0b10101010, 0},
211 {3, 4, 1, 1, 0b10101010, 1},
212 {6, 4, 3, 2, 0b10101010, 3},
213 {10, 4, 6, 3, 0b10101010, 6},
214
215 // 2. Missing trailing byte. We will replace the trailing byte with
216 // non-trailing byte, such as a byte that is always invalid or a leading
217 // byte (simple ASCII byte in our case).
218
219 // replace first trailing byte with ASCII byte
220 {3, 4, 1, 1, 'z', 2},
221 {6, 4, 3, 2, 'z', 4},
222 {10, 4, 6, 3, 'z', 7},
223
224 // replace first trailing byte with invalid byte
225 {3, 4, 1, 1, 0xFF, 2},
226 {6, 4, 3, 2, 0xFF, 4},
227 {10, 4, 6, 3, 0xFF, 7},
228
229 // replace second trailing byte with ASCII byte
230 {6, 4, 3, 2, 'z', 5},
231 {10, 4, 6, 3, 'z', 8},
232
233 // replace second trailing byte with invalid byte
234 {6, 4, 3, 2, 0xFF, 5},
235 {10, 4, 6, 3, 0xFF, 8},
236
237 // replace third trailing byte
238 {10, 4, 6, 3, 'z', 9},
239 {10, 4, 6, 3, 0xFF, 9},
240
241 // 2.1 The following test-cases raise doubt whether error or partial should
242 // be returned. For example, we have 4-byte sequence with valid leading
243 // byte. If we hide the last byte we need to return partial. But, if the
244 // second or third byte, which are visible to the call to codecvt, are
245 // malformed then error should be returned.
246
247 // replace first trailing byte with ASCII byte, also incomplete at end
248 {5, 4, 3, 2, 'z', 4},
249 {8, 4, 6, 3, 'z', 7},
250 {9, 4, 6, 3, 'z', 7},
251
252 // replace first trailing byte with invalid byte, also incomplete at end
253 {5, 4, 3, 2, 0xFF, 4},
254 {8, 4, 6, 3, 0xFF, 7},
255 {9, 4, 6, 3, 0xFF, 7},
256
257 // replace second trailing byte with ASCII byte, also incomplete at end
258 {9, 4, 6, 3, 'z', 8},
259
260 // replace second trailing byte with invalid byte, also incomplete at end
261 {9, 4, 6, 3, 0xFF, 8},
262
263 // 3. Surrogate CP. We modify the second byte (first trailing) of the 3-byte
264 // CP U+D700
265 {6, 4, 3, 2, 0b10100000, 4}, // turn U+D700 into U+D800
266 {6, 4, 3, 2, 0b10101100, 4}, // turn U+D700 into U+DB00
267 {6, 4, 3, 2, 0b10110000, 4}, // turn U+D700 into U+DC00
268 {6, 4, 3, 2, 0b10111100, 4}, // turn U+D700 into U+DF00
269
270 // 4. Overlong sequence. The CPs in the input are chosen such as modifying
271 // just the leading byte is enough to make them overlong, i.e. for the
272 // 3-byte and 4-byte CP the second byte (first trailing) has enough leading
273 // zeroes.
274 {3, 4, 1, 1, 0b11000000, 1}, // make the 2-byte CP overlong
275 {3, 4, 1, 1, 0b11000001, 1}, // make the 2-byte CP overlong
276 {6, 4, 3, 2, 0b11100000, 3}, // make the 3-byte CP overlong
277 {10, 4, 6, 3, 0b11110000, 6}, // make the 4-byte CP overlong
278
279 // 5. CP above range
280 // turn U+10AAAA into U+14AAAA by changing its leading byte
281 {10, 4, 6, 3, 0b11110101, 6},
282 // turn U+10AAAA into U+11AAAA by changing its 2nd byte
283 {10, 4, 6, 3, 0b10011010, 7},
284 };
285 for (test_offsets_error<unsigned char>* it = begin(offsets); it != end(offsets); ++it) {
286 test_offsets_error<unsigned char> t = *it;
287 InternT out[array_size(exp) - 1] = {};
288 assert(t.in_size <= array_size(in));
289 assert(t.out_size <= array_size(out));
290 assert(t.expected_in_next <= t.in_size);
291 assert(t.expected_out_next <= t.out_size);
292 ExternT old_char = in[t.replace_pos];
293 in[t.replace_pos] = t.replace_char;
294
295 mbstate_t state = {};
296 const ExternT* in_next = nullptr;
297 InternT* out_next = nullptr;
298 codecvt_base::result res = codecvt_base::ok;
299
300 res = cvt.in(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
301 assert(res == cvt.error);
302 assert(in_next == in + t.expected_in_next);
303 assert(out_next == out + t.expected_out_next);
304 assert(char_traits<InternT>::compare(out, exp, t.expected_out_next) == 0);
305 if (t.expected_out_next < array_size(out))
306 assert(out[t.expected_out_next] == 0);
307
308 state = mbstate_t();
309 int len = cvt.length(state, in, in + t.in_size, t.out_size);
310 assert(len >= 0);
311 assert(static_cast<size_t>(len) == t.expected_in_next);
312
313 in[t.replace_pos] = old_char;
314 }
315 }
316
// Runs every UTF-8 -> UTF-32 check for codecvt::in on `cvt`: the successful
// conversions first, then the partial ones, then the malformed-input ones.
template <class InternT, class ExternT>
void utf8_to_utf32_in(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
  utf8_to_utf32_in_ok(cvt);
  utf8_to_utf32_in_partial(cvt);
  utf8_to_utf32_in_error(cvt);
}
323
324 template <class InternT, class ExternT>
utf32_to_utf8_out_ok(const std::codecvt<InternT,ExternT,mbstate_t> & cvt)325 void utf32_to_utf8_out_ok(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
326 // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP
327 const char32_t input[] = {'b', 0x0448, 0xAAAA, 0x10AAAA, 0};
328 const unsigned char expected[] = "b\u0448\uAAAA\U0010AAAA";
329 static_assert(array_size(input) == 5, "");
330 static_assert(array_size(expected) == 11, "");
331
332 InternT in[array_size(input)];
333 ExternT exp[array_size(expected)];
334 copy(begin(input), end(input), begin(in));
335 copy(begin(expected), end(expected), begin(exp));
336 assert(char_traits<InternT>::length(in) == 4);
337 assert(char_traits<ExternT>::length(exp) == 10);
338
339 test_offsets_ok offsets[] = {{0, 0}, {1, 1}, {2, 3}, {3, 6}, {4, 10}};
340 for (test_offsets_ok* it = begin(offsets); it != end(offsets); ++it) {
341 test_offsets_ok t = *it;
342 ExternT out[array_size(exp) - 1] = {};
343 assert(t.in_size <= array_size(in));
344 assert(t.out_size <= array_size(out));
345 mbstate_t state = {};
346 const InternT* in_next = nullptr;
347 ExternT* out_next = nullptr;
348 codecvt_base::result res = codecvt_base::ok;
349
350 res = cvt.out(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
351 assert(res == cvt.ok);
352 assert(in_next == in + t.in_size);
353 assert(out_next == out + t.out_size);
354 assert(char_traits<ExternT>::compare(out, exp, t.out_size) == 0);
355 if (t.out_size < array_size(out))
356 assert(out[t.out_size] == 0);
357 }
358 }
359
360 template <class InternT, class ExternT>
utf32_to_utf8_out_partial(const std::codecvt<InternT,ExternT,mbstate_t> & cvt)361 void utf32_to_utf8_out_partial(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
362 // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP
363 const char32_t input[] = {'b', 0x0448, 0xAAAA, 0x10AAAA, 0};
364 const unsigned char expected[] = "b\u0448\uAAAA\U0010AAAA";
365 static_assert(array_size(input) == 5, "");
366 static_assert(array_size(expected) == 11, "");
367
368 InternT in[array_size(input)];
369 ExternT exp[array_size(expected)];
370 copy(begin(input), end(input), begin(in));
371 copy(begin(expected), end(expected), begin(exp));
372 assert(char_traits<InternT>::length(in) == 4);
373 assert(char_traits<ExternT>::length(exp) == 10);
374
375 test_offsets_partial offsets[] = {
376 {1, 0, 0, 0}, // no space for first CP
377
378 {2, 1, 1, 1}, // no space for second CP
379 {2, 2, 1, 1}, // no space for second CP
380
381 {3, 3, 2, 3}, // no space for third CP
382 {3, 4, 2, 3}, // no space for third CP
383 {3, 5, 2, 3}, // no space for third CP
384
385 {4, 6, 3, 6}, // no space for fourth CP
386 {4, 7, 3, 6}, // no space for fourth CP
387 {4, 8, 3, 6}, // no space for fourth CP
388 {4, 9, 3, 6}, // no space for fourth CP
389 };
390 for (test_offsets_partial* it = begin(offsets); it != end(offsets); ++it) {
391 test_offsets_partial t = *it;
392 ExternT out[array_size(exp) - 1] = {};
393 assert(t.in_size <= array_size(in));
394 assert(t.out_size <= array_size(out));
395 assert(t.expected_in_next <= t.in_size);
396 assert(t.expected_out_next <= t.out_size);
397 mbstate_t state = {};
398 const InternT* in_next = nullptr;
399 ExternT* out_next = nullptr;
400 codecvt_base::result res = codecvt_base::ok;
401
402 res = cvt.out(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
403 assert(res == cvt.partial);
404 assert(in_next == in + t.expected_in_next);
405 assert(out_next == out + t.expected_out_next);
406 assert(char_traits<ExternT>::compare(out, exp, t.expected_out_next) == 0);
407 if (t.expected_out_next < array_size(out))
408 assert(out[t.expected_out_next] == 0);
409 }
410 }
411
412 template <class InternT, class ExternT>
utf32_to_utf8_out_error(const std::codecvt<InternT,ExternT,mbstate_t> & cvt)413 void utf32_to_utf8_out_error(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
414 // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP
415 const char32_t input[] = {'b', 0x0448, 0xAAAA, 0x10AAAA, 0};
416 const unsigned char expected[] = "b\u0448\uAAAA\U0010AAAA";
417 static_assert(array_size(input) == 5, "");
418 static_assert(array_size(expected) == 11, "");
419
420 InternT in[array_size(input)];
421 ExternT exp[array_size(expected)];
422 copy(begin(input), end(input), begin(in));
423 copy(begin(expected), end(expected), begin(exp));
424 assert(char_traits<InternT>::length(in) == 4);
425 assert(char_traits<ExternT>::length(exp) == 10);
426
427 test_offsets_error<InternT> offsets[] = {
428
429 // Surrogate CP
430 {4, 10, 0, 0, 0xD800, 0},
431 {4, 10, 1, 1, 0xDBFF, 1},
432 {4, 10, 2, 3, 0xDC00, 2},
433 {4, 10, 3, 6, 0xDFFF, 3},
434
435 // CP out of range
436 {4, 10, 0, 0, 0x00110000, 0},
437 {4, 10, 1, 1, 0x00110000, 1},
438 {4, 10, 2, 3, 0x00110000, 2},
439 {4, 10, 3, 6, 0x00110000, 3}};
440
441 for (test_offsets_error<InternT>* it = begin(offsets); it != end(offsets); ++it) {
442 test_offsets_error<InternT> t = *it;
443 ExternT out[array_size(exp) - 1] = {};
444 assert(t.in_size <= array_size(in));
445 assert(t.out_size <= array_size(out));
446 assert(t.expected_in_next <= t.in_size);
447 assert(t.expected_out_next <= t.out_size);
448 InternT old_char = in[t.replace_pos];
449 in[t.replace_pos] = t.replace_char;
450
451 mbstate_t state = {};
452 const InternT* in_next = nullptr;
453 ExternT* out_next = nullptr;
454 codecvt_base::result res = codecvt_base::ok;
455
456 res = cvt.out(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
457 assert(res == cvt.error);
458 assert(in_next == in + t.expected_in_next);
459 assert(out_next == out + t.expected_out_next);
460 assert(char_traits<ExternT>::compare(out, exp, t.expected_out_next) == 0);
461 if (t.expected_out_next < array_size(out))
462 assert(out[t.expected_out_next] == 0);
463
464 in[t.replace_pos] = old_char;
465 }
466 }
467
// Runs every UTF-32 -> UTF-8 check for codecvt::out on `cvt`: the successful
// conversions first, then the partial ones, then the invalid-input ones.
template <class InternT, class ExternT>
void utf32_to_utf8_out(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
  utf32_to_utf8_out_ok(cvt);
  utf32_to_utf8_out_partial(cvt);
  utf32_to_utf8_out_error(cvt);
}
474
// Exercises a UTF-8 <-> UTF-32 codecvt facet in both directions: decoding via
// codecvt::in (including codecvt::length) followed by encoding via
// codecvt::out.
template <class InternT, class ExternT>
void test_utf8_utf32_cvt(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
  utf8_to_utf32_in(cvt);
  utf32_to_utf8_out(cvt);
}
480
481 template <class InternT, class ExternT>
utf8_to_utf16_in_ok(const std::codecvt<InternT,ExternT,mbstate_t> & cvt)482 void utf8_to_utf16_in_ok(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
483 // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP
484 const unsigned char input[] = "b\u0448\uAAAA\U0010AAAA";
485 const char16_t expected[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
486 static_assert(array_size(input) == 11, "");
487 static_assert(array_size(expected) == 6, "");
488
489 ExternT in[array_size(input)];
490 InternT exp[array_size(expected)];
491 copy(begin(input), end(input), begin(in));
492 copy(begin(expected), end(expected), begin(exp));
493 assert(char_traits<ExternT>::length(in) == 10);
494 assert(char_traits<InternT>::length(exp) == 5);
495
496 test_offsets_ok offsets[] = {{0, 0}, {1, 1}, {3, 2}, {6, 3}, {10, 5}};
497 for (test_offsets_ok* it = begin(offsets); it != end(offsets); ++it) {
498 test_offsets_ok t = *it;
499 InternT out[array_size(exp) - 1] = {};
500 assert(t.in_size <= array_size(in));
501 assert(t.out_size <= array_size(out));
502 mbstate_t state = {};
503 const ExternT* in_next = nullptr;
504 InternT* out_next = nullptr;
505 codecvt_base::result res = codecvt_base::ok;
506
507 res = cvt.in(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
508 assert(res == cvt.ok);
509 assert(in_next == in + t.in_size);
510 assert(out_next == out + t.out_size);
511 assert(char_traits<InternT>::compare(out, exp, t.out_size) == 0);
512 if (t.out_size < array_size(out))
513 assert(out[t.out_size] == 0);
514
515 state = mbstate_t();
516 int len = cvt.length(state, in, in + t.in_size, t.out_size);
517 assert(len >= 0);
518 assert(static_cast<size_t>(len) == t.in_size);
519 }
520
521 for (test_offsets_ok* it = begin(offsets); it != end(offsets); ++it) {
522 test_offsets_ok t = *it;
523 InternT out[array_size(exp)] = {};
524 assert(t.in_size <= array_size(in));
525 assert(t.out_size <= array_size(out));
526 mbstate_t state = {};
527 const ExternT* in_next = nullptr;
528 InternT* out_next = nullptr;
529 codecvt_base::result res = codecvt_base::ok;
530
531 res = cvt.in(state, in, in + t.in_size, in_next, out, end(out), out_next);
532 assert(res == cvt.ok);
533 assert(in_next == in + t.in_size);
534 assert(out_next == out + t.out_size);
535 assert(char_traits<InternT>::compare(out, exp, t.out_size) == 0);
536 if (t.out_size < array_size(out))
537 assert(out[t.out_size] == 0);
538
539 state = mbstate_t();
540 int len = cvt.length(state, in, in + t.in_size, array_size(out));
541 assert(len >= 0);
542 assert(static_cast<size_t>(len) == t.in_size);
543 }
544 }
545
546 template <class InternT, class ExternT>
utf8_to_utf16_in_partial(const std::codecvt<InternT,ExternT,mbstate_t> & cvt)547 void utf8_to_utf16_in_partial(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
548 // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP
549 const unsigned char input[] = "b\u0448\uAAAA\U0010AAAA";
550 const char16_t expected[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
551 static_assert(array_size(input) == 11, "");
552 static_assert(array_size(expected) == 6, "");
553
554 ExternT in[array_size(input)];
555 InternT exp[array_size(expected)];
556 copy(begin(input), end(input), begin(in));
557 copy(begin(expected), end(expected), begin(exp));
558 assert(char_traits<ExternT>::length(in) == 10);
559 assert(char_traits<InternT>::length(exp) == 5);
560
561 test_offsets_partial offsets[] = {
562 {1, 0, 0, 0}, // no space for first CP
563
564 {3, 1, 1, 1}, // no space for second CP
565 {2, 2, 1, 1}, // incomplete second CP
566 {2, 1, 1, 1}, // incomplete second CP, and no space for it
567
568 {6, 2, 3, 2}, // no space for third CP
569 {4, 3, 3, 2}, // incomplete third CP
570 {5, 3, 3, 2}, // incomplete third CP
571 {4, 2, 3, 2}, // incomplete third CP, and no space for it
572 {5, 2, 3, 2}, // incomplete third CP, and no space for it
573
574 {10, 3, 6, 3}, // no space for fourth CP
575 {10, 4, 6, 3}, // no space for fourth CP
576 {7, 5, 6, 3}, // incomplete fourth CP
577 {8, 5, 6, 3}, // incomplete fourth CP
578 {9, 5, 6, 3}, // incomplete fourth CP
579 {7, 3, 6, 3}, // incomplete fourth CP, and no space for it
580 {8, 3, 6, 3}, // incomplete fourth CP, and no space for it
581 {9, 3, 6, 3}, // incomplete fourth CP, and no space for it
582 {7, 4, 6, 3}, // incomplete fourth CP, and no space for it
583 {8, 4, 6, 3}, // incomplete fourth CP, and no space for it
584 {9, 4, 6, 3}, // incomplete fourth CP, and no space for it
585
586 };
587
588 for (test_offsets_partial* it = begin(offsets); it != end(offsets); ++it) {
589 test_offsets_partial t = *it;
590 InternT out[array_size(exp) - 1] = {};
591 assert(t.in_size <= array_size(in));
592 assert(t.out_size <= array_size(out));
593 assert(t.expected_in_next <= t.in_size);
594 assert(t.expected_out_next <= t.out_size);
595 mbstate_t state = {};
596 const ExternT* in_next = nullptr;
597 InternT* out_next = nullptr;
598 codecvt_base::result res = codecvt_base::ok;
599
600 res = cvt.in(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
601 assert(res == cvt.partial);
602 assert(in_next == in + t.expected_in_next);
603 assert(out_next == out + t.expected_out_next);
604 assert(char_traits<InternT>::compare(out, exp, t.expected_out_next) == 0);
605 if (t.expected_out_next < array_size(out))
606 assert(out[t.expected_out_next] == 0);
607
608 state = mbstate_t();
609 int len = cvt.length(state, in, in + t.in_size, t.out_size);
610 assert(len >= 0);
611 assert(static_cast<size_t>(len) == t.expected_in_next);
612 }
613 }
614
615 template <class InternT, class ExternT>
utf8_to_utf16_in_error(const std::codecvt<InternT,ExternT,mbstate_t> & cvt)616 void utf8_to_utf16_in_error(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
617 // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP, 4-byte CP
618 const unsigned char input[] = "b\u0448\uD700\U0010AAAA";
619 const char16_t expected[] = {'b', 0x0448, 0xD700, 0xDBEA, 0xDEAA, 0};
620 static_assert(array_size(input) == 11, "");
621 static_assert(array_size(expected) == 6, "");
622
623 ExternT in[array_size(input)];
624 InternT exp[array_size(expected)];
625 copy(begin(input), end(input), begin(in));
626 copy(begin(expected), end(expected), begin(exp));
627 assert(char_traits<ExternT>::length(in) == 10);
628 assert(char_traits<InternT>::length(exp) == 5);
629
630 // There are 5 classes of errors in UTF-8 decoding
631 // 1. Missing leading byte
632 // 2. Missing trailing byte
633 // 3. Surrogate CP
634 // 4. Overlong sequence
635 // 5. CP out of Unicode range
636 test_offsets_error<unsigned char> offsets[] = {
637
638 // 1. Missing leading byte. We will replace the leading byte with
639 // non-leading byte, such as a byte that is always invalid or a trailing
640 // byte.
641
642 // replace leading byte with invalid byte
643 {1, 5, 0, 0, 0xFF, 0},
644 {3, 5, 1, 1, 0xFF, 1},
645 {6, 5, 3, 2, 0xFF, 3},
646 {10, 5, 6, 3, 0xFF, 6},
647
648 // replace leading byte with trailing byte
649 {1, 5, 0, 0, 0b10101010, 0},
650 {3, 5, 1, 1, 0b10101010, 1},
651 {6, 5, 3, 2, 0b10101010, 3},
652 {10, 5, 6, 3, 0b10101010, 6},
653
654 // 2. Missing trailing byte. We will replace the trailing byte with
655 // non-trailing byte, such as a byte that is always invalid or a leading
656 // byte (simple ASCII byte in our case).
657
658 // replace first trailing byte with ASCII byte
659 {3, 5, 1, 1, 'z', 2},
660 {6, 5, 3, 2, 'z', 4},
661 {10, 5, 6, 3, 'z', 7},
662
663 // replace first trailing byte with invalid byte
664 {3, 5, 1, 1, 0xFF, 2},
665 {6, 5, 3, 2, 0xFF, 4},
666 {10, 5, 6, 3, 0xFF, 7},
667
668 // replace second trailing byte with ASCII byte
669 {6, 5, 3, 2, 'z', 5},
670 {10, 5, 6, 3, 'z', 8},
671
672 // replace second trailing byte with invalid byte
673 {6, 5, 3, 2, 0xFF, 5},
674 {10, 5, 6, 3, 0xFF, 8},
675
676 // replace third trailing byte
677 {10, 5, 6, 3, 'z', 9},
678 {10, 5, 6, 3, 0xFF, 9},
679
680 // 2.1 The following test-cases raise doubt whether error or partial should
681 // be returned. For example, we have 4-byte sequence with valid leading
682 // byte. If we hide the last byte we need to return partial. But, if the
683 // second or third byte, which are visible to the call to codecvt, are
684 // malformed then error should be returned.
685
686 // replace first trailing byte with ASCII byte, also incomplete at end
687 {5, 5, 3, 2, 'z', 4},
688 {8, 5, 6, 3, 'z', 7},
689 {9, 5, 6, 3, 'z', 7},
690
691 // replace first trailing byte with invalid byte, also incomplete at end
692 {5, 5, 3, 2, 0xFF, 4},
693 {8, 5, 6, 3, 0xFF, 7},
694 {9, 5, 6, 3, 0xFF, 7},
695
696 // replace second trailing byte with ASCII byte, also incomplete at end
697 {9, 5, 6, 3, 'z', 8},
698
699 // replace second trailing byte with invalid byte, also incomplete at end
700 {9, 5, 6, 3, 0xFF, 8},
701
702 // 3. Surrogate CP. We modify the second byte (first trailing) of the 3-byte
703 // CP U+D700
704 {6, 5, 3, 2, 0b10100000, 4}, // turn U+D700 into U+D800
705 {6, 5, 3, 2, 0b10101100, 4}, // turn U+D700 into U+DB00
706 {6, 5, 3, 2, 0b10110000, 4}, // turn U+D700 into U+DC00
707 {6, 5, 3, 2, 0b10111100, 4}, // turn U+D700 into U+DF00
708
709 // 4. Overlong sequence. The CPs in the input are chosen such as modifying
710 // just the leading byte is enough to make them overlong, i.e. for the
711 // 3-byte and 4-byte CP the second byte (first trailing) has enough leading
712 // zeroes.
713 {3, 5, 1, 1, 0b11000000, 1}, // make the 2-byte CP overlong
714 {3, 5, 1, 1, 0b11000001, 1}, // make the 2-byte CP overlong
715 {6, 5, 3, 2, 0b11100000, 3}, // make the 3-byte CP overlong
716 {10, 5, 6, 3, 0b11110000, 6}, // make the 4-byte CP overlong
717
718 // 5. CP above range
719 // turn U+10AAAA into U+14AAAA by changing its leading byte
720 {10, 5, 6, 3, 0b11110101, 6},
721 // turn U+10AAAA into U+11AAAA by changing its 2nd byte
722 {10, 5, 6, 3, 0b10011010, 7},
723 };
724 for (test_offsets_error<unsigned char>* it = begin(offsets); it != end(offsets); ++it) {
725 test_offsets_error<unsigned char> t = *it;
726 InternT out[array_size(exp) - 1] = {};
727 assert(t.in_size <= array_size(in));
728 assert(t.out_size <= array_size(out));
729 assert(t.expected_in_next <= t.in_size);
730 assert(t.expected_out_next <= t.out_size);
731 ExternT old_char = in[t.replace_pos];
732 in[t.replace_pos] = t.replace_char;
733
734 mbstate_t state = {};
735 const ExternT* in_next = nullptr;
736 InternT* out_next = nullptr;
737 codecvt_base::result res = codecvt_base::ok;
738
739 res = cvt.in(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
740 assert(res == cvt.error);
741 assert(in_next == in + t.expected_in_next);
742 assert(out_next == out + t.expected_out_next);
743 assert(char_traits<InternT>::compare(out, exp, t.expected_out_next) == 0);
744 if (t.expected_out_next < array_size(out))
745 assert(out[t.expected_out_next] == 0);
746
747 state = mbstate_t();
748 int len = cvt.length(state, in, in + t.in_size, t.out_size);
749 assert(len >= 0);
750 assert(static_cast<size_t>(len) == t.expected_in_next);
751
752 in[t.replace_pos] = old_char;
753 }
754 }
755
// Runs every UTF-8 -> UTF-16 check for codecvt::in on `cvt`: the successful
// conversions first, then the partial ones, then the malformed-input ones.
template <class InternT, class ExternT>
void utf8_to_utf16_in(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
  utf8_to_utf16_in_ok(cvt);
  utf8_to_utf16_in_partial(cvt);
  utf8_to_utf16_in_error(cvt);
}
762
763 template <class InternT, class ExternT>
utf16_to_utf8_out_ok(const std::codecvt<InternT,ExternT,mbstate_t> & cvt)764 void utf16_to_utf8_out_ok(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
765 // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP
766 const char16_t input[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
767 const unsigned char expected[] = "b\u0448\uAAAA\U0010AAAA";
768 static_assert(array_size(input) == 6, "");
769 static_assert(array_size(expected) == 11, "");
770
771 InternT in[array_size(input)];
772 ExternT exp[array_size(expected)];
773 copy(begin(input), end(input), begin(in));
774 copy(begin(expected), end(expected), begin(exp));
775 assert(char_traits<InternT>::length(in) == 5);
776 assert(char_traits<ExternT>::length(exp) == 10);
777
778 test_offsets_ok offsets[] = {{0, 0}, {1, 1}, {2, 3}, {3, 6}, {5, 10}};
779 for (test_offsets_ok* it = begin(offsets); it != end(offsets); ++it) {
780 test_offsets_ok t = *it;
781 ExternT out[array_size(exp) - 1] = {};
782 assert(t.in_size <= array_size(in));
783 assert(t.out_size <= array_size(out));
784 mbstate_t state = {};
785 const InternT* in_next = nullptr;
786 ExternT* out_next = nullptr;
787 codecvt_base::result res = codecvt_base::ok;
788
789 res = cvt.out(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
790 assert(res == cvt.ok);
791 assert(in_next == in + t.in_size);
792 assert(out_next == out + t.out_size);
793 assert(char_traits<ExternT>::compare(out, exp, t.out_size) == 0);
794 if (t.out_size < array_size(out))
795 assert(out[t.out_size] == 0);
796 }
797 }
798
799 template <class InternT, class ExternT>
utf16_to_utf8_out_partial(const std::codecvt<InternT,ExternT,mbstate_t> & cvt)800 void utf16_to_utf8_out_partial(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
801 // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP
802 const char16_t input[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
803 const unsigned char expected[] = "b\u0448\uAAAA\U0010AAAA";
804 static_assert(array_size(input) == 6, "");
805 static_assert(array_size(expected) == 11, "");
806
807 InternT in[array_size(input)];
808 ExternT exp[array_size(expected)];
809 copy(begin(input), end(input), begin(in));
810 copy(begin(expected), end(expected), begin(exp));
811 assert(char_traits<InternT>::length(in) == 5);
812 assert(char_traits<ExternT>::length(exp) == 10);
813
814 test_offsets_partial offsets[] = {
815 {1, 0, 0, 0}, // no space for first CP
816
817 {2, 1, 1, 1}, // no space for second CP
818 {2, 2, 1, 1}, // no space for second CP
819
820 {3, 3, 2, 3}, // no space for third CP
821 {3, 4, 2, 3}, // no space for third CP
822 {3, 5, 2, 3}, // no space for third CP
823
824 {5, 6, 3, 6}, // no space for fourth CP
825 {5, 7, 3, 6}, // no space for fourth CP
826 {5, 8, 3, 6}, // no space for fourth CP
827 {5, 9, 3, 6}, // no space for fourth CP
828
829 {4, 10, 3, 6}, // incomplete fourth CP
830
831 {4, 6, 3, 6}, // incomplete fourth CP, and no space for it
832 {4, 7, 3, 6}, // incomplete fourth CP, and no space for it
833 {4, 8, 3, 6}, // incomplete fourth CP, and no space for it
834 {4, 9, 3, 6}, // incomplete fourth CP, and no space for it
835 };
836 for (test_offsets_partial* it = begin(offsets); it != end(offsets); ++it) {
837 test_offsets_partial t = *it;
838 ExternT out[array_size(exp) - 1] = {};
839 assert(t.in_size <= array_size(in));
840 assert(t.out_size <= array_size(out));
841 assert(t.expected_in_next <= t.in_size);
842 assert(t.expected_out_next <= t.out_size);
843 mbstate_t state = {};
844 const InternT* in_next = nullptr;
845 ExternT* out_next = nullptr;
846 codecvt_base::result res = codecvt_base::ok;
847
848 res = cvt.out(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
849 assert(res == cvt.partial);
850 assert(in_next == in + t.expected_in_next);
851 assert(out_next == out + t.expected_out_next);
852 assert(char_traits<ExternT>::compare(out, exp, t.expected_out_next) == 0);
853 if (t.expected_out_next < array_size(out))
854 assert(out[t.expected_out_next] == 0);
855 }
856 }
857
858 template <class InternT, class ExternT>
utf16_to_utf8_out_error(const std::codecvt<InternT,ExternT,mbstate_t> & cvt)859 void utf16_to_utf8_out_error(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
860 // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP
861 const char16_t input[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
862 const unsigned char expected[] = "b\u0448\uAAAA\U0010AAAA";
863 static_assert(array_size(input) == 6, "");
864 static_assert(array_size(expected) == 11, "");
865
866 InternT in[array_size(input)];
867 ExternT exp[array_size(expected)];
868 copy(begin(input), end(input), begin(in));
869 copy(begin(expected), end(expected), begin(exp));
870 assert(char_traits<InternT>::length(in) == 5);
871 assert(char_traits<ExternT>::length(exp) == 10);
872
873 // The only possible error in UTF-16 is unpaired surrogate code units.
874 // So we replace valid code points (scalar values) with lone surrogate CU.
875 test_offsets_error<InternT> offsets[] = {
876 {5, 10, 0, 0, 0xD800, 0},
877 {5, 10, 0, 0, 0xDBFF, 0},
878 {5, 10, 0, 0, 0xDC00, 0},
879 {5, 10, 0, 0, 0xDFFF, 0},
880
881 {5, 10, 1, 1, 0xD800, 1},
882 {5, 10, 1, 1, 0xDBFF, 1},
883 {5, 10, 1, 1, 0xDC00, 1},
884 {5, 10, 1, 1, 0xDFFF, 1},
885
886 {5, 10, 2, 3, 0xD800, 2},
887 {5, 10, 2, 3, 0xDBFF, 2},
888 {5, 10, 2, 3, 0xDC00, 2},
889 {5, 10, 2, 3, 0xDFFF, 2},
890
891 // make the leading surrogate a trailing one
892 {5, 10, 3, 6, 0xDC00, 3},
893 {5, 10, 3, 6, 0xDFFF, 3},
894
895 // make the trailing surrogate a leading one
896 {5, 10, 3, 6, 0xD800, 4},
897 {5, 10, 3, 6, 0xDBFF, 4},
898
899 // make the trailing surrogate a BMP char
900 {5, 10, 3, 6, 'z', 4},
901 };
902
903 for (test_offsets_error<InternT>* it = begin(offsets); it != end(offsets); ++it) {
904 test_offsets_error<InternT> t = *it;
905 ExternT out[array_size(exp) - 1] = {};
906 assert(t.in_size <= array_size(in));
907 assert(t.out_size <= array_size(out));
908 assert(t.expected_in_next <= t.in_size);
909 assert(t.expected_out_next <= t.out_size);
910 InternT old_char = in[t.replace_pos];
911 in[t.replace_pos] = t.replace_char;
912
913 mbstate_t state = {};
914 const InternT* in_next = nullptr;
915 ExternT* out_next = nullptr;
916 codecvt_base::result res = codecvt_base::ok;
917
918 res = cvt.out(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
919 assert(res == cvt.error);
920 assert(in_next == in + t.expected_in_next);
921 assert(out_next == out + t.expected_out_next);
922 assert(char_traits<ExternT>::compare(out, exp, t.expected_out_next) == 0);
923 if (t.expected_out_next < array_size(out))
924 assert(out[t.expected_out_next] == 0);
925
926 in[t.replace_pos] = old_char;
927 }
928 }
929
// Drives the three utf16_to_utf8_out_* scenarios (ok, partial, error), in that
// order, against the given facet.
template <class InternT, class ExternT>
void utf16_to_utf8_out(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
  utf16_to_utf8_out_ok(cvt);
  utf16_to_utf8_out_partial(cvt);
  utf16_to_utf8_out_error(cvt);
}
936
// Entry point for a UTF-8 <-> UTF-16 facet: runs the input-direction checks
// first, then the output-direction checks.
template <class InternT, class ExternT>
void test_utf8_utf16_cvt(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
  utf8_to_utf16_in(cvt);
  utf16_to_utf8_out(cvt);
}
942
943 template <class InternT, class ExternT>
utf8_to_ucs2_in_ok(const std::codecvt<InternT,ExternT,mbstate_t> & cvt)944 void utf8_to_ucs2_in_ok(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
945 // UTF-8 string of 1-byte CP, 2-byte CP and 3-byte CP
946 const unsigned char input[] = "b\u0448\uAAAA";
947 const char16_t expected[] = {'b', 0x0448, 0xAAAA, 0};
948 static_assert(array_size(input) == 7, "");
949 static_assert(array_size(expected) == 4, "");
950
951 ExternT in[array_size(input)];
952 InternT exp[array_size(expected)];
953 copy(begin(input), end(input), begin(in));
954 copy(begin(expected), end(expected), begin(exp));
955 assert(char_traits<ExternT>::length(in) == 6);
956 assert(char_traits<InternT>::length(exp) == 3);
957
958 test_offsets_ok offsets[] = {{0, 0}, {1, 1}, {3, 2}, {6, 3}};
959 for (test_offsets_ok* it = begin(offsets); it != end(offsets); ++it) {
960 test_offsets_ok t = *it;
961 InternT out[array_size(exp) - 1] = {};
962 assert(t.in_size <= array_size(in));
963 assert(t.out_size <= array_size(out));
964 mbstate_t state = {};
965 const ExternT* in_next = nullptr;
966 InternT* out_next = nullptr;
967 codecvt_base::result res = codecvt_base::ok;
968
969 res = cvt.in(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
970 assert(res == cvt.ok);
971 assert(in_next == in + t.in_size);
972 assert(out_next == out + t.out_size);
973 assert(char_traits<InternT>::compare(out, exp, t.out_size) == 0);
974 if (t.out_size < array_size(out))
975 assert(out[t.out_size] == 0);
976
977 state = mbstate_t();
978 int len = cvt.length(state, in, in + t.in_size, t.out_size);
979 assert(len >= 0);
980 assert(static_cast<size_t>(len) == t.in_size);
981 }
982
983 for (test_offsets_ok* it = begin(offsets); it != end(offsets); ++it) {
984 test_offsets_ok t = *it;
985 InternT out[array_size(exp)] = {};
986 assert(t.in_size <= array_size(in));
987 assert(t.out_size <= array_size(out));
988 mbstate_t state = {};
989 const ExternT* in_next = nullptr;
990 InternT* out_next = nullptr;
991 codecvt_base::result res = codecvt_base::ok;
992
993 res = cvt.in(state, in, in + t.in_size, in_next, out, end(out), out_next);
994 assert(res == cvt.ok);
995 assert(in_next == in + t.in_size);
996 assert(out_next == out + t.out_size);
997 assert(char_traits<InternT>::compare(out, exp, t.out_size) == 0);
998 if (t.out_size < array_size(out))
999 assert(out[t.out_size] == 0);
1000
1001 state = mbstate_t();
1002 int len = cvt.length(state, in, in + t.in_size, array_size(out));
1003 assert(len >= 0);
1004 assert(static_cast<size_t>(len) == t.in_size);
1005 }
1006 }
1007
1008 template <class InternT, class ExternT>
utf8_to_ucs2_in_partial(const std::codecvt<InternT,ExternT,mbstate_t> & cvt)1009 void utf8_to_ucs2_in_partial(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
1010 // UTF-8 string of 1-byte CP, 2-byte CP and 3-byte CP
1011 const unsigned char input[] = "b\u0448\uAAAA";
1012 const char16_t expected[] = {'b', 0x0448, 0xAAAA, 0};
1013 static_assert(array_size(input) == 7, "");
1014 static_assert(array_size(expected) == 4, "");
1015
1016 ExternT in[array_size(input)];
1017 InternT exp[array_size(expected)];
1018 copy(begin(input), end(input), begin(in));
1019 copy(begin(expected), end(expected), begin(exp));
1020 assert(char_traits<ExternT>::length(in) == 6);
1021 assert(char_traits<InternT>::length(exp) == 3);
1022
1023 test_offsets_partial offsets[] = {
1024 {1, 0, 0, 0}, // no space for first CP
1025
1026 {3, 1, 1, 1}, // no space for second CP
1027 {2, 2, 1, 1}, // incomplete second CP
1028 {2, 1, 1, 1}, // incomplete second CP, and no space for it
1029
1030 {6, 2, 3, 2}, // no space for third CP
1031 {4, 3, 3, 2}, // incomplete third CP
1032 {5, 3, 3, 2}, // incomplete third CP
1033 {4, 2, 3, 2}, // incomplete third CP, and no space for it
1034 {5, 2, 3, 2}, // incomplete third CP, and no space for it
1035 };
1036
1037 for (test_offsets_partial* it = begin(offsets); it != end(offsets); ++it) {
1038 test_offsets_partial t = *it;
1039 InternT out[array_size(exp) - 1] = {};
1040 assert(t.in_size <= array_size(in));
1041 assert(t.out_size <= array_size(out));
1042 assert(t.expected_in_next <= t.in_size);
1043 assert(t.expected_out_next <= t.out_size);
1044 mbstate_t state = {};
1045 const ExternT* in_next = nullptr;
1046 InternT* out_next = nullptr;
1047 codecvt_base::result res = codecvt_base::ok;
1048
1049 res = cvt.in(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
1050 assert(res == cvt.partial);
1051 assert(in_next == in + t.expected_in_next);
1052 assert(out_next == out + t.expected_out_next);
1053 assert(char_traits<InternT>::compare(out, exp, t.expected_out_next) == 0);
1054 if (t.expected_out_next < array_size(out))
1055 assert(out[t.expected_out_next] == 0);
1056
1057 state = mbstate_t();
1058 int len = cvt.length(state, in, in + t.in_size, t.out_size);
1059 assert(len >= 0);
1060 assert(static_cast<size_t>(len) == t.expected_in_next);
1061 }
1062 }
1063
1064 template <class InternT, class ExternT>
utf8_to_ucs2_in_error(const std::codecvt<InternT,ExternT,mbstate_t> & cvt)1065 void utf8_to_ucs2_in_error(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
1066 const unsigned char input[] = "b\u0448\uD700\U0010AAAA";
1067 const char16_t expected[] = {'b', 0x0448, 0xD700, 0xDBEA, 0xDEAA, 0};
1068 static_assert(array_size(input) == 11, "");
1069 static_assert(array_size(expected) == 6, "");
1070
1071 ExternT in[array_size(input)];
1072 InternT exp[array_size(expected)];
1073 copy(begin(input), end(input), begin(in));
1074 copy(begin(expected), end(expected), begin(exp));
1075 assert(char_traits<ExternT>::length(in) == 10);
1076 assert(char_traits<InternT>::length(exp) == 5);
1077
1078 // There are 5 classes of errors in UTF-8 decoding
1079 // 1. Missing leading byte
1080 // 2. Missing trailing byte
1081 // 3. Surrogate CP
1082 // 4. Overlong sequence
1083 // 5. CP out of Unicode range
1084 test_offsets_error<unsigned char> offsets[] = {
1085
1086 // 1. Missing leading byte. We will replace the leading byte with
1087 // non-leading byte, such as a byte that is always invalid or a trailing
1088 // byte.
1089
1090 // replace leading byte with invalid byte
1091 {1, 5, 0, 0, 0xFF, 0},
1092 {3, 5, 1, 1, 0xFF, 1},
1093 {6, 5, 3, 2, 0xFF, 3},
1094 {10, 5, 6, 3, 0xFF, 6},
1095
1096 // replace leading byte with trailing byte
1097 {1, 5, 0, 0, 0b10101010, 0},
1098 {3, 5, 1, 1, 0b10101010, 1},
1099 {6, 5, 3, 2, 0b10101010, 3},
1100 {10, 5, 6, 3, 0b10101010, 6},
1101
1102 // 2. Missing trailing byte. We will replace the trailing byte with
1103 // non-trailing byte, such as a byte that is always invalid or a leading
1104 // byte (simple ASCII byte in our case).
1105
1106 // replace first trailing byte with ASCII byte
1107 {3, 5, 1, 1, 'z', 2},
1108 {6, 5, 3, 2, 'z', 4},
1109 {10, 5, 6, 3, 'z', 7},
1110
1111 // replace first trailing byte with invalid byte
1112 {3, 5, 1, 1, 0xFF, 2},
1113 {6, 5, 3, 2, 0xFF, 4},
1114 {10, 5, 6, 3, 0xFF, 7},
1115
1116 // replace second trailing byte with ASCII byte
1117 {6, 5, 3, 2, 'z', 5},
1118 {10, 5, 6, 3, 'z', 8},
1119
1120 // replace second trailing byte with invalid byte
1121 {6, 5, 3, 2, 0xFF, 5},
1122 {10, 5, 6, 3, 0xFF, 8},
1123
1124 // replace third trailing byte
1125 {10, 5, 6, 3, 'z', 9},
1126 {10, 5, 6, 3, 0xFF, 9},
1127
1128 // 2.1 The following test-cases raise doubt whether error or partial should
1129 // be returned. For example, we have 4-byte sequence with valid leading
1130 // byte. If we hide the last byte we need to return partial. But, if the
1131 // second or third byte, which are visible to the call to codecvt, are
1132 // malformed then error should be returned.
1133
1134 // replace first trailing byte with ASCII byte, also incomplete at end
1135 {5, 5, 3, 2, 'z', 4},
1136 {8, 5, 6, 3, 'z', 7},
1137 {9, 5, 6, 3, 'z', 7},
1138
1139 // replace first trailing byte with invalid byte, also incomplete at end
1140 {5, 5, 3, 2, 0xFF, 4},
1141 {8, 5, 6, 3, 0xFF, 7},
1142 {9, 5, 6, 3, 0xFF, 7},
1143
1144 // replace second trailing byte with ASCII byte, also incomplete at end
1145 {9, 5, 6, 3, 'z', 8},
1146
1147 // replace second trailing byte with invalid byte, also incomplete at end
1148 {9, 5, 6, 3, 0xFF, 8},
1149
1150 // 3. Surrogate CP. We modify the second byte (first trailing) of the 3-byte
1151 // CP U+D700
1152 {6, 5, 3, 2, 0b10100000, 4}, // turn U+D700 into U+D800
1153 {6, 5, 3, 2, 0b10101100, 4}, // turn U+D700 into U+DB00
1154 {6, 5, 3, 2, 0b10110000, 4}, // turn U+D700 into U+DC00
1155 {6, 5, 3, 2, 0b10111100, 4}, // turn U+D700 into U+DF00
1156
1157 // 4. Overlong sequence. The CPs in the input are chosen such as modifying
1158 // just the leading byte is enough to make them overlong, i.e. for the
1159 // 3-byte and 4-byte CP the second byte (first trailing) has enough leading
1160 // zeroes.
1161 {3, 5, 1, 1, 0b11000000, 1}, // make the 2-byte CP overlong
1162 {3, 5, 1, 1, 0b11000001, 1}, // make the 2-byte CP overlong
1163 {6, 5, 3, 2, 0b11100000, 3}, // make the 3-byte CP overlong
1164 {10, 5, 6, 3, 0b11110000, 6}, // make the 4-byte CP overlong
1165
1166 // 5. CP above range
1167 // turn U+10AAAA into U+14AAAA by changing its leading byte
1168 {10, 5, 6, 3, 0b11110101, 6},
1169 // turn U+10AAAA into U+11AAAA by changing its 2nd byte
1170 {10, 5, 6, 3, 0b10011010, 7},
1171 // Don't replace anything, show full 4-byte CP U+10AAAA
1172 {10, 4, 6, 3, 'b', 0},
1173 {10, 5, 6, 3, 'b', 0},
1174 // Don't replace anything, show incomplete 4-byte CP at the end. It's still
1175 // out of UCS2 range just by seeing the first byte.
1176 {7, 4, 6, 3, 'b', 0}, // incomplete fourth CP
1177 {8, 4, 6, 3, 'b', 0}, // incomplete fourth CP
1178 {9, 4, 6, 3, 'b', 0}, // incomplete fourth CP
1179 {7, 5, 6, 3, 'b', 0}, // incomplete fourth CP
1180 {8, 5, 6, 3, 'b', 0}, // incomplete fourth CP
1181 {9, 5, 6, 3, 'b', 0}, // incomplete fourth CP
1182 };
1183 for (test_offsets_error<unsigned char>* it = begin(offsets); it != end(offsets); ++it) {
1184 test_offsets_error<unsigned char> t = *it;
1185 InternT out[array_size(exp) - 1] = {};
1186 assert(t.in_size <= array_size(in));
1187 assert(t.out_size <= array_size(out));
1188 assert(t.expected_in_next <= t.in_size);
1189 assert(t.expected_out_next <= t.out_size);
1190 ExternT old_char = in[t.replace_pos];
1191 in[t.replace_pos] = t.replace_char;
1192
1193 mbstate_t state = {};
1194 const ExternT* in_next = nullptr;
1195 InternT* out_next = nullptr;
1196 codecvt_base::result res = codecvt_base::ok;
1197
1198 res = cvt.in(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
1199 assert(res == cvt.error);
1200 assert(in_next == in + t.expected_in_next);
1201 assert(out_next == out + t.expected_out_next);
1202 assert(char_traits<InternT>::compare(out, exp, t.expected_out_next) == 0);
1203 if (t.expected_out_next < array_size(out))
1204 assert(out[t.expected_out_next] == 0);
1205
1206 state = mbstate_t();
1207 int len = cvt.length(state, in, in + t.in_size, t.out_size);
1208 assert(len >= 0);
1209 assert(static_cast<size_t>(len) == t.expected_in_next);
1210
1211 in[t.replace_pos] = old_char;
1212 }
1213 }
1214
// Drives the three utf8_to_ucs2_in_* scenarios (ok, partial, error), in that
// order, against the given facet.
template <class InternT, class ExternT>
void utf8_to_ucs2_in(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
  utf8_to_ucs2_in_ok(cvt);
  utf8_to_ucs2_in_partial(cvt);
  utf8_to_ucs2_in_error(cvt);
}
1221
1222 template <class InternT, class ExternT>
ucs2_to_utf8_out_ok(const std::codecvt<InternT,ExternT,mbstate_t> & cvt)1223 void ucs2_to_utf8_out_ok(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
1224 // UTF-8 string of 1-byte CP, 2-byte CP and 3-byte CP
1225 const char16_t input[] = {'b', 0x0448, 0xAAAA, 0};
1226 const unsigned char expected[] = "b\u0448\uAAAA";
1227 static_assert(array_size(input) == 4, "");
1228 static_assert(array_size(expected) == 7, "");
1229
1230 InternT in[array_size(input)];
1231 ExternT exp[array_size(expected)];
1232 copy(begin(input), end(input), begin(in));
1233 copy(begin(expected), end(expected), begin(exp));
1234 assert(char_traits<InternT>::length(in) == 3);
1235 assert(char_traits<ExternT>::length(exp) == 6);
1236
1237 test_offsets_ok offsets[] = {{0, 0}, {1, 1}, {2, 3}, {3, 6}};
1238 for (test_offsets_ok* it = begin(offsets); it != end(offsets); ++it) {
1239 test_offsets_ok t = *it;
1240 ExternT out[array_size(exp) - 1] = {};
1241 assert(t.in_size <= array_size(in));
1242 assert(t.out_size <= array_size(out));
1243 mbstate_t state = {};
1244 const InternT* in_next = nullptr;
1245 ExternT* out_next = nullptr;
1246 codecvt_base::result res = codecvt_base::ok;
1247
1248 res = cvt.out(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
1249 assert(res == cvt.ok);
1250 assert(in_next == in + t.in_size);
1251 assert(out_next == out + t.out_size);
1252 assert(char_traits<ExternT>::compare(out, exp, t.out_size) == 0);
1253 if (t.out_size < array_size(out))
1254 assert(out[t.out_size] == 0);
1255 }
1256 }
1257
1258 template <class InternT, class ExternT>
ucs2_to_utf8_out_partial(const std::codecvt<InternT,ExternT,mbstate_t> & cvt)1259 void ucs2_to_utf8_out_partial(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
1260 // UTF-8 string of 1-byte CP, 2-byte CP and 3-byte CP
1261 const char16_t input[] = {'b', 0x0448, 0xAAAA, 0};
1262 const unsigned char expected[] = "b\u0448\uAAAA";
1263 static_assert(array_size(input) == 4, "");
1264 static_assert(array_size(expected) == 7, "");
1265
1266 InternT in[array_size(input)];
1267 ExternT exp[array_size(expected)];
1268 copy(begin(input), end(input), begin(in));
1269 copy(begin(expected), end(expected), begin(exp));
1270 assert(char_traits<InternT>::length(in) == 3);
1271 assert(char_traits<ExternT>::length(exp) == 6);
1272
1273 test_offsets_partial offsets[] = {
1274 {1, 0, 0, 0}, // no space for first CP
1275
1276 {2, 1, 1, 1}, // no space for second CP
1277 {2, 2, 1, 1}, // no space for second CP
1278
1279 {3, 3, 2, 3}, // no space for third CP
1280 {3, 4, 2, 3}, // no space for third CP
1281 {3, 5, 2, 3}, // no space for third CP
1282 };
1283 for (test_offsets_partial* it = begin(offsets); it != end(offsets); ++it) {
1284 test_offsets_partial t = *it;
1285 ExternT out[array_size(exp) - 1] = {};
1286 assert(t.in_size <= array_size(in));
1287 assert(t.out_size <= array_size(out));
1288 assert(t.expected_in_next <= t.in_size);
1289 assert(t.expected_out_next <= t.out_size);
1290 mbstate_t state = {};
1291 const InternT* in_next = nullptr;
1292 ExternT* out_next = nullptr;
1293 codecvt_base::result res = codecvt_base::ok;
1294
1295 res = cvt.out(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
1296 assert(res == cvt.partial);
1297 assert(in_next == in + t.expected_in_next);
1298 assert(out_next == out + t.expected_out_next);
1299 assert(char_traits<ExternT>::compare(out, exp, t.expected_out_next) == 0);
1300 if (t.expected_out_next < array_size(out))
1301 assert(out[t.expected_out_next] == 0);
1302 }
1303 }
1304
1305 template <class InternT, class ExternT>
ucs2_to_utf8_out_error(const std::codecvt<InternT,ExternT,mbstate_t> & cvt)1306 void ucs2_to_utf8_out_error(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
1307 const char16_t input[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
1308 const unsigned char expected[] = "b\u0448\uAAAA\U0010AAAA";
1309 static_assert(array_size(input) == 6, "");
1310 static_assert(array_size(expected) == 11, "");
1311
1312 InternT in[array_size(input)];
1313 ExternT exp[array_size(expected)];
1314 copy(begin(input), end(input), begin(in));
1315 copy(begin(expected), end(expected), begin(exp));
1316 assert(char_traits<InternT>::length(in) == 5);
1317 assert(char_traits<ExternT>::length(exp) == 10);
1318
1319 test_offsets_error<InternT> offsets[] = {
1320 {3, 6, 0, 0, 0xD800, 0},
1321 {3, 6, 0, 0, 0xDBFF, 0},
1322 {3, 6, 0, 0, 0xDC00, 0},
1323 {3, 6, 0, 0, 0xDFFF, 0},
1324
1325 {3, 6, 1, 1, 0xD800, 1},
1326 {3, 6, 1, 1, 0xDBFF, 1},
1327 {3, 6, 1, 1, 0xDC00, 1},
1328 {3, 6, 1, 1, 0xDFFF, 1},
1329
1330 {3, 6, 2, 3, 0xD800, 2},
1331 {3, 6, 2, 3, 0xDBFF, 2},
1332 {3, 6, 2, 3, 0xDC00, 2},
1333 {3, 6, 2, 3, 0xDFFF, 2},
1334
1335 // make the leading surrogate a trailing one
1336 {5, 10, 3, 6, 0xDC00, 3},
1337 {5, 10, 3, 6, 0xDFFF, 3},
1338
1339 // make the trailing surrogate a leading one
1340 {5, 10, 3, 6, 0xD800, 4},
1341 {5, 10, 3, 6, 0xDBFF, 4},
1342
1343 // make the trailing surrogate a BMP char
1344 {5, 10, 3, 6, 'z', 4},
1345
1346 // don't replace anything in the test cases bellow, just show the surrogate
1347 // pair (fourth CP) fully or partially
1348 {5, 10, 3, 6, 'b', 0},
1349 {5, 7, 3, 6, 'b', 0}, // no space for fourth CP
1350 {5, 8, 3, 6, 'b', 0}, // no space for fourth CP
1351 {5, 9, 3, 6, 'b', 0}, // no space for fourth CP
1352
1353 {4, 10, 3, 6, 'b', 0}, // incomplete fourth CP
1354 {4, 7, 3, 6, 'b', 0}, // incomplete fourth CP, and no space for it
1355 {4, 8, 3, 6, 'b', 0}, // incomplete fourth CP, and no space for it
1356 {4, 9, 3, 6, 'b', 0}, // incomplete fourth CP, and no space for it
1357 };
1358
1359 for (test_offsets_error<InternT>* it = begin(offsets); it != end(offsets); ++it) {
1360 test_offsets_error<InternT> t = *it;
1361 ExternT out[array_size(exp) - 1] = {};
1362 assert(t.in_size <= array_size(in));
1363 assert(t.out_size <= array_size(out));
1364 assert(t.expected_in_next <= t.in_size);
1365 assert(t.expected_out_next <= t.out_size);
1366 InternT old_char = in[t.replace_pos];
1367 in[t.replace_pos] = t.replace_char;
1368
1369 mbstate_t state = {};
1370 const InternT* in_next = nullptr;
1371 ExternT* out_next = nullptr;
1372 codecvt_base::result res = codecvt_base::ok;
1373
1374 res = cvt.out(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
1375 assert(res == cvt.error);
1376 assert(in_next == in + t.expected_in_next);
1377 assert(out_next == out + t.expected_out_next);
1378 assert(char_traits<ExternT>::compare(out, exp, t.expected_out_next) == 0);
1379 if (t.expected_out_next < array_size(out))
1380 assert(out[t.expected_out_next] == 0);
1381
1382 in[t.replace_pos] = old_char;
1383 }
1384 }
1385
// Drives the three ucs2_to_utf8_out_* scenarios (ok, partial, error), in that
// order, against the given facet.
template <class InternT, class ExternT>
void ucs2_to_utf8_out(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
  ucs2_to_utf8_out_ok(cvt);
  ucs2_to_utf8_out_partial(cvt);
  ucs2_to_utf8_out_error(cvt);
}
1392
// Entry point for a UTF-8 <-> UCS-2 facet: runs the input-direction checks
// first, then the output-direction checks.
template <class InternT, class ExternT>
void test_utf8_ucs2_cvt(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
  utf8_to_ucs2_in(cvt);
  ucs2_to_utf8_out(cvt);
}
1398
// Byte order used when serializing UTF-16 code units into a byte sequence.
enum utf16_endianess {
  utf16_big_endian,   // high byte first
  utf16_little_endian // low byte first
};

// Writes every code unit in [f, l) to `o` as two bytes (bits 8-15 and bits
// 0-7 of the unit) in the order selected by `e`, and returns the output
// iterator advanced past the last byte written.
template <class Iter1, class Iter2>
Iter2 utf16_to_bytes(Iter1 f, Iter1 l, Iter2 o, utf16_endianess e) {
  const bool high_byte_first = (e == utf16_big_endian);
  while (f != l) {
    if (high_byte_first) {
      *o++ = (*f >> 8) & 0xFF;
      *o++ = *f & 0xFF;
    } else {
      *o++ = *f & 0xFF;
      *o++ = (*f >> 8) & 0xFF;
    }
    ++f;
  }
  return o;
}
1415
1416 template <class InternT>
utf16_to_utf32_in_ok(const std::codecvt<InternT,char,mbstate_t> & cvt,utf16_endianess endianess)1417 void utf16_to_utf32_in_ok(const std::codecvt<InternT, char, mbstate_t>& cvt, utf16_endianess endianess) {
1418 const char16_t input[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
1419 const char32_t expected[] = {'b', 0x0448, 0xAAAA, 0x10AAAA, 0};
1420 static_assert(array_size(input) == 6, "");
1421 static_assert(array_size(expected) == 5, "");
1422
1423 char in[array_size(input) * 2];
1424 InternT exp[array_size(expected)];
1425 utf16_to_bytes(begin(input), end(input), begin(in), endianess);
1426 copy(begin(expected), end(expected), begin(exp));
1427
1428 test_offsets_ok offsets[] = {{0, 0}, {2, 1}, {4, 2}, {6, 3}, {10, 4}};
1429 for (test_offsets_ok* it = begin(offsets); it != end(offsets); ++it) {
1430 test_offsets_ok t = *it;
1431 InternT out[array_size(exp) - 1] = {};
1432 assert(t.in_size <= array_size(in));
1433 assert(t.out_size <= array_size(out));
1434 mbstate_t state = {};
1435 const char* in_next = nullptr;
1436 InternT* out_next = nullptr;
1437 codecvt_base::result res = codecvt_base::ok;
1438
1439 res = cvt.in(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
1440 assert(res == cvt.ok);
1441 assert(in_next == in + t.in_size);
1442 assert(out_next == out + t.out_size);
1443 assert(char_traits<InternT>::compare(out, exp, t.out_size) == 0);
1444 if (t.out_size < array_size(out))
1445 assert(out[t.out_size] == 0);
1446
1447 state = mbstate_t();
1448 int len = cvt.length(state, in, in + t.in_size, t.out_size);
1449 assert(len >= 0);
1450 assert(static_cast<size_t>(len) == t.in_size);
1451 }
1452
1453 for (test_offsets_ok* it = begin(offsets); it != end(offsets); ++it) {
1454 test_offsets_ok t = *it;
1455 InternT out[array_size(exp)] = {};
1456 assert(t.in_size <= array_size(in));
1457 assert(t.out_size <= array_size(out));
1458 mbstate_t state = {};
1459 const char* in_next = nullptr;
1460 InternT* out_next = nullptr;
1461 codecvt_base::result res = codecvt_base::ok;
1462
1463 res = cvt.in(state, in, in + t.in_size, in_next, out, end(out), out_next);
1464 assert(res == cvt.ok);
1465 assert(in_next == in + t.in_size);
1466 assert(out_next == out + t.out_size);
1467 assert(char_traits<InternT>::compare(out, exp, t.out_size) == 0);
1468 if (t.out_size < array_size(out))
1469 assert(out[t.out_size] == 0);
1470
1471 state = mbstate_t();
1472 int len = cvt.length(state, in, in + t.in_size, array_size(out));
1473 assert(len >= 0);
1474 assert(static_cast<size_t>(len) == t.in_size);
1475 }
1476 }
1477
1478 template <class InternT>
utf16_to_utf32_in_partial(const std::codecvt<InternT,char,mbstate_t> & cvt,utf16_endianess endianess)1479 void utf16_to_utf32_in_partial(const std::codecvt<InternT, char, mbstate_t>& cvt, utf16_endianess endianess) {
1480 const char16_t input[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
1481 const char32_t expected[] = {'b', 0x0448, 0xAAAA, 0x10AAAA, 0};
1482 static_assert(array_size(input) == 6, "");
1483 static_assert(array_size(expected) == 5, "");
1484
1485 char in[array_size(input) * 2];
1486 InternT exp[array_size(expected)];
1487 utf16_to_bytes(begin(input), end(input), begin(in), endianess);
1488 copy(begin(expected), end(expected), begin(exp));
1489
1490 test_offsets_partial offsets[] = {
1491 {2, 0, 0, 0}, // no space for first CP
1492 {1, 1, 0, 0}, // incomplete first CP
1493 {1, 0, 0, 0}, // incomplete first CP, and no space for it
1494
1495 {4, 1, 2, 1}, // no space for second CP
1496 {3, 2, 2, 1}, // incomplete second CP
1497 {3, 1, 2, 1}, // incomplete second CP, and no space for it
1498
1499 {6, 2, 4, 2}, // no space for third CP
1500 {5, 3, 4, 2}, // incomplete third CP
1501 {5, 2, 4, 2}, // incomplete third CP, and no space for it
1502
1503 {10, 3, 6, 3}, // no space for fourth CP
1504 {7, 4, 6, 3}, // incomplete fourth CP
1505 {8, 4, 6, 3}, // incomplete fourth CP
1506 {9, 4, 6, 3}, // incomplete fourth CP
1507 {7, 3, 6, 3}, // incomplete fourth CP, and no space for it
1508 {8, 3, 6, 3}, // incomplete fourth CP, and no space for it
1509 {9, 3, 6, 3}, // incomplete fourth CP, and no space for it
1510 };
1511
1512 for (test_offsets_partial* it = begin(offsets); it != end(offsets); ++it) {
1513 test_offsets_partial t = *it;
1514 InternT out[array_size(exp) - 1] = {};
1515 assert(t.in_size <= array_size(in));
1516 assert(t.out_size <= array_size(out));
1517 assert(t.expected_in_next <= t.in_size);
1518 assert(t.expected_out_next <= t.out_size);
1519 mbstate_t state = {};
1520 const char* in_next = nullptr;
1521 InternT* out_next = nullptr;
1522 codecvt_base::result res = codecvt_base::ok;
1523
1524 res = cvt.in(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
1525 assert(res == cvt.partial);
1526 assert(in_next == in + t.expected_in_next);
1527 assert(out_next == out + t.expected_out_next);
1528 assert(char_traits<InternT>::compare(out, exp, t.expected_out_next) == 0);
1529 if (t.expected_out_next < array_size(out))
1530 assert(out[t.expected_out_next] == 0);
1531
1532 state = mbstate_t();
1533 int len = cvt.length(state, in, in + t.in_size, t.out_size);
1534 assert(len >= 0);
1535 assert(static_cast<size_t>(len) == t.expected_in_next);
1536 }
1537 }
1538
1539 template <class InternT>
utf16_to_utf32_in_error(const std::codecvt<InternT,char,mbstate_t> & cvt,utf16_endianess endianess)1540 void utf16_to_utf32_in_error(const std::codecvt<InternT, char, mbstate_t>& cvt, utf16_endianess endianess) {
1541 char16_t input[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
1542 const char32_t expected[] = {'b', 0x0448, 0xAAAA, 0x10AAAA, 0};
1543 static_assert(array_size(input) == 6, "");
1544 static_assert(array_size(expected) == 5, "");
1545
1546 InternT exp[array_size(expected)];
1547 copy(begin(expected), end(expected), begin(exp));
1548
1549 // The only possible error in UTF-16 is unpaired surrogate code units.
1550 // So we replace valid code points (scalar values) with lone surrogate CU.
1551 test_offsets_error<char16_t> offsets[] = {
1552 {10, 4, 0, 0, 0xD800, 0},
1553 {10, 4, 0, 0, 0xDBFF, 0},
1554 {10, 4, 0, 0, 0xDC00, 0},
1555 {10, 4, 0, 0, 0xDFFF, 0},
1556
1557 {10, 4, 2, 1, 0xD800, 1},
1558 {10, 4, 2, 1, 0xDBFF, 1},
1559 {10, 4, 2, 1, 0xDC00, 1},
1560 {10, 4, 2, 1, 0xDFFF, 1},
1561
1562 {10, 4, 4, 2, 0xD800, 2},
1563 {10, 4, 4, 2, 0xDBFF, 2},
1564 {10, 4, 4, 2, 0xDC00, 2},
1565 {10, 4, 4, 2, 0xDFFF, 2},
1566
1567 // make the leading surrogate a trailing one
1568 {10, 4, 6, 3, 0xDC00, 3},
1569 {10, 4, 6, 3, 0xDFFF, 3},
1570
1571 // make the trailing surrogate a leading one
1572 {10, 4, 6, 3, 0xD800, 4},
1573 {10, 4, 6, 3, 0xDBFF, 4},
1574
1575 // make the trailing surrogate a BMP char
1576 {10, 4, 6, 3, 'z', 4},
1577 };
1578
1579 for (test_offsets_error<char16_t>* it = begin(offsets); it != end(offsets); ++it) {
1580 test_offsets_error<char16_t> t = *it;
1581 char in[array_size(input) * 2];
1582 InternT out[array_size(exp) - 1] = {};
1583 assert(t.in_size <= array_size(in));
1584 assert(t.out_size <= array_size(out));
1585 assert(t.expected_in_next <= t.in_size);
1586 assert(t.expected_out_next <= t.out_size);
1587 char16_t old_char = input[t.replace_pos];
1588 input[t.replace_pos] = t.replace_char; // replace in input, not in in
1589 utf16_to_bytes(begin(input), end(input), begin(in), endianess);
1590
1591 mbstate_t state = {};
1592 const char* in_next = nullptr;
1593 InternT* out_next = nullptr;
1594 codecvt_base::result res = codecvt_base::ok;
1595
1596 res = cvt.in(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
1597 assert(res == cvt.error);
1598 assert(in_next == in + t.expected_in_next);
1599 assert(out_next == out + t.expected_out_next);
1600 assert(char_traits<InternT>::compare(out, exp, t.expected_out_next) == 0);
1601 if (t.expected_out_next < array_size(out))
1602 assert(out[t.expected_out_next] == 0);
1603
1604 state = mbstate_t();
1605 int len = cvt.length(state, in, in + t.in_size, t.out_size);
1606 assert(len >= 0);
1607 assert(static_cast<size_t>(len) == t.expected_in_next);
1608
1609 input[t.replace_pos] = old_char;
1610 }
1611 }
1612
1613 template <class InternT>
utf32_to_utf16_out_ok(const std::codecvt<InternT,char,mbstate_t> & cvt,utf16_endianess endianess)1614 void utf32_to_utf16_out_ok(const std::codecvt<InternT, char, mbstate_t>& cvt, utf16_endianess endianess) {
1615 const char32_t input[] = {'b', 0x0448, 0xAAAA, 0x10AAAA, 0};
1616 const char16_t expected[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
1617 static_assert(array_size(input) == 5, "");
1618 static_assert(array_size(expected) == 6, "");
1619
1620 InternT in[array_size(input)];
1621 char exp[array_size(expected) * 2];
1622 copy(begin(input), end(input), begin(in));
1623 utf16_to_bytes(begin(expected), end(expected), begin(exp), endianess);
1624
1625 test_offsets_ok offsets[] = {{0, 0}, {1, 2}, {2, 4}, {3, 6}, {4, 10}};
1626 for (test_offsets_ok* it = begin(offsets); it != end(offsets); ++it) {
1627 test_offsets_ok t = *it;
1628 char out[array_size(exp) - 2] = {};
1629 assert(t.in_size <= array_size(in));
1630 assert(t.out_size <= array_size(out));
1631 mbstate_t state = {};
1632 const InternT* in_next = nullptr;
1633 char* out_next = nullptr;
1634 codecvt_base::result res = codecvt_base::ok;
1635
1636 res = cvt.out(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
1637 assert(res == cvt.ok);
1638 assert(in_next == in + t.in_size);
1639 assert(out_next == out + t.out_size);
1640 assert(char_traits<char>::compare(out, exp, t.out_size) == 0);
1641 if (t.out_size < array_size(out))
1642 assert(out[t.out_size] == 0);
1643 }
1644 }
1645
1646 template <class InternT>
utf32_to_utf16_out_partial(const std::codecvt<InternT,char,mbstate_t> & cvt,utf16_endianess endianess)1647 void utf32_to_utf16_out_partial(const std::codecvt<InternT, char, mbstate_t>& cvt, utf16_endianess endianess) {
1648 const char32_t input[] = {'b', 0x0448, 0xAAAA, 0x10AAAA, 0};
1649 const char16_t expected[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
1650 static_assert(array_size(input) == 5, "");
1651 static_assert(array_size(expected) == 6, "");
1652
1653 InternT in[array_size(input)];
1654 char exp[array_size(expected) * 2];
1655 copy(begin(input), end(input), begin(in));
1656 utf16_to_bytes(begin(expected), end(expected), begin(exp), endianess);
1657
1658 test_offsets_partial offsets[] = {
1659 {1, 0, 0, 0}, // no space for first CP
1660 {1, 1, 0, 0}, // no space for first CP
1661
1662 {2, 2, 1, 2}, // no space for second CP
1663 {2, 3, 1, 2}, // no space for second CP
1664
1665 {3, 4, 2, 4}, // no space for third CP
1666 {3, 5, 2, 4}, // no space for third CP
1667
1668 {4, 6, 3, 6}, // no space for fourth CP
1669 {4, 7, 3, 6}, // no space for fourth CP
1670 {4, 8, 3, 6}, // no space for fourth CP
1671 {4, 9, 3, 6}, // no space for fourth CP
1672 };
1673 for (test_offsets_partial* it = begin(offsets); it != end(offsets); ++it) {
1674 test_offsets_partial t = *it;
1675 char out[array_size(exp) - 2] = {};
1676 assert(t.in_size <= array_size(in));
1677 assert(t.out_size <= array_size(out));
1678 assert(t.expected_in_next <= t.in_size);
1679 assert(t.expected_out_next <= t.out_size);
1680 mbstate_t state = {};
1681 const InternT* in_next = nullptr;
1682 char* out_next = nullptr;
1683 codecvt_base::result res = codecvt_base::ok;
1684
1685 res = cvt.out(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
1686 assert(res == cvt.partial);
1687 assert(in_next == in + t.expected_in_next);
1688 assert(out_next == out + t.expected_out_next);
1689 assert(char_traits<char>::compare(out, exp, t.expected_out_next) == 0);
1690 if (t.expected_out_next < array_size(out))
1691 assert(out[t.expected_out_next] == 0);
1692 }
1693 }
1694
1695 template <class InternT>
utf32_to_utf16_out_error(const std::codecvt<InternT,char,mbstate_t> & cvt,utf16_endianess endianess)1696 void utf32_to_utf16_out_error(const std::codecvt<InternT, char, mbstate_t>& cvt, utf16_endianess endianess) {
1697 const char32_t input[] = {'b', 0x0448, 0xAAAA, 0x10AAAA, 0};
1698 const char16_t expected[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
1699 static_assert(array_size(input) == 5, "");
1700 static_assert(array_size(expected) == 6, "");
1701
1702 InternT in[array_size(input)];
1703 char exp[array_size(expected) * 2];
1704 copy(begin(input), end(input), begin(in));
1705 utf16_to_bytes(begin(expected), end(expected), begin(exp), endianess);
1706
1707 test_offsets_error<InternT> offsets[] = {
1708
1709 // Surrogate CP
1710 {4, 10, 0, 0, 0xD800, 0},
1711 {4, 10, 1, 2, 0xDBFF, 1},
1712 {4, 10, 2, 4, 0xDC00, 2},
1713 {4, 10, 3, 6, 0xDFFF, 3},
1714
1715 // CP out of range
1716 {4, 10, 0, 0, 0x00110000, 0},
1717 {4, 10, 1, 2, 0x00110000, 1},
1718 {4, 10, 2, 4, 0x00110000, 2},
1719 {4, 10, 3, 6, 0x00110000, 3}};
1720
1721 for (test_offsets_error<InternT>* it = begin(offsets); it != end(offsets); ++it) {
1722 test_offsets_error<InternT> t = *it;
1723 char out[array_size(exp) - 2] = {};
1724 assert(t.in_size <= array_size(in));
1725 assert(t.out_size <= array_size(out));
1726 assert(t.expected_in_next <= t.in_size);
1727 assert(t.expected_out_next <= t.out_size);
1728 InternT old_char = in[t.replace_pos];
1729 in[t.replace_pos] = t.replace_char;
1730
1731 mbstate_t state = {};
1732 const InternT* in_next = nullptr;
1733 char* out_next = nullptr;
1734 codecvt_base::result res = codecvt_base::ok;
1735
1736 res = cvt.out(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
1737 assert(res == cvt.error);
1738 assert(in_next == in + t.expected_in_next);
1739 assert(out_next == out + t.expected_out_next);
1740 assert(char_traits<char>::compare(out, exp, t.expected_out_next) == 0);
1741 if (t.expected_out_next < array_size(out))
1742 assert(out[t.expected_out_next] == 0);
1743
1744 in[t.replace_pos] = old_char;
1745 }
1746 }
1747
1748 template <class InternT>
test_utf16_utf32_cvt(const std::codecvt<InternT,char,mbstate_t> & cvt,utf16_endianess endianess)1749 void test_utf16_utf32_cvt(const std::codecvt<InternT, char, mbstate_t>& cvt, utf16_endianess endianess) {
1750 utf16_to_utf32_in_ok(cvt, endianess);
1751 utf16_to_utf32_in_partial(cvt, endianess);
1752 utf16_to_utf32_in_error(cvt, endianess);
1753 utf32_to_utf16_out_ok(cvt, endianess);
1754 utf32_to_utf16_out_partial(cvt, endianess);
1755 utf32_to_utf16_out_error(cvt, endianess);
1756 }
1757
1758 template <class InternT>
utf16_to_ucs2_in_ok(const std::codecvt<InternT,char,mbstate_t> & cvt,utf16_endianess endianess)1759 void utf16_to_ucs2_in_ok(const std::codecvt<InternT, char, mbstate_t>& cvt, utf16_endianess endianess) {
1760 const char16_t input[] = {'b', 0x0448, 0xAAAA, 0};
1761 const char16_t expected[] = {'b', 0x0448, 0xAAAA, 0};
1762 static_assert(array_size(input) == 4, "");
1763 static_assert(array_size(expected) == 4, "");
1764
1765 char in[array_size(input) * 2];
1766 InternT exp[array_size(expected)];
1767 utf16_to_bytes(begin(input), end(input), begin(in), endianess);
1768 copy(begin(expected), end(expected), begin(exp));
1769
1770 test_offsets_ok offsets[] = {{0, 0}, {2, 1}, {4, 2}, {6, 3}};
1771 for (test_offsets_ok* it = begin(offsets); it != end(offsets); ++it) {
1772 test_offsets_ok t = *it;
1773 InternT out[array_size(exp) - 1] = {};
1774 assert(t.in_size <= array_size(in));
1775 assert(t.out_size <= array_size(out));
1776 mbstate_t state = {};
1777 const char* in_next = nullptr;
1778 InternT* out_next = nullptr;
1779 codecvt_base::result res = codecvt_base::ok;
1780
1781 res = cvt.in(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
1782 assert(res == cvt.ok);
1783 assert(in_next == in + t.in_size);
1784 assert(out_next == out + t.out_size);
1785 assert(char_traits<InternT>::compare(out, exp, t.out_size) == 0);
1786 if (t.out_size < array_size(out))
1787 assert(out[t.out_size] == 0);
1788
1789 state = mbstate_t();
1790 int len = cvt.length(state, in, in + t.in_size, t.out_size);
1791 assert(len >= 0);
1792 assert(static_cast<size_t>(len) == t.in_size);
1793 }
1794
1795 for (test_offsets_ok* it = begin(offsets); it != end(offsets); ++it) {
1796 test_offsets_ok t = *it;
1797 InternT out[array_size(exp)] = {};
1798 assert(t.in_size <= array_size(in));
1799 assert(t.out_size <= array_size(out));
1800 mbstate_t state = {};
1801 const char* in_next = nullptr;
1802 InternT* out_next = nullptr;
1803 codecvt_base::result res = codecvt_base::ok;
1804
1805 res = cvt.in(state, in, in + t.in_size, in_next, out, end(out), out_next);
1806 assert(res == cvt.ok);
1807 assert(in_next == in + t.in_size);
1808 assert(out_next == out + t.out_size);
1809 assert(char_traits<InternT>::compare(out, exp, t.out_size) == 0);
1810 if (t.out_size < array_size(out))
1811 assert(out[t.out_size] == 0);
1812
1813 state = mbstate_t();
1814 int len = cvt.length(state, in, in + t.in_size, array_size(out));
1815 assert(len >= 0);
1816 assert(static_cast<size_t>(len) == t.in_size);
1817 }
1818 }
1819
1820 template <class InternT>
utf16_to_ucs2_in_partial(const std::codecvt<InternT,char,mbstate_t> & cvt,utf16_endianess endianess)1821 void utf16_to_ucs2_in_partial(const std::codecvt<InternT, char, mbstate_t>& cvt, utf16_endianess endianess) {
1822 const char16_t input[] = {'b', 0x0448, 0xAAAA, 0};
1823 const char16_t expected[] = {'b', 0x0448, 0xAAAA, 0};
1824 static_assert(array_size(input) == 4, "");
1825 static_assert(array_size(expected) == 4, "");
1826
1827 char in[array_size(input) * 2];
1828 InternT exp[array_size(expected)];
1829 utf16_to_bytes(begin(input), end(input), begin(in), endianess);
1830 copy(begin(expected), end(expected), begin(exp));
1831
1832 test_offsets_partial offsets[] = {
1833 {2, 0, 0, 0}, // no space for first CP
1834 {1, 1, 0, 0}, // incomplete first CP
1835 {1, 0, 0, 0}, // incomplete first CP, and no space for it
1836
1837 {4, 1, 2, 1}, // no space for second CP
1838 {3, 2, 2, 1}, // incomplete second CP
1839 {3, 1, 2, 1}, // incomplete second CP, and no space for it
1840
1841 {6, 2, 4, 2}, // no space for third CP
1842 {5, 3, 4, 2}, // incomplete third CP
1843 {5, 2, 4, 2}, // incomplete third CP, and no space for it
1844 };
1845
1846 for (test_offsets_partial* it = begin(offsets); it != end(offsets); ++it) {
1847 test_offsets_partial t = *it;
1848 InternT out[array_size(exp) - 1] = {};
1849 assert(t.in_size <= array_size(in));
1850 assert(t.out_size <= array_size(out));
1851 assert(t.expected_in_next <= t.in_size);
1852 assert(t.expected_out_next <= t.out_size);
1853 mbstate_t state = {};
1854 const char* in_next = nullptr;
1855 InternT* out_next = nullptr;
1856 codecvt_base::result res = codecvt_base::ok;
1857
1858 res = cvt.in(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
1859 assert(res == cvt.partial);
1860 assert(in_next == in + t.expected_in_next);
1861 assert(out_next == out + t.expected_out_next);
1862 assert(char_traits<InternT>::compare(out, exp, t.expected_out_next) == 0);
1863 if (t.expected_out_next < array_size(out))
1864 assert(out[t.expected_out_next] == 0);
1865
1866 state = mbstate_t();
1867 int len = cvt.length(state, in, in + t.in_size, t.out_size);
1868 assert(len >= 0);
1869 assert(static_cast<size_t>(len) == t.expected_in_next);
1870 }
1871 }
1872
1873 template <class InternT>
utf16_to_ucs2_in_error(const std::codecvt<InternT,char,mbstate_t> & cvt,utf16_endianess endianess)1874 void utf16_to_ucs2_in_error(const std::codecvt<InternT, char, mbstate_t>& cvt, utf16_endianess endianess) {
1875 char16_t input[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
1876 const char16_t expected[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
1877 static_assert(array_size(input) == 6, "");
1878 static_assert(array_size(expected) == 6, "");
1879
1880 InternT exp[array_size(expected)];
1881 copy(begin(expected), end(expected), begin(exp));
1882
1883 // The only possible error in UTF-16 is unpaired surrogate code units.
1884 // Additionally, because the target encoding is UCS-2, a proper pair of
1885 // surrogates is also error. Simply, any surrogate CU is error.
1886 test_offsets_error<char16_t> offsets[] = {
1887 {6, 3, 0, 0, 0xD800, 0},
1888 {6, 3, 0, 0, 0xDBFF, 0},
1889 {6, 3, 0, 0, 0xDC00, 0},
1890 {6, 3, 0, 0, 0xDFFF, 0},
1891
1892 {6, 3, 2, 1, 0xD800, 1},
1893 {6, 3, 2, 1, 0xDBFF, 1},
1894 {6, 3, 2, 1, 0xDC00, 1},
1895 {6, 3, 2, 1, 0xDFFF, 1},
1896
1897 {6, 3, 4, 2, 0xD800, 2},
1898 {6, 3, 4, 2, 0xDBFF, 2},
1899 {6, 3, 4, 2, 0xDC00, 2},
1900 {6, 3, 4, 2, 0xDFFF, 2},
1901
1902 // make the leading surrogate a trailing one
1903 {10, 5, 6, 3, 0xDC00, 3},
1904 {10, 5, 6, 3, 0xDFFF, 3},
1905
1906 // make the trailing surrogate a leading one
1907 {10, 5, 6, 3, 0xD800, 4},
1908 {10, 5, 6, 3, 0xDBFF, 4},
1909
1910 // make the trailing surrogate a BMP char
1911 {10, 5, 6, 3, 'z', 4},
1912
1913 // don't replace anything in the test cases bellow, just show the surrogate
1914 // pair (fourth CP) fully or partially (just the first surrogate)
1915 {10, 5, 6, 3, 'b', 0},
1916 {8, 5, 6, 3, 'b', 0},
1917 {9, 5, 6, 3, 'b', 0},
1918
1919 {10, 4, 6, 3, 'b', 0},
1920 {8, 4, 6, 3, 'b', 0},
1921 {9, 4, 6, 3, 'b', 0},
1922 };
1923
1924 for (test_offsets_error<char16_t>* it = begin(offsets); it != end(offsets); ++it) {
1925 test_offsets_error<char16_t> t = *it;
1926 char in[array_size(input) * 2];
1927 InternT out[array_size(exp) - 1] = {};
1928 assert(t.in_size <= array_size(in));
1929 assert(t.out_size <= array_size(out));
1930 assert(t.expected_in_next <= t.in_size);
1931 assert(t.expected_out_next <= t.out_size);
1932 char16_t old_char = input[t.replace_pos];
1933 input[t.replace_pos] = t.replace_char; // replace in input, not in in
1934 utf16_to_bytes(begin(input), end(input), begin(in), endianess);
1935
1936 mbstate_t state = {};
1937 const char* in_next = nullptr;
1938 InternT* out_next = nullptr;
1939 codecvt_base::result res = codecvt_base::ok;
1940
1941 res = cvt.in(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
1942 assert(res == cvt.error);
1943 assert(in_next == in + t.expected_in_next);
1944 assert(out_next == out + t.expected_out_next);
1945 assert(char_traits<InternT>::compare(out, exp, t.expected_out_next) == 0);
1946 if (t.expected_out_next < array_size(out))
1947 assert(out[t.expected_out_next] == 0);
1948
1949 state = mbstate_t();
1950 int len = cvt.length(state, in, in + t.in_size, t.out_size);
1951 assert(len >= 0);
1952 assert(static_cast<size_t>(len) == t.expected_in_next);
1953
1954 input[t.replace_pos] = old_char;
1955 }
1956 }
1957
1958 template <class InternT>
ucs2_to_utf16_out_ok(const std::codecvt<InternT,char,mbstate_t> & cvt,utf16_endianess endianess)1959 void ucs2_to_utf16_out_ok(const std::codecvt<InternT, char, mbstate_t>& cvt, utf16_endianess endianess) {
1960 const char16_t input[] = {'b', 0x0448, 0xAAAA, 0};
1961 const char16_t expected[] = {'b', 0x0448, 0xAAAA, 0};
1962 static_assert(array_size(input) == 4, "");
1963 static_assert(array_size(expected) == 4, "");
1964
1965 InternT in[array_size(input)];
1966 char exp[array_size(expected) * 2];
1967 copy(begin(input), end(input), begin(in));
1968 utf16_to_bytes(begin(expected), end(expected), begin(exp), endianess);
1969
1970 test_offsets_ok offsets[] = {{0, 0}, {1, 2}, {2, 4}, {3, 6}};
1971 for (test_offsets_ok* it = begin(offsets); it != end(offsets); ++it) {
1972 test_offsets_ok t = *it;
1973 char out[array_size(exp) - 2] = {};
1974 assert(t.in_size <= array_size(in));
1975 assert(t.out_size <= array_size(out));
1976 mbstate_t state = {};
1977 const InternT* in_next = nullptr;
1978 char* out_next = nullptr;
1979 codecvt_base::result res = codecvt_base::ok;
1980
1981 res = cvt.out(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
1982 assert(res == cvt.ok);
1983 assert(in_next == in + t.in_size);
1984 assert(out_next == out + t.out_size);
1985 assert(char_traits<char>::compare(out, exp, t.out_size) == 0);
1986 if (t.out_size < array_size(out))
1987 assert(out[t.out_size] == 0);
1988 }
1989 }
1990
1991 template <class InternT>
ucs2_to_utf16_out_partial(const std::codecvt<InternT,char,mbstate_t> & cvt,utf16_endianess endianess)1992 void ucs2_to_utf16_out_partial(const std::codecvt<InternT, char, mbstate_t>& cvt, utf16_endianess endianess) {
1993 const char16_t input[] = {'b', 0x0448, 0xAAAA, 0};
1994 const char16_t expected[] = {'b', 0x0448, 0xAAAA, 0};
1995 static_assert(array_size(input) == 4, "");
1996 static_assert(array_size(expected) == 4, "");
1997
1998 InternT in[array_size(input)];
1999 char exp[array_size(expected) * 2];
2000 copy(begin(input), end(input), begin(in));
2001 utf16_to_bytes(begin(expected), end(expected), begin(exp), endianess);
2002
2003 test_offsets_partial offsets[] = {
2004 {1, 0, 0, 0}, // no space for first CP
2005 {1, 1, 0, 0}, // no space for first CP
2006
2007 {2, 2, 1, 2}, // no space for second CP
2008 {2, 3, 1, 2}, // no space for second CP
2009
2010 {3, 4, 2, 4}, // no space for third CP
2011 {3, 5, 2, 4}, // no space for third CP
2012 };
2013 for (test_offsets_partial* it = begin(offsets); it != end(offsets); ++it) {
2014 test_offsets_partial t = *it;
2015 char out[array_size(exp) - 2] = {};
2016 assert(t.in_size <= array_size(in));
2017 assert(t.out_size <= array_size(out));
2018 assert(t.expected_in_next <= t.in_size);
2019 assert(t.expected_out_next <= t.out_size);
2020 mbstate_t state = {};
2021 const InternT* in_next = nullptr;
2022 char* out_next = nullptr;
2023 codecvt_base::result res = codecvt_base::ok;
2024
2025 res = cvt.out(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
2026 assert(res == cvt.partial);
2027 assert(in_next == in + t.expected_in_next);
2028 assert(out_next == out + t.expected_out_next);
2029 assert(char_traits<char>::compare(out, exp, t.expected_out_next) == 0);
2030 if (t.expected_out_next < array_size(out))
2031 assert(out[t.expected_out_next] == 0);
2032 }
2033 }
2034
2035 template <class InternT>
ucs2_to_utf16_out_error(const std::codecvt<InternT,char,mbstate_t> & cvt,utf16_endianess endianess)2036 void ucs2_to_utf16_out_error(const std::codecvt<InternT, char, mbstate_t>& cvt, utf16_endianess endianess) {
2037 const char16_t input[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
2038 const char16_t expected[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
2039 static_assert(array_size(input) == 6, "");
2040 static_assert(array_size(expected) == 6, "");
2041
2042 InternT in[array_size(input)];
2043 char exp[array_size(expected) * 2];
2044 copy(begin(input), end(input), begin(in));
2045 utf16_to_bytes(begin(expected), end(expected), begin(exp), endianess);
2046
2047 test_offsets_error<InternT> offsets[] = {
2048 {3, 6, 0, 0, 0xD800, 0},
2049 {3, 6, 0, 0, 0xDBFF, 0},
2050 {3, 6, 0, 0, 0xDC00, 0},
2051 {3, 6, 0, 0, 0xDFFF, 0},
2052
2053 {3, 6, 1, 2, 0xD800, 1},
2054 {3, 6, 1, 2, 0xDBFF, 1},
2055 {3, 6, 1, 2, 0xDC00, 1},
2056 {3, 6, 1, 2, 0xDFFF, 1},
2057
2058 {3, 6, 2, 4, 0xD800, 2},
2059 {3, 6, 2, 4, 0xDBFF, 2},
2060 {3, 6, 2, 4, 0xDC00, 2},
2061 {3, 6, 2, 4, 0xDFFF, 2},
2062
2063 // make the leading surrogate a trailing one
2064 {5, 10, 3, 6, 0xDC00, 3},
2065 {5, 10, 3, 6, 0xDFFF, 3},
2066
2067 // make the trailing surrogate a leading one
2068 {5, 10, 3, 6, 0xD800, 4},
2069 {5, 10, 3, 6, 0xDBFF, 4},
2070
2071 // make the trailing surrogate a BMP char
2072 {5, 10, 3, 6, 'z', 4},
2073
2074 // don't replace anything in the test cases bellow, just show the surrogate
2075 // pair (fourth CP) fully or partially (just the first surrogate)
2076 {5, 10, 3, 6, 'b', 0},
2077 {5, 8, 3, 6, 'b', 0},
2078 {5, 9, 3, 6, 'b', 0},
2079
2080 {4, 10, 3, 6, 'b', 0},
2081 {4, 8, 3, 6, 'b', 0},
2082 {4, 9, 3, 6, 'b', 0},
2083 };
2084
2085 for (test_offsets_error<InternT>* it = begin(offsets); it != end(offsets); ++it) {
2086 test_offsets_error<InternT> t = *it;
2087 char out[array_size(exp) - 2] = {};
2088 assert(t.in_size <= array_size(in));
2089 assert(t.out_size <= array_size(out));
2090 assert(t.expected_in_next <= t.in_size);
2091 assert(t.expected_out_next <= t.out_size);
2092 InternT old_char = in[t.replace_pos];
2093 in[t.replace_pos] = t.replace_char;
2094
2095 mbstate_t state = {};
2096 const InternT* in_next = nullptr;
2097 char* out_next = nullptr;
2098 codecvt_base::result res = codecvt_base::ok;
2099
2100 res = cvt.out(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
2101 assert(res == cvt.error);
2102 assert(in_next == in + t.expected_in_next);
2103 assert(out_next == out + t.expected_out_next);
2104 assert(char_traits<char>::compare(out, exp, t.expected_out_next) == 0);
2105 if (t.expected_out_next < array_size(out))
2106 assert(out[t.expected_out_next] == 0);
2107
2108 in[t.replace_pos] = old_char;
2109 }
2110 }
2111
2112 template <class InternT>
test_utf16_ucs2_cvt(const std::codecvt<InternT,char,mbstate_t> & cvt,utf16_endianess endianess)2113 void test_utf16_ucs2_cvt(const std::codecvt<InternT, char, mbstate_t>& cvt, utf16_endianess endianess) {
2114 utf16_to_ucs2_in_ok(cvt, endianess);
2115 utf16_to_ucs2_in_partial(cvt, endianess);
2116 utf16_to_ucs2_in_error(cvt, endianess);
2117 ucs2_to_utf16_out_ok(cvt, endianess);
2118 ucs2_to_utf16_out_partial(cvt, endianess);
2119 ucs2_to_utf16_out_error(cvt, endianess);
2120 }
2121
2122 using std::codecvt;
2123 using std::codecvt_utf16;
2124 using std::codecvt_utf8;
2125 using std::codecvt_utf8_utf16;
2126 using std::has_facet;
2127 using std::locale;
2128 using std::use_facet;
2129
test_utf8_utf32_codecvts()2130 void test_utf8_utf32_codecvts() {
2131 typedef codecvt<char32_t, char, mbstate_t> codecvt_c32;
2132 const locale& loc_c = locale::classic();
2133 assert(has_facet<codecvt_c32>(loc_c));
2134
2135 const codecvt_c32& cvt = use_facet<codecvt_c32>(loc_c);
2136 test_utf8_utf32_cvt(cvt);
2137
2138 codecvt_utf8<char32_t> cvt2;
2139 test_utf8_utf32_cvt(cvt2);
2140
2141 #if !defined(TEST_HAS_NO_WIDE_CHARACTERS) && !defined(TEST_SHORT_WCHAR)
2142 codecvt_utf8<wchar_t> cvt3;
2143 test_utf8_utf32_cvt(cvt3);
2144 #endif
2145
2146 #ifndef TEST_HAS_NO_CHAR8_T
2147 typedef codecvt<char32_t, char8_t, mbstate_t> codecvt_c32_c8;
2148 assert(has_facet<codecvt_c32_c8>(loc_c));
2149 const codecvt_c32_c8& cvt4 = use_facet<codecvt_c32_c8>(loc_c);
2150 test_utf8_utf32_cvt(cvt4);
2151 #endif
2152 }
2153
test_utf8_utf16_codecvts()2154 void test_utf8_utf16_codecvts() {
2155 typedef codecvt<char16_t, char, mbstate_t> codecvt_c16;
2156 const locale& loc_c = locale::classic();
2157 assert(has_facet<codecvt_c16>(loc_c));
2158
2159 const codecvt_c16& cvt = use_facet<codecvt_c16>(loc_c);
2160 test_utf8_utf16_cvt(cvt);
2161
2162 codecvt_utf8_utf16<char16_t> cvt2;
2163 test_utf8_utf16_cvt(cvt2);
2164
2165 codecvt_utf8_utf16<char32_t> cvt3;
2166 test_utf8_utf16_cvt(cvt3);
2167
2168 #ifndef TEST_HAS_NO_WIDE_CHARACTERS
2169 codecvt_utf8_utf16<wchar_t> cvt4;
2170 test_utf8_utf16_cvt(cvt4);
2171 #endif
2172
2173 #ifndef TEST_HAS_NO_CHAR8_T
2174 typedef codecvt<char16_t, char8_t, mbstate_t> codecvt_c16_c8;
2175 assert(has_facet<codecvt_c16_c8>(loc_c));
2176 const codecvt_c16_c8& cvt5 = use_facet<codecvt_c16_c8>(loc_c);
2177 test_utf8_utf16_cvt(cvt5);
2178 #endif
2179 }
2180
test_utf8_ucs2_codecvts()2181 void test_utf8_ucs2_codecvts() {
2182 codecvt_utf8<char16_t> cvt;
2183 test_utf8_ucs2_cvt(cvt);
2184
2185 #if !defined(TEST_HAS_NO_WIDE_CHARACTERS) && defined(TEST_SHORT_WCHAR)
2186 codecvt_utf8<wchar_t> cvt2;
2187 test_utf8_ucs2_cvt(cvt2);
2188 #endif
2189 }
2190
test_utf16_utf32_codecvts()2191 void test_utf16_utf32_codecvts() {
2192 codecvt_utf16<char32_t> cvt;
2193 test_utf16_utf32_cvt(cvt, utf16_big_endian);
2194
2195 codecvt_utf16<char32_t, 0x10FFFF, std::little_endian> cvt2;
2196 test_utf16_utf32_cvt(cvt2, utf16_little_endian);
2197
2198 #if !defined(TEST_HAS_NO_WIDE_CHARACTERS) && !defined(TEST_SHORT_WCHAR)
2199 codecvt_utf16<wchar_t> cvt3;
2200 test_utf16_utf32_cvt(cvt3, utf16_big_endian);
2201
2202 codecvt_utf16<wchar_t, 0x10FFFF, std::little_endian> cvt4;
2203 test_utf16_utf32_cvt(cvt4, utf16_little_endian);
2204 #endif
2205 }
2206
test_utf16_ucs2_codecvts()2207 void test_utf16_ucs2_codecvts() {
2208 codecvt_utf16<char16_t> cvt;
2209 test_utf16_ucs2_cvt(cvt, utf16_big_endian);
2210
2211 codecvt_utf16<char16_t, 0x10FFFF, std::little_endian> cvt2;
2212 test_utf16_ucs2_cvt(cvt2, utf16_little_endian);
2213
2214 #if !defined(TEST_HAS_NO_WIDE_CHARACTERS) && defined(TEST_SHORT_WCHAR)
2215 codecvt_utf16<wchar_t> cvt3;
2216 test_utf16_ucs2_cvt(cvt3, utf16_big_endian);
2217
2218 codecvt_utf16<wchar_t, 0x10FFFF, std::little_endian> cvt4;
2219 test_utf16_ucs2_cvt(cvt4, utf16_little_endian);
2220 #endif
2221 }
2222
main()2223 int main() {
2224 test_utf8_utf32_codecvts();
2225 test_utf8_utf16_codecvts();
2226 test_utf8_ucs2_codecvts();
2227 test_utf16_utf32_codecvts();
2228 test_utf16_ucs2_codecvts();
2229 }
2230