• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 //===----------------------------------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 // ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS -D_LIBCPP_ENABLE_CXX26_REMOVED_CODECVT
10 // XFAIL: stdlib=apple-libc++ && target={{.+}}-apple-macosx{{10.9|10.10|10.11|10.12|10.13|10.14|10.15|11.0|12.0|13.0}}
11 
12 #include <algorithm>
13 #include <cassert>
14 #include <codecvt>
15 #include <locale>
16 
17 #include "test_macros.h"
18 
// One test case for a conversion that is expected to succeed completely.
struct test_offsets_ok {
  size_t in_size;  // number of input elements offered to the conversion
  size_t out_size; // number of output elements offered / expected to be produced
};
// One test case for a conversion that is expected to stop early with
// codecvt_base::partial.
struct test_offsets_partial {
  size_t in_size;           // number of input elements offered to the conversion
  size_t out_size;          // number of output elements offered to the conversion
  size_t expected_in_next;  // expected offset of the returned input "next" pointer
  size_t expected_out_next; // expected offset of the returned output "next" pointer
};
29 
// One test case for a conversion that is expected to fail with
// codecvt_base::error after a single input element has been overwritten.
template <class CharT>
struct test_offsets_error {
  size_t in_size;           // number of input elements offered to the conversion
  size_t out_size;          // number of output elements offered to the conversion
  size_t expected_in_next;  // expected offset of the returned input "next" pointer
  size_t expected_out_next; // expected offset of the returned output "next" pointer
  CharT replace_char;       // value written into the input before converting
  size_t replace_pos;       // index of the input element that gets overwritten
};
39 
// Number of elements in the built-in array x, usable in constant expressions
// such as array bounds and static_assert.
#define array_size(x) (sizeof(x) / sizeof((x)[0]))
41 
// Names from namespace std that the tests below use unqualified.
using std::begin;
using std::char_traits;
using std::codecvt_base;
using std::copy;
using std::end;
47 
48 template <class InternT, class ExternT>
utf8_to_utf32_in_ok(const std::codecvt<InternT,ExternT,mbstate_t> & cvt)49 void utf8_to_utf32_in_ok(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
50   // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP
51   const unsigned char input[] = "b\u0448\uAAAA\U0010AAAA";
52   const char32_t expected[]   = {'b', 0x0448, 0xAAAA, 0x10AAAA, 0};
53   static_assert(array_size(input) == 11, "");
54   static_assert(array_size(expected) == 5, "");
55 
56   ExternT in[array_size(input)];
57   InternT exp[array_size(expected)];
58   copy(begin(input), end(input), begin(in));
59   copy(begin(expected), end(expected), begin(exp));
60   assert(char_traits<ExternT>::length(in) == 10);
61   assert(char_traits<InternT>::length(exp) == 4);
62   test_offsets_ok offsets[] = {{0, 0}, {1, 1}, {3, 2}, {6, 3}, {10, 4}};
63   for (test_offsets_ok* it = begin(offsets); it != end(offsets); ++it) {
64     test_offsets_ok t                = *it;
65     InternT out[array_size(exp) - 1] = {};
66     assert(t.in_size <= array_size(in));
67     assert(t.out_size <= array_size(out));
68     mbstate_t state          = {};
69     const ExternT* in_next   = nullptr;
70     InternT* out_next        = nullptr;
71     codecvt_base::result res = codecvt_base::ok;
72 
73     res = cvt.in(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
74     assert(res == cvt.ok);
75     assert(in_next == in + t.in_size);
76     assert(out_next == out + t.out_size);
77     assert(char_traits<InternT>::compare(out, exp, t.out_size) == 0);
78     if (t.out_size < array_size(out))
79       assert(out[t.out_size] == 0);
80 
81     state   = mbstate_t();
82     int len = cvt.length(state, in, in + t.in_size, t.out_size);
83     assert(len >= 0);
84     assert(static_cast<size_t>(len) == t.in_size);
85   }
86 
87   for (test_offsets_ok* it = begin(offsets); it != end(offsets); ++it) {
88     test_offsets_ok t            = *it;
89     InternT out[array_size(exp)] = {};
90     assert(t.in_size <= array_size(in));
91     assert(t.out_size <= array_size(out));
92     mbstate_t state          = {};
93     const ExternT* in_next   = nullptr;
94     InternT* out_next        = nullptr;
95     codecvt_base::result res = codecvt_base::ok;
96 
97     res = cvt.in(state, in, in + t.in_size, in_next, out, end(out), out_next);
98     assert(res == cvt.ok);
99     assert(in_next == in + t.in_size);
100     assert(out_next == out + t.out_size);
101     assert(char_traits<InternT>::compare(out, exp, t.out_size) == 0);
102     if (t.out_size < array_size(out))
103       assert(out[t.out_size] == 0);
104 
105     state   = mbstate_t();
106     int len = cvt.length(state, in, in + t.in_size, array_size(out));
107     assert(len >= 0);
108     assert(static_cast<size_t>(len) == t.in_size);
109   }
110 }
111 
112 template <class InternT, class ExternT>
utf8_to_utf32_in_partial(const std::codecvt<InternT,ExternT,mbstate_t> & cvt)113 void utf8_to_utf32_in_partial(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
114   // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP
115   const unsigned char input[] = "b\u0448\uAAAA\U0010AAAA";
116   const char32_t expected[]   = {'b', 0x0448, 0xAAAA, 0x10AAAA, 0};
117   static_assert(array_size(input) == 11, "");
118   static_assert(array_size(expected) == 5, "");
119 
120   ExternT in[array_size(input)];
121   InternT exp[array_size(expected)];
122   copy(begin(input), end(input), begin(in));
123   copy(begin(expected), end(expected), begin(exp));
124   assert(char_traits<ExternT>::length(in) == 10);
125   assert(char_traits<InternT>::length(exp) == 4);
126 
127   test_offsets_partial offsets[] = {
128       {1, 0, 0, 0}, // no space for first CP
129 
130       {3, 1, 1, 1}, // no space for second CP
131       {2, 2, 1, 1}, // incomplete second CP
132       {2, 1, 1, 1}, // incomplete second CP, and no space for it
133 
134       {6, 2, 3, 2}, // no space for third CP
135       {4, 3, 3, 2}, // incomplete third CP
136       {5, 3, 3, 2}, // incomplete third CP
137       {4, 2, 3, 2}, // incomplete third CP, and no space for it
138       {5, 2, 3, 2}, // incomplete third CP, and no space for it
139 
140       {10, 3, 6, 3}, // no space for fourth CP
141       {7, 4, 6, 3},  // incomplete fourth CP
142       {8, 4, 6, 3},  // incomplete fourth CP
143       {9, 4, 6, 3},  // incomplete fourth CP
144       {7, 3, 6, 3},  // incomplete fourth CP, and no space for it
145       {8, 3, 6, 3},  // incomplete fourth CP, and no space for it
146       {9, 3, 6, 3},  // incomplete fourth CP, and no space for it
147   };
148 
149   for (test_offsets_partial* it = begin(offsets); it != end(offsets); ++it) {
150     test_offsets_partial t           = *it;
151     InternT out[array_size(exp) - 1] = {};
152     assert(t.in_size <= array_size(in));
153     assert(t.out_size <= array_size(out));
154     assert(t.expected_in_next <= t.in_size);
155     assert(t.expected_out_next <= t.out_size);
156     mbstate_t state          = {};
157     const ExternT* in_next   = nullptr;
158     InternT* out_next        = nullptr;
159     codecvt_base::result res = codecvt_base::ok;
160 
161     res = cvt.in(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
162     assert(res == cvt.partial);
163     assert(in_next == in + t.expected_in_next);
164     assert(out_next == out + t.expected_out_next);
165     assert(char_traits<InternT>::compare(out, exp, t.expected_out_next) == 0);
166     if (t.expected_out_next < array_size(out))
167       assert(out[t.expected_out_next] == 0);
168 
169     state   = mbstate_t();
170     int len = cvt.length(state, in, in + t.in_size, t.out_size);
171     assert(len >= 0);
172     assert(static_cast<size_t>(len) == t.expected_in_next);
173   }
174 }
175 
176 template <class InternT, class ExternT>
utf8_to_utf32_in_error(const std::codecvt<InternT,ExternT,mbstate_t> & cvt)177 void utf8_to_utf32_in_error(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
178   // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP, 4-byte CP
179   const unsigned char input[] = "b\u0448\uD700\U0010AAAA";
180   const char32_t expected[]   = {'b', 0x0448, 0xD700, 0x10AAAA, 0};
181   static_assert(array_size(input) == 11, "");
182   static_assert(array_size(expected) == 5, "");
183 
184   ExternT in[array_size(input)];
185   InternT exp[array_size(expected)];
186   copy(begin(input), end(input), begin(in));
187   copy(begin(expected), end(expected), begin(exp));
188   assert(char_traits<ExternT>::length(in) == 10);
189   assert(char_traits<InternT>::length(exp) == 4);
190 
191   // There are 5 classes of errors in UTF-8 decoding
192   // 1. Missing leading byte
193   // 2. Missing trailing byte
194   // 3. Surrogate CP
195   // 4. Overlong sequence
196   // 5. CP out of Unicode range
197   test_offsets_error<unsigned char> offsets[] = {
198 
199       // 1. Missing leading byte. We will replace the leading byte with
200       // non-leading byte, such as a byte that is always invalid or a trailing
201       // byte.
202 
203       // replace leading byte with invalid byte
204       {1, 4, 0, 0, 0xFF, 0},
205       {3, 4, 1, 1, 0xFF, 1},
206       {6, 4, 3, 2, 0xFF, 3},
207       {10, 4, 6, 3, 0xFF, 6},
208 
209       // replace leading byte with trailing byte
210       {1, 4, 0, 0, 0b10101010, 0},
211       {3, 4, 1, 1, 0b10101010, 1},
212       {6, 4, 3, 2, 0b10101010, 3},
213       {10, 4, 6, 3, 0b10101010, 6},
214 
215       // 2. Missing trailing byte. We will replace the trailing byte with
216       // non-trailing byte, such as a byte that is always invalid or a leading
217       // byte (simple ASCII byte in our case).
218 
219       // replace first trailing byte with ASCII byte
220       {3, 4, 1, 1, 'z', 2},
221       {6, 4, 3, 2, 'z', 4},
222       {10, 4, 6, 3, 'z', 7},
223 
224       // replace first trailing byte with invalid byte
225       {3, 4, 1, 1, 0xFF, 2},
226       {6, 4, 3, 2, 0xFF, 4},
227       {10, 4, 6, 3, 0xFF, 7},
228 
229       // replace second trailing byte with ASCII byte
230       {6, 4, 3, 2, 'z', 5},
231       {10, 4, 6, 3, 'z', 8},
232 
233       // replace second trailing byte with invalid byte
234       {6, 4, 3, 2, 0xFF, 5},
235       {10, 4, 6, 3, 0xFF, 8},
236 
237       // replace third trailing byte
238       {10, 4, 6, 3, 'z', 9},
239       {10, 4, 6, 3, 0xFF, 9},
240 
241       // 2.1 The following test-cases raise doubt whether error or partial should
242       // be returned. For example, we have 4-byte sequence with valid leading
243       // byte. If we hide the last byte we need to return partial. But, if the
244       // second or third byte, which are visible to the call to codecvt, are
245       // malformed then error should be returned.
246 
247       // replace first trailing byte with ASCII byte, also incomplete at end
248       {5, 4, 3, 2, 'z', 4},
249       {8, 4, 6, 3, 'z', 7},
250       {9, 4, 6, 3, 'z', 7},
251 
252       // replace first trailing byte with invalid byte, also incomplete at end
253       {5, 4, 3, 2, 0xFF, 4},
254       {8, 4, 6, 3, 0xFF, 7},
255       {9, 4, 6, 3, 0xFF, 7},
256 
257       // replace second trailing byte with ASCII byte, also incomplete at end
258       {9, 4, 6, 3, 'z', 8},
259 
260       // replace second trailing byte with invalid byte, also incomplete at end
261       {9, 4, 6, 3, 0xFF, 8},
262 
263       // 3. Surrogate CP. We modify the second byte (first trailing) of the 3-byte
264       // CP U+D700
265       {6, 4, 3, 2, 0b10100000, 4}, // turn U+D700 into U+D800
266       {6, 4, 3, 2, 0b10101100, 4}, // turn U+D700 into U+DB00
267       {6, 4, 3, 2, 0b10110000, 4}, // turn U+D700 into U+DC00
268       {6, 4, 3, 2, 0b10111100, 4}, // turn U+D700 into U+DF00
269 
270       // 4. Overlong sequence. The CPs in the input are chosen such as modifying
271       // just the leading byte is enough to make them overlong, i.e. for the
272       // 3-byte and 4-byte CP the second byte (first trailing) has enough leading
273       // zeroes.
274       {3, 4, 1, 1, 0b11000000, 1},  // make the 2-byte CP overlong
275       {3, 4, 1, 1, 0b11000001, 1},  // make the 2-byte CP overlong
276       {6, 4, 3, 2, 0b11100000, 3},  // make the 3-byte CP overlong
277       {10, 4, 6, 3, 0b11110000, 6}, // make the 4-byte CP overlong
278 
279       // 5. CP above range
280       // turn U+10AAAA into U+14AAAA by changing its leading byte
281       {10, 4, 6, 3, 0b11110101, 6},
282       // turn U+10AAAA into U+11AAAA by changing its 2nd byte
283       {10, 4, 6, 3, 0b10011010, 7},
284   };
285   for (test_offsets_error<unsigned char>* it = begin(offsets); it != end(offsets); ++it) {
286     test_offsets_error<unsigned char> t = *it;
287     InternT out[array_size(exp) - 1]    = {};
288     assert(t.in_size <= array_size(in));
289     assert(t.out_size <= array_size(out));
290     assert(t.expected_in_next <= t.in_size);
291     assert(t.expected_out_next <= t.out_size);
292     ExternT old_char  = in[t.replace_pos];
293     in[t.replace_pos] = t.replace_char;
294 
295     mbstate_t state          = {};
296     const ExternT* in_next   = nullptr;
297     InternT* out_next        = nullptr;
298     codecvt_base::result res = codecvt_base::ok;
299 
300     res = cvt.in(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
301     assert(res == cvt.error);
302     assert(in_next == in + t.expected_in_next);
303     assert(out_next == out + t.expected_out_next);
304     assert(char_traits<InternT>::compare(out, exp, t.expected_out_next) == 0);
305     if (t.expected_out_next < array_size(out))
306       assert(out[t.expected_out_next] == 0);
307 
308     state   = mbstate_t();
309     int len = cvt.length(state, in, in + t.in_size, t.out_size);
310     assert(len >= 0);
311     assert(static_cast<size_t>(len) == t.expected_in_next);
312 
313     in[t.replace_pos] = old_char;
314   }
315 }
316 
// Runs the ok, partial and error checks for UTF-8 to UTF-32 decoding
// (cvt.in() and cvt.length()) against the given facet.
template <class InternT, class ExternT>
void utf8_to_utf32_in(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
  utf8_to_utf32_in_ok(cvt);
  utf8_to_utf32_in_partial(cvt);
  utf8_to_utf32_in_error(cvt);
}
323 
324 template <class InternT, class ExternT>
utf32_to_utf8_out_ok(const std::codecvt<InternT,ExternT,mbstate_t> & cvt)325 void utf32_to_utf8_out_ok(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
326   // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP
327   const char32_t input[]         = {'b', 0x0448, 0xAAAA, 0x10AAAA, 0};
328   const unsigned char expected[] = "b\u0448\uAAAA\U0010AAAA";
329   static_assert(array_size(input) == 5, "");
330   static_assert(array_size(expected) == 11, "");
331 
332   InternT in[array_size(input)];
333   ExternT exp[array_size(expected)];
334   copy(begin(input), end(input), begin(in));
335   copy(begin(expected), end(expected), begin(exp));
336   assert(char_traits<InternT>::length(in) == 4);
337   assert(char_traits<ExternT>::length(exp) == 10);
338 
339   test_offsets_ok offsets[] = {{0, 0}, {1, 1}, {2, 3}, {3, 6}, {4, 10}};
340   for (test_offsets_ok* it = begin(offsets); it != end(offsets); ++it) {
341     test_offsets_ok t                = *it;
342     ExternT out[array_size(exp) - 1] = {};
343     assert(t.in_size <= array_size(in));
344     assert(t.out_size <= array_size(out));
345     mbstate_t state          = {};
346     const InternT* in_next   = nullptr;
347     ExternT* out_next        = nullptr;
348     codecvt_base::result res = codecvt_base::ok;
349 
350     res = cvt.out(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
351     assert(res == cvt.ok);
352     assert(in_next == in + t.in_size);
353     assert(out_next == out + t.out_size);
354     assert(char_traits<ExternT>::compare(out, exp, t.out_size) == 0);
355     if (t.out_size < array_size(out))
356       assert(out[t.out_size] == 0);
357   }
358 }
359 
360 template <class InternT, class ExternT>
utf32_to_utf8_out_partial(const std::codecvt<InternT,ExternT,mbstate_t> & cvt)361 void utf32_to_utf8_out_partial(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
362   // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP
363   const char32_t input[]         = {'b', 0x0448, 0xAAAA, 0x10AAAA, 0};
364   const unsigned char expected[] = "b\u0448\uAAAA\U0010AAAA";
365   static_assert(array_size(input) == 5, "");
366   static_assert(array_size(expected) == 11, "");
367 
368   InternT in[array_size(input)];
369   ExternT exp[array_size(expected)];
370   copy(begin(input), end(input), begin(in));
371   copy(begin(expected), end(expected), begin(exp));
372   assert(char_traits<InternT>::length(in) == 4);
373   assert(char_traits<ExternT>::length(exp) == 10);
374 
375   test_offsets_partial offsets[] = {
376       {1, 0, 0, 0}, // no space for first CP
377 
378       {2, 1, 1, 1}, // no space for second CP
379       {2, 2, 1, 1}, // no space for second CP
380 
381       {3, 3, 2, 3}, // no space for third CP
382       {3, 4, 2, 3}, // no space for third CP
383       {3, 5, 2, 3}, // no space for third CP
384 
385       {4, 6, 3, 6}, // no space for fourth CP
386       {4, 7, 3, 6}, // no space for fourth CP
387       {4, 8, 3, 6}, // no space for fourth CP
388       {4, 9, 3, 6}, // no space for fourth CP
389   };
390   for (test_offsets_partial* it = begin(offsets); it != end(offsets); ++it) {
391     test_offsets_partial t           = *it;
392     ExternT out[array_size(exp) - 1] = {};
393     assert(t.in_size <= array_size(in));
394     assert(t.out_size <= array_size(out));
395     assert(t.expected_in_next <= t.in_size);
396     assert(t.expected_out_next <= t.out_size);
397     mbstate_t state          = {};
398     const InternT* in_next   = nullptr;
399     ExternT* out_next        = nullptr;
400     codecvt_base::result res = codecvt_base::ok;
401 
402     res = cvt.out(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
403     assert(res == cvt.partial);
404     assert(in_next == in + t.expected_in_next);
405     assert(out_next == out + t.expected_out_next);
406     assert(char_traits<ExternT>::compare(out, exp, t.expected_out_next) == 0);
407     if (t.expected_out_next < array_size(out))
408       assert(out[t.expected_out_next] == 0);
409   }
410 }
411 
412 template <class InternT, class ExternT>
utf32_to_utf8_out_error(const std::codecvt<InternT,ExternT,mbstate_t> & cvt)413 void utf32_to_utf8_out_error(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
414   // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP
415   const char32_t input[]         = {'b', 0x0448, 0xAAAA, 0x10AAAA, 0};
416   const unsigned char expected[] = "b\u0448\uAAAA\U0010AAAA";
417   static_assert(array_size(input) == 5, "");
418   static_assert(array_size(expected) == 11, "");
419 
420   InternT in[array_size(input)];
421   ExternT exp[array_size(expected)];
422   copy(begin(input), end(input), begin(in));
423   copy(begin(expected), end(expected), begin(exp));
424   assert(char_traits<InternT>::length(in) == 4);
425   assert(char_traits<ExternT>::length(exp) == 10);
426 
427   test_offsets_error<InternT> offsets[] = {
428 
429       // Surrogate CP
430       {4, 10, 0, 0, 0xD800, 0},
431       {4, 10, 1, 1, 0xDBFF, 1},
432       {4, 10, 2, 3, 0xDC00, 2},
433       {4, 10, 3, 6, 0xDFFF, 3},
434 
435       // CP out of range
436       {4, 10, 0, 0, 0x00110000, 0},
437       {4, 10, 1, 1, 0x00110000, 1},
438       {4, 10, 2, 3, 0x00110000, 2},
439       {4, 10, 3, 6, 0x00110000, 3}};
440 
441   for (test_offsets_error<InternT>* it = begin(offsets); it != end(offsets); ++it) {
442     test_offsets_error<InternT> t    = *it;
443     ExternT out[array_size(exp) - 1] = {};
444     assert(t.in_size <= array_size(in));
445     assert(t.out_size <= array_size(out));
446     assert(t.expected_in_next <= t.in_size);
447     assert(t.expected_out_next <= t.out_size);
448     InternT old_char  = in[t.replace_pos];
449     in[t.replace_pos] = t.replace_char;
450 
451     mbstate_t state          = {};
452     const InternT* in_next   = nullptr;
453     ExternT* out_next        = nullptr;
454     codecvt_base::result res = codecvt_base::ok;
455 
456     res = cvt.out(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
457     assert(res == cvt.error);
458     assert(in_next == in + t.expected_in_next);
459     assert(out_next == out + t.expected_out_next);
460     assert(char_traits<ExternT>::compare(out, exp, t.expected_out_next) == 0);
461     if (t.expected_out_next < array_size(out))
462       assert(out[t.expected_out_next] == 0);
463 
464     in[t.replace_pos] = old_char;
465   }
466 }
467 
// Runs the ok, partial and error checks for UTF-32 to UTF-8 encoding
// (cvt.out()) against the given facet.
template <class InternT, class ExternT>
void utf32_to_utf8_out(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
  utf32_to_utf8_out_ok(cvt);
  utf32_to_utf8_out_partial(cvt);
  utf32_to_utf8_out_error(cvt);
}
474 
// Entry point for a UTF-8 <-> UTF-32 facet: runs the decoding checks
// (utf8_to_utf32_in) followed by the encoding checks (utf32_to_utf8_out).
template <class InternT, class ExternT>
void test_utf8_utf32_cvt(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
  utf8_to_utf32_in(cvt);
  utf32_to_utf8_out(cvt);
}
480 
481 template <class InternT, class ExternT>
utf8_to_utf16_in_ok(const std::codecvt<InternT,ExternT,mbstate_t> & cvt)482 void utf8_to_utf16_in_ok(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
483   // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP
484   const unsigned char input[] = "b\u0448\uAAAA\U0010AAAA";
485   const char16_t expected[]   = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
486   static_assert(array_size(input) == 11, "");
487   static_assert(array_size(expected) == 6, "");
488 
489   ExternT in[array_size(input)];
490   InternT exp[array_size(expected)];
491   copy(begin(input), end(input), begin(in));
492   copy(begin(expected), end(expected), begin(exp));
493   assert(char_traits<ExternT>::length(in) == 10);
494   assert(char_traits<InternT>::length(exp) == 5);
495 
496   test_offsets_ok offsets[] = {{0, 0}, {1, 1}, {3, 2}, {6, 3}, {10, 5}};
497   for (test_offsets_ok* it = begin(offsets); it != end(offsets); ++it) {
498     test_offsets_ok t                = *it;
499     InternT out[array_size(exp) - 1] = {};
500     assert(t.in_size <= array_size(in));
501     assert(t.out_size <= array_size(out));
502     mbstate_t state          = {};
503     const ExternT* in_next   = nullptr;
504     InternT* out_next        = nullptr;
505     codecvt_base::result res = codecvt_base::ok;
506 
507     res = cvt.in(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
508     assert(res == cvt.ok);
509     assert(in_next == in + t.in_size);
510     assert(out_next == out + t.out_size);
511     assert(char_traits<InternT>::compare(out, exp, t.out_size) == 0);
512     if (t.out_size < array_size(out))
513       assert(out[t.out_size] == 0);
514 
515     state   = mbstate_t();
516     int len = cvt.length(state, in, in + t.in_size, t.out_size);
517     assert(len >= 0);
518     assert(static_cast<size_t>(len) == t.in_size);
519   }
520 
521   for (test_offsets_ok* it = begin(offsets); it != end(offsets); ++it) {
522     test_offsets_ok t            = *it;
523     InternT out[array_size(exp)] = {};
524     assert(t.in_size <= array_size(in));
525     assert(t.out_size <= array_size(out));
526     mbstate_t state          = {};
527     const ExternT* in_next   = nullptr;
528     InternT* out_next        = nullptr;
529     codecvt_base::result res = codecvt_base::ok;
530 
531     res = cvt.in(state, in, in + t.in_size, in_next, out, end(out), out_next);
532     assert(res == cvt.ok);
533     assert(in_next == in + t.in_size);
534     assert(out_next == out + t.out_size);
535     assert(char_traits<InternT>::compare(out, exp, t.out_size) == 0);
536     if (t.out_size < array_size(out))
537       assert(out[t.out_size] == 0);
538 
539     state   = mbstate_t();
540     int len = cvt.length(state, in, in + t.in_size, array_size(out));
541     assert(len >= 0);
542     assert(static_cast<size_t>(len) == t.in_size);
543   }
544 }
545 
546 template <class InternT, class ExternT>
utf8_to_utf16_in_partial(const std::codecvt<InternT,ExternT,mbstate_t> & cvt)547 void utf8_to_utf16_in_partial(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
548   // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP
549   const unsigned char input[] = "b\u0448\uAAAA\U0010AAAA";
550   const char16_t expected[]   = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
551   static_assert(array_size(input) == 11, "");
552   static_assert(array_size(expected) == 6, "");
553 
554   ExternT in[array_size(input)];
555   InternT exp[array_size(expected)];
556   copy(begin(input), end(input), begin(in));
557   copy(begin(expected), end(expected), begin(exp));
558   assert(char_traits<ExternT>::length(in) == 10);
559   assert(char_traits<InternT>::length(exp) == 5);
560 
561   test_offsets_partial offsets[] = {
562       {1, 0, 0, 0}, // no space for first CP
563 
564       {3, 1, 1, 1}, // no space for second CP
565       {2, 2, 1, 1}, // incomplete second CP
566       {2, 1, 1, 1}, // incomplete second CP, and no space for it
567 
568       {6, 2, 3, 2}, // no space for third CP
569       {4, 3, 3, 2}, // incomplete third CP
570       {5, 3, 3, 2}, // incomplete third CP
571       {4, 2, 3, 2}, // incomplete third CP, and no space for it
572       {5, 2, 3, 2}, // incomplete third CP, and no space for it
573 
574       {10, 3, 6, 3}, // no space for fourth CP
575       {10, 4, 6, 3}, // no space for fourth CP
576       {7, 5, 6, 3},  // incomplete fourth CP
577       {8, 5, 6, 3},  // incomplete fourth CP
578       {9, 5, 6, 3},  // incomplete fourth CP
579       {7, 3, 6, 3},  // incomplete fourth CP, and no space for it
580       {8, 3, 6, 3},  // incomplete fourth CP, and no space for it
581       {9, 3, 6, 3},  // incomplete fourth CP, and no space for it
582       {7, 4, 6, 3},  // incomplete fourth CP, and no space for it
583       {8, 4, 6, 3},  // incomplete fourth CP, and no space for it
584       {9, 4, 6, 3},  // incomplete fourth CP, and no space for it
585 
586   };
587 
588   for (test_offsets_partial* it = begin(offsets); it != end(offsets); ++it) {
589     test_offsets_partial t           = *it;
590     InternT out[array_size(exp) - 1] = {};
591     assert(t.in_size <= array_size(in));
592     assert(t.out_size <= array_size(out));
593     assert(t.expected_in_next <= t.in_size);
594     assert(t.expected_out_next <= t.out_size);
595     mbstate_t state          = {};
596     const ExternT* in_next   = nullptr;
597     InternT* out_next        = nullptr;
598     codecvt_base::result res = codecvt_base::ok;
599 
600     res = cvt.in(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
601     assert(res == cvt.partial);
602     assert(in_next == in + t.expected_in_next);
603     assert(out_next == out + t.expected_out_next);
604     assert(char_traits<InternT>::compare(out, exp, t.expected_out_next) == 0);
605     if (t.expected_out_next < array_size(out))
606       assert(out[t.expected_out_next] == 0);
607 
608     state   = mbstate_t();
609     int len = cvt.length(state, in, in + t.in_size, t.out_size);
610     assert(len >= 0);
611     assert(static_cast<size_t>(len) == t.expected_in_next);
612   }
613 }
614 
615 template <class InternT, class ExternT>
utf8_to_utf16_in_error(const std::codecvt<InternT,ExternT,mbstate_t> & cvt)616 void utf8_to_utf16_in_error(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
617   // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP, 4-byte CP
618   const unsigned char input[] = "b\u0448\uD700\U0010AAAA";
619   const char16_t expected[]   = {'b', 0x0448, 0xD700, 0xDBEA, 0xDEAA, 0};
620   static_assert(array_size(input) == 11, "");
621   static_assert(array_size(expected) == 6, "");
622 
623   ExternT in[array_size(input)];
624   InternT exp[array_size(expected)];
625   copy(begin(input), end(input), begin(in));
626   copy(begin(expected), end(expected), begin(exp));
627   assert(char_traits<ExternT>::length(in) == 10);
628   assert(char_traits<InternT>::length(exp) == 5);
629 
630   // There are 5 classes of errors in UTF-8 decoding
631   // 1. Missing leading byte
632   // 2. Missing trailing byte
633   // 3. Surrogate CP
634   // 4. Overlong sequence
635   // 5. CP out of Unicode range
636   test_offsets_error<unsigned char> offsets[] = {
637 
638       // 1. Missing leading byte. We will replace the leading byte with
639       // non-leading byte, such as a byte that is always invalid or a trailing
640       // byte.
641 
642       // replace leading byte with invalid byte
643       {1, 5, 0, 0, 0xFF, 0},
644       {3, 5, 1, 1, 0xFF, 1},
645       {6, 5, 3, 2, 0xFF, 3},
646       {10, 5, 6, 3, 0xFF, 6},
647 
648       // replace leading byte with trailing byte
649       {1, 5, 0, 0, 0b10101010, 0},
650       {3, 5, 1, 1, 0b10101010, 1},
651       {6, 5, 3, 2, 0b10101010, 3},
652       {10, 5, 6, 3, 0b10101010, 6},
653 
654       // 2. Missing trailing byte. We will replace the trailing byte with
655       // non-trailing byte, such as a byte that is always invalid or a leading
656       // byte (simple ASCII byte in our case).
657 
658       // replace first trailing byte with ASCII byte
659       {3, 5, 1, 1, 'z', 2},
660       {6, 5, 3, 2, 'z', 4},
661       {10, 5, 6, 3, 'z', 7},
662 
663       // replace first trailing byte with invalid byte
664       {3, 5, 1, 1, 0xFF, 2},
665       {6, 5, 3, 2, 0xFF, 4},
666       {10, 5, 6, 3, 0xFF, 7},
667 
668       // replace second trailing byte with ASCII byte
669       {6, 5, 3, 2, 'z', 5},
670       {10, 5, 6, 3, 'z', 8},
671 
672       // replace second trailing byte with invalid byte
673       {6, 5, 3, 2, 0xFF, 5},
674       {10, 5, 6, 3, 0xFF, 8},
675 
676       // replace third trailing byte
677       {10, 5, 6, 3, 'z', 9},
678       {10, 5, 6, 3, 0xFF, 9},
679 
680       // 2.1 The following test-cases raise doubt whether error or partial should
681       // be returned. For example, we have 4-byte sequence with valid leading
682       // byte. If we hide the last byte we need to return partial. But, if the
683       // second or third byte, which are visible to the call to codecvt, are
684       // malformed then error should be returned.
685 
686       // replace first trailing byte with ASCII byte, also incomplete at end
687       {5, 5, 3, 2, 'z', 4},
688       {8, 5, 6, 3, 'z', 7},
689       {9, 5, 6, 3, 'z', 7},
690 
691       // replace first trailing byte with invalid byte, also incomplete at end
692       {5, 5, 3, 2, 0xFF, 4},
693       {8, 5, 6, 3, 0xFF, 7},
694       {9, 5, 6, 3, 0xFF, 7},
695 
696       // replace second trailing byte with ASCII byte, also incomplete at end
697       {9, 5, 6, 3, 'z', 8},
698 
699       // replace second trailing byte with invalid byte, also incomplete at end
700       {9, 5, 6, 3, 0xFF, 8},
701 
702       // 3. Surrogate CP. We modify the second byte (first trailing) of the 3-byte
703       // CP U+D700
704       {6, 5, 3, 2, 0b10100000, 4}, // turn U+D700 into U+D800
705       {6, 5, 3, 2, 0b10101100, 4}, // turn U+D700 into U+DB00
706       {6, 5, 3, 2, 0b10110000, 4}, // turn U+D700 into U+DC00
707       {6, 5, 3, 2, 0b10111100, 4}, // turn U+D700 into U+DF00
708 
709       // 4. Overlong sequence. The CPs in the input are chosen such as modifying
710       // just the leading byte is enough to make them overlong, i.e. for the
711       // 3-byte and 4-byte CP the second byte (first trailing) has enough leading
712       // zeroes.
713       {3, 5, 1, 1, 0b11000000, 1},  // make the 2-byte CP overlong
714       {3, 5, 1, 1, 0b11000001, 1},  // make the 2-byte CP overlong
715       {6, 5, 3, 2, 0b11100000, 3},  // make the 3-byte CP overlong
716       {10, 5, 6, 3, 0b11110000, 6}, // make the 4-byte CP overlong
717 
718       // 5. CP above range
719       // turn U+10AAAA into U+14AAAA by changing its leading byte
720       {10, 5, 6, 3, 0b11110101, 6},
721       // turn U+10AAAA into U+11AAAA by changing its 2nd byte
722       {10, 5, 6, 3, 0b10011010, 7},
723   };
724   for (test_offsets_error<unsigned char>* it = begin(offsets); it != end(offsets); ++it) {
725     test_offsets_error<unsigned char> t = *it;
726     InternT out[array_size(exp) - 1]    = {};
727     assert(t.in_size <= array_size(in));
728     assert(t.out_size <= array_size(out));
729     assert(t.expected_in_next <= t.in_size);
730     assert(t.expected_out_next <= t.out_size);
731     ExternT old_char  = in[t.replace_pos];
732     in[t.replace_pos] = t.replace_char;
733 
734     mbstate_t state          = {};
735     const ExternT* in_next   = nullptr;
736     InternT* out_next        = nullptr;
737     codecvt_base::result res = codecvt_base::ok;
738 
739     res = cvt.in(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
740     assert(res == cvt.error);
741     assert(in_next == in + t.expected_in_next);
742     assert(out_next == out + t.expected_out_next);
743     assert(char_traits<InternT>::compare(out, exp, t.expected_out_next) == 0);
744     if (t.expected_out_next < array_size(out))
745       assert(out[t.expected_out_next] == 0);
746 
747     state   = mbstate_t();
748     int len = cvt.length(state, in, in + t.in_size, t.out_size);
749     assert(len >= 0);
750     assert(static_cast<size_t>(len) == t.expected_in_next);
751 
752     in[t.replace_pos] = old_char;
753   }
754 }
755 
// Drives the three utf8_to_utf16_in_* checks against the same facet, in the
// order: ok, partial, error.
//
// cvt: the codecvt facet under test; it is only forwarded to the helpers.
template <class InternT, class ExternT>
void utf8_to_utf16_in(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
  utf8_to_utf16_in_ok(cvt);      // well-formed input, output range large enough
  utf8_to_utf16_in_partial(cvt); // conversions expected to stop early
  utf8_to_utf16_in_error(cvt);   // malformed input
}
762 
763 template <class InternT, class ExternT>
utf16_to_utf8_out_ok(const std::codecvt<InternT,ExternT,mbstate_t> & cvt)764 void utf16_to_utf8_out_ok(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
765   // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP
766   const char16_t input[]         = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
767   const unsigned char expected[] = "b\u0448\uAAAA\U0010AAAA";
768   static_assert(array_size(input) == 6, "");
769   static_assert(array_size(expected) == 11, "");
770 
771   InternT in[array_size(input)];
772   ExternT exp[array_size(expected)];
773   copy(begin(input), end(input), begin(in));
774   copy(begin(expected), end(expected), begin(exp));
775   assert(char_traits<InternT>::length(in) == 5);
776   assert(char_traits<ExternT>::length(exp) == 10);
777 
778   test_offsets_ok offsets[] = {{0, 0}, {1, 1}, {2, 3}, {3, 6}, {5, 10}};
779   for (test_offsets_ok* it = begin(offsets); it != end(offsets); ++it) {
780     test_offsets_ok t                = *it;
781     ExternT out[array_size(exp) - 1] = {};
782     assert(t.in_size <= array_size(in));
783     assert(t.out_size <= array_size(out));
784     mbstate_t state          = {};
785     const InternT* in_next   = nullptr;
786     ExternT* out_next        = nullptr;
787     codecvt_base::result res = codecvt_base::ok;
788 
789     res = cvt.out(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
790     assert(res == cvt.ok);
791     assert(in_next == in + t.in_size);
792     assert(out_next == out + t.out_size);
793     assert(char_traits<ExternT>::compare(out, exp, t.out_size) == 0);
794     if (t.out_size < array_size(out))
795       assert(out[t.out_size] == 0);
796   }
797 }
798 
799 template <class InternT, class ExternT>
utf16_to_utf8_out_partial(const std::codecvt<InternT,ExternT,mbstate_t> & cvt)800 void utf16_to_utf8_out_partial(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
801   // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP
802   const char16_t input[]         = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
803   const unsigned char expected[] = "b\u0448\uAAAA\U0010AAAA";
804   static_assert(array_size(input) == 6, "");
805   static_assert(array_size(expected) == 11, "");
806 
807   InternT in[array_size(input)];
808   ExternT exp[array_size(expected)];
809   copy(begin(input), end(input), begin(in));
810   copy(begin(expected), end(expected), begin(exp));
811   assert(char_traits<InternT>::length(in) == 5);
812   assert(char_traits<ExternT>::length(exp) == 10);
813 
814   test_offsets_partial offsets[] = {
815       {1, 0, 0, 0}, // no space for first CP
816 
817       {2, 1, 1, 1}, // no space for second CP
818       {2, 2, 1, 1}, // no space for second CP
819 
820       {3, 3, 2, 3}, // no space for third CP
821       {3, 4, 2, 3}, // no space for third CP
822       {3, 5, 2, 3}, // no space for third CP
823 
824       {5, 6, 3, 6}, // no space for fourth CP
825       {5, 7, 3, 6}, // no space for fourth CP
826       {5, 8, 3, 6}, // no space for fourth CP
827       {5, 9, 3, 6}, // no space for fourth CP
828 
829       {4, 10, 3, 6}, // incomplete fourth CP
830 
831       {4, 6, 3, 6}, // incomplete fourth CP, and no space for it
832       {4, 7, 3, 6}, // incomplete fourth CP, and no space for it
833       {4, 8, 3, 6}, // incomplete fourth CP, and no space for it
834       {4, 9, 3, 6}, // incomplete fourth CP, and no space for it
835   };
836   for (test_offsets_partial* it = begin(offsets); it != end(offsets); ++it) {
837     test_offsets_partial t           = *it;
838     ExternT out[array_size(exp) - 1] = {};
839     assert(t.in_size <= array_size(in));
840     assert(t.out_size <= array_size(out));
841     assert(t.expected_in_next <= t.in_size);
842     assert(t.expected_out_next <= t.out_size);
843     mbstate_t state          = {};
844     const InternT* in_next   = nullptr;
845     ExternT* out_next        = nullptr;
846     codecvt_base::result res = codecvt_base::ok;
847 
848     res = cvt.out(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
849     assert(res == cvt.partial);
850     assert(in_next == in + t.expected_in_next);
851     assert(out_next == out + t.expected_out_next);
852     assert(char_traits<ExternT>::compare(out, exp, t.expected_out_next) == 0);
853     if (t.expected_out_next < array_size(out))
854       assert(out[t.expected_out_next] == 0);
855   }
856 }
857 
858 template <class InternT, class ExternT>
utf16_to_utf8_out_error(const std::codecvt<InternT,ExternT,mbstate_t> & cvt)859 void utf16_to_utf8_out_error(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
860   // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP
861   const char16_t input[]         = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
862   const unsigned char expected[] = "b\u0448\uAAAA\U0010AAAA";
863   static_assert(array_size(input) == 6, "");
864   static_assert(array_size(expected) == 11, "");
865 
866   InternT in[array_size(input)];
867   ExternT exp[array_size(expected)];
868   copy(begin(input), end(input), begin(in));
869   copy(begin(expected), end(expected), begin(exp));
870   assert(char_traits<InternT>::length(in) == 5);
871   assert(char_traits<ExternT>::length(exp) == 10);
872 
873   // The only possible error in UTF-16 is unpaired surrogate code units.
874   // So we replace valid code points (scalar values) with lone surrogate CU.
875   test_offsets_error<InternT> offsets[] = {
876       {5, 10, 0, 0, 0xD800, 0},
877       {5, 10, 0, 0, 0xDBFF, 0},
878       {5, 10, 0, 0, 0xDC00, 0},
879       {5, 10, 0, 0, 0xDFFF, 0},
880 
881       {5, 10, 1, 1, 0xD800, 1},
882       {5, 10, 1, 1, 0xDBFF, 1},
883       {5, 10, 1, 1, 0xDC00, 1},
884       {5, 10, 1, 1, 0xDFFF, 1},
885 
886       {5, 10, 2, 3, 0xD800, 2},
887       {5, 10, 2, 3, 0xDBFF, 2},
888       {5, 10, 2, 3, 0xDC00, 2},
889       {5, 10, 2, 3, 0xDFFF, 2},
890 
891       // make the leading surrogate a trailing one
892       {5, 10, 3, 6, 0xDC00, 3},
893       {5, 10, 3, 6, 0xDFFF, 3},
894 
895       // make the trailing surrogate a leading one
896       {5, 10, 3, 6, 0xD800, 4},
897       {5, 10, 3, 6, 0xDBFF, 4},
898 
899       // make the trailing surrogate a BMP char
900       {5, 10, 3, 6, 'z', 4},
901   };
902 
903   for (test_offsets_error<InternT>* it = begin(offsets); it != end(offsets); ++it) {
904     test_offsets_error<InternT> t    = *it;
905     ExternT out[array_size(exp) - 1] = {};
906     assert(t.in_size <= array_size(in));
907     assert(t.out_size <= array_size(out));
908     assert(t.expected_in_next <= t.in_size);
909     assert(t.expected_out_next <= t.out_size);
910     InternT old_char  = in[t.replace_pos];
911     in[t.replace_pos] = t.replace_char;
912 
913     mbstate_t state          = {};
914     const InternT* in_next   = nullptr;
915     ExternT* out_next        = nullptr;
916     codecvt_base::result res = codecvt_base::ok;
917 
918     res = cvt.out(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
919     assert(res == cvt.error);
920     assert(in_next == in + t.expected_in_next);
921     assert(out_next == out + t.expected_out_next);
922     assert(char_traits<ExternT>::compare(out, exp, t.expected_out_next) == 0);
923     if (t.expected_out_next < array_size(out))
924       assert(out[t.expected_out_next] == 0);
925 
926     in[t.replace_pos] = old_char;
927   }
928 }
929 
// Drives the three utf16_to_utf8_out_* checks of cvt.out() against the same
// facet, in the order: ok, partial, error.
//
// cvt: the codecvt facet under test; it is only forwarded to the helpers.
template <class InternT, class ExternT>
void utf16_to_utf8_out(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
  utf16_to_utf8_out_ok(cvt);      // well-formed input, exact-size output
  utf16_to_utf8_out_partial(cvt); // output too small / input cut mid-pair
  utf16_to_utf8_out_error(cvt);   // unpaired surrogates
}
936 
// Runs the UTF-8 <-> UTF-16 checks for one facet: first the utf8_to_utf16_in
// group, then the utf16_to_utf8_out group.
//
// cvt: the codecvt facet under test.
template <class InternT, class ExternT>
void test_utf8_utf16_cvt(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
  utf8_to_utf16_in(cvt);
  utf16_to_utf8_out(cvt);
}
942 
943 template <class InternT, class ExternT>
utf8_to_ucs2_in_ok(const std::codecvt<InternT,ExternT,mbstate_t> & cvt)944 void utf8_to_ucs2_in_ok(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
945   // UTF-8 string of 1-byte CP, 2-byte CP and 3-byte CP
946   const unsigned char input[] = "b\u0448\uAAAA";
947   const char16_t expected[]   = {'b', 0x0448, 0xAAAA, 0};
948   static_assert(array_size(input) == 7, "");
949   static_assert(array_size(expected) == 4, "");
950 
951   ExternT in[array_size(input)];
952   InternT exp[array_size(expected)];
953   copy(begin(input), end(input), begin(in));
954   copy(begin(expected), end(expected), begin(exp));
955   assert(char_traits<ExternT>::length(in) == 6);
956   assert(char_traits<InternT>::length(exp) == 3);
957 
958   test_offsets_ok offsets[] = {{0, 0}, {1, 1}, {3, 2}, {6, 3}};
959   for (test_offsets_ok* it = begin(offsets); it != end(offsets); ++it) {
960     test_offsets_ok t                = *it;
961     InternT out[array_size(exp) - 1] = {};
962     assert(t.in_size <= array_size(in));
963     assert(t.out_size <= array_size(out));
964     mbstate_t state          = {};
965     const ExternT* in_next   = nullptr;
966     InternT* out_next        = nullptr;
967     codecvt_base::result res = codecvt_base::ok;
968 
969     res = cvt.in(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
970     assert(res == cvt.ok);
971     assert(in_next == in + t.in_size);
972     assert(out_next == out + t.out_size);
973     assert(char_traits<InternT>::compare(out, exp, t.out_size) == 0);
974     if (t.out_size < array_size(out))
975       assert(out[t.out_size] == 0);
976 
977     state   = mbstate_t();
978     int len = cvt.length(state, in, in + t.in_size, t.out_size);
979     assert(len >= 0);
980     assert(static_cast<size_t>(len) == t.in_size);
981   }
982 
983   for (test_offsets_ok* it = begin(offsets); it != end(offsets); ++it) {
984     test_offsets_ok t            = *it;
985     InternT out[array_size(exp)] = {};
986     assert(t.in_size <= array_size(in));
987     assert(t.out_size <= array_size(out));
988     mbstate_t state          = {};
989     const ExternT* in_next   = nullptr;
990     InternT* out_next        = nullptr;
991     codecvt_base::result res = codecvt_base::ok;
992 
993     res = cvt.in(state, in, in + t.in_size, in_next, out, end(out), out_next);
994     assert(res == cvt.ok);
995     assert(in_next == in + t.in_size);
996     assert(out_next == out + t.out_size);
997     assert(char_traits<InternT>::compare(out, exp, t.out_size) == 0);
998     if (t.out_size < array_size(out))
999       assert(out[t.out_size] == 0);
1000 
1001     state   = mbstate_t();
1002     int len = cvt.length(state, in, in + t.in_size, array_size(out));
1003     assert(len >= 0);
1004     assert(static_cast<size_t>(len) == t.in_size);
1005   }
1006 }
1007 
1008 template <class InternT, class ExternT>
utf8_to_ucs2_in_partial(const std::codecvt<InternT,ExternT,mbstate_t> & cvt)1009 void utf8_to_ucs2_in_partial(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
1010   // UTF-8 string of 1-byte CP, 2-byte CP and 3-byte CP
1011   const unsigned char input[] = "b\u0448\uAAAA";
1012   const char16_t expected[]   = {'b', 0x0448, 0xAAAA, 0};
1013   static_assert(array_size(input) == 7, "");
1014   static_assert(array_size(expected) == 4, "");
1015 
1016   ExternT in[array_size(input)];
1017   InternT exp[array_size(expected)];
1018   copy(begin(input), end(input), begin(in));
1019   copy(begin(expected), end(expected), begin(exp));
1020   assert(char_traits<ExternT>::length(in) == 6);
1021   assert(char_traits<InternT>::length(exp) == 3);
1022 
1023   test_offsets_partial offsets[] = {
1024       {1, 0, 0, 0}, // no space for first CP
1025 
1026       {3, 1, 1, 1}, // no space for second CP
1027       {2, 2, 1, 1}, // incomplete second CP
1028       {2, 1, 1, 1}, // incomplete second CP, and no space for it
1029 
1030       {6, 2, 3, 2}, // no space for third CP
1031       {4, 3, 3, 2}, // incomplete third CP
1032       {5, 3, 3, 2}, // incomplete third CP
1033       {4, 2, 3, 2}, // incomplete third CP, and no space for it
1034       {5, 2, 3, 2}, // incomplete third CP, and no space for it
1035   };
1036 
1037   for (test_offsets_partial* it = begin(offsets); it != end(offsets); ++it) {
1038     test_offsets_partial t           = *it;
1039     InternT out[array_size(exp) - 1] = {};
1040     assert(t.in_size <= array_size(in));
1041     assert(t.out_size <= array_size(out));
1042     assert(t.expected_in_next <= t.in_size);
1043     assert(t.expected_out_next <= t.out_size);
1044     mbstate_t state          = {};
1045     const ExternT* in_next   = nullptr;
1046     InternT* out_next        = nullptr;
1047     codecvt_base::result res = codecvt_base::ok;
1048 
1049     res = cvt.in(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
1050     assert(res == cvt.partial);
1051     assert(in_next == in + t.expected_in_next);
1052     assert(out_next == out + t.expected_out_next);
1053     assert(char_traits<InternT>::compare(out, exp, t.expected_out_next) == 0);
1054     if (t.expected_out_next < array_size(out))
1055       assert(out[t.expected_out_next] == 0);
1056 
1057     state   = mbstate_t();
1058     int len = cvt.length(state, in, in + t.in_size, t.out_size);
1059     assert(len >= 0);
1060     assert(static_cast<size_t>(len) == t.expected_in_next);
1061   }
1062 }
1063 
1064 template <class InternT, class ExternT>
utf8_to_ucs2_in_error(const std::codecvt<InternT,ExternT,mbstate_t> & cvt)1065 void utf8_to_ucs2_in_error(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
1066   const unsigned char input[] = "b\u0448\uD700\U0010AAAA";
1067   const char16_t expected[]   = {'b', 0x0448, 0xD700, 0xDBEA, 0xDEAA, 0};
1068   static_assert(array_size(input) == 11, "");
1069   static_assert(array_size(expected) == 6, "");
1070 
1071   ExternT in[array_size(input)];
1072   InternT exp[array_size(expected)];
1073   copy(begin(input), end(input), begin(in));
1074   copy(begin(expected), end(expected), begin(exp));
1075   assert(char_traits<ExternT>::length(in) == 10);
1076   assert(char_traits<InternT>::length(exp) == 5);
1077 
1078   // There are 5 classes of errors in UTF-8 decoding
1079   // 1. Missing leading byte
1080   // 2. Missing trailing byte
1081   // 3. Surrogate CP
1082   // 4. Overlong sequence
1083   // 5. CP out of Unicode range
1084   test_offsets_error<unsigned char> offsets[] = {
1085 
1086       // 1. Missing leading byte. We will replace the leading byte with
1087       // non-leading byte, such as a byte that is always invalid or a trailing
1088       // byte.
1089 
1090       // replace leading byte with invalid byte
1091       {1, 5, 0, 0, 0xFF, 0},
1092       {3, 5, 1, 1, 0xFF, 1},
1093       {6, 5, 3, 2, 0xFF, 3},
1094       {10, 5, 6, 3, 0xFF, 6},
1095 
1096       // replace leading byte with trailing byte
1097       {1, 5, 0, 0, 0b10101010, 0},
1098       {3, 5, 1, 1, 0b10101010, 1},
1099       {6, 5, 3, 2, 0b10101010, 3},
1100       {10, 5, 6, 3, 0b10101010, 6},
1101 
1102       // 2. Missing trailing byte. We will replace the trailing byte with
1103       // non-trailing byte, such as a byte that is always invalid or a leading
1104       // byte (simple ASCII byte in our case).
1105 
1106       // replace first trailing byte with ASCII byte
1107       {3, 5, 1, 1, 'z', 2},
1108       {6, 5, 3, 2, 'z', 4},
1109       {10, 5, 6, 3, 'z', 7},
1110 
1111       // replace first trailing byte with invalid byte
1112       {3, 5, 1, 1, 0xFF, 2},
1113       {6, 5, 3, 2, 0xFF, 4},
1114       {10, 5, 6, 3, 0xFF, 7},
1115 
1116       // replace second trailing byte with ASCII byte
1117       {6, 5, 3, 2, 'z', 5},
1118       {10, 5, 6, 3, 'z', 8},
1119 
1120       // replace second trailing byte with invalid byte
1121       {6, 5, 3, 2, 0xFF, 5},
1122       {10, 5, 6, 3, 0xFF, 8},
1123 
1124       // replace third trailing byte
1125       {10, 5, 6, 3, 'z', 9},
1126       {10, 5, 6, 3, 0xFF, 9},
1127 
1128       // 2.1 The following test-cases raise doubt whether error or partial should
1129       // be returned. For example, we have 4-byte sequence with valid leading
1130       // byte. If we hide the last byte we need to return partial. But, if the
1131       // second or third byte, which are visible to the call to codecvt, are
1132       // malformed then error should be returned.
1133 
1134       // replace first trailing byte with ASCII byte, also incomplete at end
1135       {5, 5, 3, 2, 'z', 4},
1136       {8, 5, 6, 3, 'z', 7},
1137       {9, 5, 6, 3, 'z', 7},
1138 
1139       // replace first trailing byte with invalid byte, also incomplete at end
1140       {5, 5, 3, 2, 0xFF, 4},
1141       {8, 5, 6, 3, 0xFF, 7},
1142       {9, 5, 6, 3, 0xFF, 7},
1143 
1144       // replace second trailing byte with ASCII byte, also incomplete at end
1145       {9, 5, 6, 3, 'z', 8},
1146 
1147       // replace second trailing byte with invalid byte, also incomplete at end
1148       {9, 5, 6, 3, 0xFF, 8},
1149 
1150       // 3. Surrogate CP. We modify the second byte (first trailing) of the 3-byte
1151       // CP U+D700
1152       {6, 5, 3, 2, 0b10100000, 4}, // turn U+D700 into U+D800
1153       {6, 5, 3, 2, 0b10101100, 4}, // turn U+D700 into U+DB00
1154       {6, 5, 3, 2, 0b10110000, 4}, // turn U+D700 into U+DC00
1155       {6, 5, 3, 2, 0b10111100, 4}, // turn U+D700 into U+DF00
1156 
1157       // 4. Overlong sequence. The CPs in the input are chosen such as modifying
1158       // just the leading byte is enough to make them overlong, i.e. for the
1159       // 3-byte and 4-byte CP the second byte (first trailing) has enough leading
1160       // zeroes.
1161       {3, 5, 1, 1, 0b11000000, 1},  // make the 2-byte CP overlong
1162       {3, 5, 1, 1, 0b11000001, 1},  // make the 2-byte CP overlong
1163       {6, 5, 3, 2, 0b11100000, 3},  // make the 3-byte CP overlong
1164       {10, 5, 6, 3, 0b11110000, 6}, // make the 4-byte CP overlong
1165 
1166       // 5. CP above range
1167       // turn U+10AAAA into U+14AAAA by changing its leading byte
1168       {10, 5, 6, 3, 0b11110101, 6},
1169       // turn U+10AAAA into U+11AAAA by changing its 2nd byte
1170       {10, 5, 6, 3, 0b10011010, 7},
1171       // Don't replace anything, show full 4-byte CP U+10AAAA
1172       {10, 4, 6, 3, 'b', 0},
1173       {10, 5, 6, 3, 'b', 0},
1174       // Don't replace anything, show incomplete 4-byte CP at the end. It's still
1175       // out of UCS2 range just by seeing the first byte.
1176       {7, 4, 6, 3, 'b', 0}, // incomplete fourth CP
1177       {8, 4, 6, 3, 'b', 0}, // incomplete fourth CP
1178       {9, 4, 6, 3, 'b', 0}, // incomplete fourth CP
1179       {7, 5, 6, 3, 'b', 0}, // incomplete fourth CP
1180       {8, 5, 6, 3, 'b', 0}, // incomplete fourth CP
1181       {9, 5, 6, 3, 'b', 0}, // incomplete fourth CP
1182   };
1183   for (test_offsets_error<unsigned char>* it = begin(offsets); it != end(offsets); ++it) {
1184     test_offsets_error<unsigned char> t = *it;
1185     InternT out[array_size(exp) - 1]    = {};
1186     assert(t.in_size <= array_size(in));
1187     assert(t.out_size <= array_size(out));
1188     assert(t.expected_in_next <= t.in_size);
1189     assert(t.expected_out_next <= t.out_size);
1190     ExternT old_char  = in[t.replace_pos];
1191     in[t.replace_pos] = t.replace_char;
1192 
1193     mbstate_t state          = {};
1194     const ExternT* in_next   = nullptr;
1195     InternT* out_next        = nullptr;
1196     codecvt_base::result res = codecvt_base::ok;
1197 
1198     res = cvt.in(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
1199     assert(res == cvt.error);
1200     assert(in_next == in + t.expected_in_next);
1201     assert(out_next == out + t.expected_out_next);
1202     assert(char_traits<InternT>::compare(out, exp, t.expected_out_next) == 0);
1203     if (t.expected_out_next < array_size(out))
1204       assert(out[t.expected_out_next] == 0);
1205 
1206     state   = mbstate_t();
1207     int len = cvt.length(state, in, in + t.in_size, t.out_size);
1208     assert(len >= 0);
1209     assert(static_cast<size_t>(len) == t.expected_in_next);
1210 
1211     in[t.replace_pos] = old_char;
1212   }
1213 }
1214 
// Drives the three utf8_to_ucs2_in_* checks of cvt.in() against the same
// facet, in the order: ok, partial, error.
//
// cvt: the codecvt facet under test; it is only forwarded to the helpers.
template <class InternT, class ExternT>
void utf8_to_ucs2_in(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
  utf8_to_ucs2_in_ok(cvt);      // well-formed input
  utf8_to_ucs2_in_partial(cvt); // output too small / input cut mid-sequence
  utf8_to_ucs2_in_error(cvt);   // malformed or out-of-range input
}
1221 
1222 template <class InternT, class ExternT>
ucs2_to_utf8_out_ok(const std::codecvt<InternT,ExternT,mbstate_t> & cvt)1223 void ucs2_to_utf8_out_ok(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
1224   // UTF-8 string of 1-byte CP, 2-byte CP and 3-byte CP
1225   const char16_t input[]         = {'b', 0x0448, 0xAAAA, 0};
1226   const unsigned char expected[] = "b\u0448\uAAAA";
1227   static_assert(array_size(input) == 4, "");
1228   static_assert(array_size(expected) == 7, "");
1229 
1230   InternT in[array_size(input)];
1231   ExternT exp[array_size(expected)];
1232   copy(begin(input), end(input), begin(in));
1233   copy(begin(expected), end(expected), begin(exp));
1234   assert(char_traits<InternT>::length(in) == 3);
1235   assert(char_traits<ExternT>::length(exp) == 6);
1236 
1237   test_offsets_ok offsets[] = {{0, 0}, {1, 1}, {2, 3}, {3, 6}};
1238   for (test_offsets_ok* it = begin(offsets); it != end(offsets); ++it) {
1239     test_offsets_ok t                = *it;
1240     ExternT out[array_size(exp) - 1] = {};
1241     assert(t.in_size <= array_size(in));
1242     assert(t.out_size <= array_size(out));
1243     mbstate_t state          = {};
1244     const InternT* in_next   = nullptr;
1245     ExternT* out_next        = nullptr;
1246     codecvt_base::result res = codecvt_base::ok;
1247 
1248     res = cvt.out(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
1249     assert(res == cvt.ok);
1250     assert(in_next == in + t.in_size);
1251     assert(out_next == out + t.out_size);
1252     assert(char_traits<ExternT>::compare(out, exp, t.out_size) == 0);
1253     if (t.out_size < array_size(out))
1254       assert(out[t.out_size] == 0);
1255   }
1256 }
1257 
1258 template <class InternT, class ExternT>
ucs2_to_utf8_out_partial(const std::codecvt<InternT,ExternT,mbstate_t> & cvt)1259 void ucs2_to_utf8_out_partial(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
1260   // UTF-8 string of 1-byte CP, 2-byte CP and 3-byte CP
1261   const char16_t input[]         = {'b', 0x0448, 0xAAAA, 0};
1262   const unsigned char expected[] = "b\u0448\uAAAA";
1263   static_assert(array_size(input) == 4, "");
1264   static_assert(array_size(expected) == 7, "");
1265 
1266   InternT in[array_size(input)];
1267   ExternT exp[array_size(expected)];
1268   copy(begin(input), end(input), begin(in));
1269   copy(begin(expected), end(expected), begin(exp));
1270   assert(char_traits<InternT>::length(in) == 3);
1271   assert(char_traits<ExternT>::length(exp) == 6);
1272 
1273   test_offsets_partial offsets[] = {
1274       {1, 0, 0, 0}, // no space for first CP
1275 
1276       {2, 1, 1, 1}, // no space for second CP
1277       {2, 2, 1, 1}, // no space for second CP
1278 
1279       {3, 3, 2, 3}, // no space for third CP
1280       {3, 4, 2, 3}, // no space for third CP
1281       {3, 5, 2, 3}, // no space for third CP
1282   };
1283   for (test_offsets_partial* it = begin(offsets); it != end(offsets); ++it) {
1284     test_offsets_partial t           = *it;
1285     ExternT out[array_size(exp) - 1] = {};
1286     assert(t.in_size <= array_size(in));
1287     assert(t.out_size <= array_size(out));
1288     assert(t.expected_in_next <= t.in_size);
1289     assert(t.expected_out_next <= t.out_size);
1290     mbstate_t state          = {};
1291     const InternT* in_next   = nullptr;
1292     ExternT* out_next        = nullptr;
1293     codecvt_base::result res = codecvt_base::ok;
1294 
1295     res = cvt.out(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
1296     assert(res == cvt.partial);
1297     assert(in_next == in + t.expected_in_next);
1298     assert(out_next == out + t.expected_out_next);
1299     assert(char_traits<ExternT>::compare(out, exp, t.expected_out_next) == 0);
1300     if (t.expected_out_next < array_size(out))
1301       assert(out[t.expected_out_next] == 0);
1302   }
1303 }
1304 
1305 template <class InternT, class ExternT>
ucs2_to_utf8_out_error(const std::codecvt<InternT,ExternT,mbstate_t> & cvt)1306 void ucs2_to_utf8_out_error(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
1307   const char16_t input[]         = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
1308   const unsigned char expected[] = "b\u0448\uAAAA\U0010AAAA";
1309   static_assert(array_size(input) == 6, "");
1310   static_assert(array_size(expected) == 11, "");
1311 
1312   InternT in[array_size(input)];
1313   ExternT exp[array_size(expected)];
1314   copy(begin(input), end(input), begin(in));
1315   copy(begin(expected), end(expected), begin(exp));
1316   assert(char_traits<InternT>::length(in) == 5);
1317   assert(char_traits<ExternT>::length(exp) == 10);
1318 
1319   test_offsets_error<InternT> offsets[] = {
1320       {3, 6, 0, 0, 0xD800, 0},
1321       {3, 6, 0, 0, 0xDBFF, 0},
1322       {3, 6, 0, 0, 0xDC00, 0},
1323       {3, 6, 0, 0, 0xDFFF, 0},
1324 
1325       {3, 6, 1, 1, 0xD800, 1},
1326       {3, 6, 1, 1, 0xDBFF, 1},
1327       {3, 6, 1, 1, 0xDC00, 1},
1328       {3, 6, 1, 1, 0xDFFF, 1},
1329 
1330       {3, 6, 2, 3, 0xD800, 2},
1331       {3, 6, 2, 3, 0xDBFF, 2},
1332       {3, 6, 2, 3, 0xDC00, 2},
1333       {3, 6, 2, 3, 0xDFFF, 2},
1334 
1335       // make the leading surrogate a trailing one
1336       {5, 10, 3, 6, 0xDC00, 3},
1337       {5, 10, 3, 6, 0xDFFF, 3},
1338 
1339       // make the trailing surrogate a leading one
1340       {5, 10, 3, 6, 0xD800, 4},
1341       {5, 10, 3, 6, 0xDBFF, 4},
1342 
1343       // make the trailing surrogate a BMP char
1344       {5, 10, 3, 6, 'z', 4},
1345 
1346       // don't replace anything in the test cases bellow, just show the surrogate
1347       // pair (fourth CP) fully or partially
1348       {5, 10, 3, 6, 'b', 0},
1349       {5, 7, 3, 6, 'b', 0}, // no space for fourth CP
1350       {5, 8, 3, 6, 'b', 0}, // no space for fourth CP
1351       {5, 9, 3, 6, 'b', 0}, // no space for fourth CP
1352 
1353       {4, 10, 3, 6, 'b', 0}, // incomplete fourth CP
1354       {4, 7, 3, 6, 'b', 0},  // incomplete fourth CP, and no space for it
1355       {4, 8, 3, 6, 'b', 0},  // incomplete fourth CP, and no space for it
1356       {4, 9, 3, 6, 'b', 0},  // incomplete fourth CP, and no space for it
1357   };
1358 
1359   for (test_offsets_error<InternT>* it = begin(offsets); it != end(offsets); ++it) {
1360     test_offsets_error<InternT> t    = *it;
1361     ExternT out[array_size(exp) - 1] = {};
1362     assert(t.in_size <= array_size(in));
1363     assert(t.out_size <= array_size(out));
1364     assert(t.expected_in_next <= t.in_size);
1365     assert(t.expected_out_next <= t.out_size);
1366     InternT old_char  = in[t.replace_pos];
1367     in[t.replace_pos] = t.replace_char;
1368 
1369     mbstate_t state          = {};
1370     const InternT* in_next   = nullptr;
1371     ExternT* out_next        = nullptr;
1372     codecvt_base::result res = codecvt_base::ok;
1373 
1374     res = cvt.out(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
1375     assert(res == cvt.error);
1376     assert(in_next == in + t.expected_in_next);
1377     assert(out_next == out + t.expected_out_next);
1378     assert(char_traits<ExternT>::compare(out, exp, t.expected_out_next) == 0);
1379     if (t.expected_out_next < array_size(out))
1380       assert(out[t.expected_out_next] == 0);
1381 
1382     in[t.replace_pos] = old_char;
1383   }
1384 }
1385 
// Drives the three ucs2_to_utf8_out_* checks of cvt.out() against the same
// facet, in the order: ok, partial, error.
//
// cvt: the codecvt facet under test; it is only forwarded to the helpers.
template <class InternT, class ExternT>
void ucs2_to_utf8_out(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
  ucs2_to_utf8_out_ok(cvt);      // well-formed input, exact-size output
  ucs2_to_utf8_out_partial(cvt); // output too small
  ucs2_to_utf8_out_error(cvt);   // surrogates in the input
}
1392 
// Runs the UTF-8 <-> UCS-2 checks for one facet: first the utf8_to_ucs2_in
// group, then the ucs2_to_utf8_out group.
//
// cvt: the codecvt facet under test.
template <class InternT, class ExternT>
void test_utf8_ucs2_cvt(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
  utf8_to_ucs2_in(cvt);
  ucs2_to_utf8_out(cvt);
}
1398 
// Byte order used by utf16_to_bytes when splitting a code unit into two bytes.
enum utf16_endianess {
  utf16_big_endian,   // high byte first
  utf16_little_endian // low byte first
};

// Writes every element of [f, l) to `o` as two bytes: bits 8..15 and bits
// 0..7 of the element, ordered according to `e`.
//
// f, l: input range of code units.
// o:    output iterator that receives two values per input element.
// e:    utf16_big_endian emits the high byte first; any other value emits the
//       low byte first.
// Returns the output iterator advanced past the last byte written.
template <class Iter1, class Iter2>
Iter2 utf16_to_bytes(Iter1 f, Iter1 l, Iter2 o, utf16_endianess e) {
  const bool high_byte_first = (e == utf16_big_endian);
  while (f != l) {
    if (high_byte_first) {
      *o = (*f >> 8) & 0xFF;
      ++o;
      *o = *f & 0xFF;
      ++o;
    } else {
      *o = *f & 0xFF;
      ++o;
      *o = (*f >> 8) & 0xFF;
      ++o;
    }
    ++f;
  }
  return o;
}
1415 
1416 template <class InternT>
utf16_to_utf32_in_ok(const std::codecvt<InternT,char,mbstate_t> & cvt,utf16_endianess endianess)1417 void utf16_to_utf32_in_ok(const std::codecvt<InternT, char, mbstate_t>& cvt, utf16_endianess endianess) {
1418   const char16_t input[]    = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
1419   const char32_t expected[] = {'b', 0x0448, 0xAAAA, 0x10AAAA, 0};
1420   static_assert(array_size(input) == 6, "");
1421   static_assert(array_size(expected) == 5, "");
1422 
1423   char in[array_size(input) * 2];
1424   InternT exp[array_size(expected)];
1425   utf16_to_bytes(begin(input), end(input), begin(in), endianess);
1426   copy(begin(expected), end(expected), begin(exp));
1427 
1428   test_offsets_ok offsets[] = {{0, 0}, {2, 1}, {4, 2}, {6, 3}, {10, 4}};
1429   for (test_offsets_ok* it = begin(offsets); it != end(offsets); ++it) {
1430     test_offsets_ok t                = *it;
1431     InternT out[array_size(exp) - 1] = {};
1432     assert(t.in_size <= array_size(in));
1433     assert(t.out_size <= array_size(out));
1434     mbstate_t state          = {};
1435     const char* in_next      = nullptr;
1436     InternT* out_next        = nullptr;
1437     codecvt_base::result res = codecvt_base::ok;
1438 
1439     res = cvt.in(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
1440     assert(res == cvt.ok);
1441     assert(in_next == in + t.in_size);
1442     assert(out_next == out + t.out_size);
1443     assert(char_traits<InternT>::compare(out, exp, t.out_size) == 0);
1444     if (t.out_size < array_size(out))
1445       assert(out[t.out_size] == 0);
1446 
1447     state   = mbstate_t();
1448     int len = cvt.length(state, in, in + t.in_size, t.out_size);
1449     assert(len >= 0);
1450     assert(static_cast<size_t>(len) == t.in_size);
1451   }
1452 
1453   for (test_offsets_ok* it = begin(offsets); it != end(offsets); ++it) {
1454     test_offsets_ok t            = *it;
1455     InternT out[array_size(exp)] = {};
1456     assert(t.in_size <= array_size(in));
1457     assert(t.out_size <= array_size(out));
1458     mbstate_t state          = {};
1459     const char* in_next      = nullptr;
1460     InternT* out_next        = nullptr;
1461     codecvt_base::result res = codecvt_base::ok;
1462 
1463     res = cvt.in(state, in, in + t.in_size, in_next, out, end(out), out_next);
1464     assert(res == cvt.ok);
1465     assert(in_next == in + t.in_size);
1466     assert(out_next == out + t.out_size);
1467     assert(char_traits<InternT>::compare(out, exp, t.out_size) == 0);
1468     if (t.out_size < array_size(out))
1469       assert(out[t.out_size] == 0);
1470 
1471     state   = mbstate_t();
1472     int len = cvt.length(state, in, in + t.in_size, array_size(out));
1473     assert(len >= 0);
1474     assert(static_cast<size_t>(len) == t.in_size);
1475   }
1476 }
1477 
1478 template <class InternT>
utf16_to_utf32_in_partial(const std::codecvt<InternT,char,mbstate_t> & cvt,utf16_endianess endianess)1479 void utf16_to_utf32_in_partial(const std::codecvt<InternT, char, mbstate_t>& cvt, utf16_endianess endianess) {
1480   const char16_t input[]    = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
1481   const char32_t expected[] = {'b', 0x0448, 0xAAAA, 0x10AAAA, 0};
1482   static_assert(array_size(input) == 6, "");
1483   static_assert(array_size(expected) == 5, "");
1484 
1485   char in[array_size(input) * 2];
1486   InternT exp[array_size(expected)];
1487   utf16_to_bytes(begin(input), end(input), begin(in), endianess);
1488   copy(begin(expected), end(expected), begin(exp));
1489 
1490   test_offsets_partial offsets[] = {
1491       {2, 0, 0, 0}, // no space for first CP
1492       {1, 1, 0, 0}, // incomplete first CP
1493       {1, 0, 0, 0}, // incomplete first CP, and no space for it
1494 
1495       {4, 1, 2, 1}, // no space for second CP
1496       {3, 2, 2, 1}, // incomplete second CP
1497       {3, 1, 2, 1}, // incomplete second CP, and no space for it
1498 
1499       {6, 2, 4, 2}, // no space for third CP
1500       {5, 3, 4, 2}, // incomplete third CP
1501       {5, 2, 4, 2}, // incomplete third CP, and no space for it
1502 
1503       {10, 3, 6, 3}, // no space for fourth CP
1504       {7, 4, 6, 3},  // incomplete fourth CP
1505       {8, 4, 6, 3},  // incomplete fourth CP
1506       {9, 4, 6, 3},  // incomplete fourth CP
1507       {7, 3, 6, 3},  // incomplete fourth CP, and no space for it
1508       {8, 3, 6, 3},  // incomplete fourth CP, and no space for it
1509       {9, 3, 6, 3},  // incomplete fourth CP, and no space for it
1510   };
1511 
1512   for (test_offsets_partial* it = begin(offsets); it != end(offsets); ++it) {
1513     test_offsets_partial t           = *it;
1514     InternT out[array_size(exp) - 1] = {};
1515     assert(t.in_size <= array_size(in));
1516     assert(t.out_size <= array_size(out));
1517     assert(t.expected_in_next <= t.in_size);
1518     assert(t.expected_out_next <= t.out_size);
1519     mbstate_t state          = {};
1520     const char* in_next      = nullptr;
1521     InternT* out_next        = nullptr;
1522     codecvt_base::result res = codecvt_base::ok;
1523 
1524     res = cvt.in(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
1525     assert(res == cvt.partial);
1526     assert(in_next == in + t.expected_in_next);
1527     assert(out_next == out + t.expected_out_next);
1528     assert(char_traits<InternT>::compare(out, exp, t.expected_out_next) == 0);
1529     if (t.expected_out_next < array_size(out))
1530       assert(out[t.expected_out_next] == 0);
1531 
1532     state   = mbstate_t();
1533     int len = cvt.length(state, in, in + t.in_size, t.out_size);
1534     assert(len >= 0);
1535     assert(static_cast<size_t>(len) == t.expected_in_next);
1536   }
1537 }
1538 
1539 template <class InternT>
utf16_to_utf32_in_error(const std::codecvt<InternT,char,mbstate_t> & cvt,utf16_endianess endianess)1540 void utf16_to_utf32_in_error(const std::codecvt<InternT, char, mbstate_t>& cvt, utf16_endianess endianess) {
1541   char16_t input[]          = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
1542   const char32_t expected[] = {'b', 0x0448, 0xAAAA, 0x10AAAA, 0};
1543   static_assert(array_size(input) == 6, "");
1544   static_assert(array_size(expected) == 5, "");
1545 
1546   InternT exp[array_size(expected)];
1547   copy(begin(expected), end(expected), begin(exp));
1548 
1549   // The only possible error in UTF-16 is unpaired surrogate code units.
1550   // So we replace valid code points (scalar values) with lone surrogate CU.
1551   test_offsets_error<char16_t> offsets[] = {
1552       {10, 4, 0, 0, 0xD800, 0},
1553       {10, 4, 0, 0, 0xDBFF, 0},
1554       {10, 4, 0, 0, 0xDC00, 0},
1555       {10, 4, 0, 0, 0xDFFF, 0},
1556 
1557       {10, 4, 2, 1, 0xD800, 1},
1558       {10, 4, 2, 1, 0xDBFF, 1},
1559       {10, 4, 2, 1, 0xDC00, 1},
1560       {10, 4, 2, 1, 0xDFFF, 1},
1561 
1562       {10, 4, 4, 2, 0xD800, 2},
1563       {10, 4, 4, 2, 0xDBFF, 2},
1564       {10, 4, 4, 2, 0xDC00, 2},
1565       {10, 4, 4, 2, 0xDFFF, 2},
1566 
1567       // make the leading surrogate a trailing one
1568       {10, 4, 6, 3, 0xDC00, 3},
1569       {10, 4, 6, 3, 0xDFFF, 3},
1570 
1571       // make the trailing surrogate a leading one
1572       {10, 4, 6, 3, 0xD800, 4},
1573       {10, 4, 6, 3, 0xDBFF, 4},
1574 
1575       // make the trailing surrogate a BMP char
1576       {10, 4, 6, 3, 'z', 4},
1577   };
1578 
1579   for (test_offsets_error<char16_t>* it = begin(offsets); it != end(offsets); ++it) {
1580     test_offsets_error<char16_t> t = *it;
1581     char in[array_size(input) * 2];
1582     InternT out[array_size(exp) - 1] = {};
1583     assert(t.in_size <= array_size(in));
1584     assert(t.out_size <= array_size(out));
1585     assert(t.expected_in_next <= t.in_size);
1586     assert(t.expected_out_next <= t.out_size);
1587     char16_t old_char    = input[t.replace_pos];
1588     input[t.replace_pos] = t.replace_char; // replace in input, not in in
1589     utf16_to_bytes(begin(input), end(input), begin(in), endianess);
1590 
1591     mbstate_t state          = {};
1592     const char* in_next      = nullptr;
1593     InternT* out_next        = nullptr;
1594     codecvt_base::result res = codecvt_base::ok;
1595 
1596     res = cvt.in(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
1597     assert(res == cvt.error);
1598     assert(in_next == in + t.expected_in_next);
1599     assert(out_next == out + t.expected_out_next);
1600     assert(char_traits<InternT>::compare(out, exp, t.expected_out_next) == 0);
1601     if (t.expected_out_next < array_size(out))
1602       assert(out[t.expected_out_next] == 0);
1603 
1604     state   = mbstate_t();
1605     int len = cvt.length(state, in, in + t.in_size, t.out_size);
1606     assert(len >= 0);
1607     assert(static_cast<size_t>(len) == t.expected_in_next);
1608 
1609     input[t.replace_pos] = old_char;
1610   }
1611 }
1612 
1613 template <class InternT>
utf32_to_utf16_out_ok(const std::codecvt<InternT,char,mbstate_t> & cvt,utf16_endianess endianess)1614 void utf32_to_utf16_out_ok(const std::codecvt<InternT, char, mbstate_t>& cvt, utf16_endianess endianess) {
1615   const char32_t input[]    = {'b', 0x0448, 0xAAAA, 0x10AAAA, 0};
1616   const char16_t expected[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
1617   static_assert(array_size(input) == 5, "");
1618   static_assert(array_size(expected) == 6, "");
1619 
1620   InternT in[array_size(input)];
1621   char exp[array_size(expected) * 2];
1622   copy(begin(input), end(input), begin(in));
1623   utf16_to_bytes(begin(expected), end(expected), begin(exp), endianess);
1624 
1625   test_offsets_ok offsets[] = {{0, 0}, {1, 2}, {2, 4}, {3, 6}, {4, 10}};
1626   for (test_offsets_ok* it = begin(offsets); it != end(offsets); ++it) {
1627     test_offsets_ok t             = *it;
1628     char out[array_size(exp) - 2] = {};
1629     assert(t.in_size <= array_size(in));
1630     assert(t.out_size <= array_size(out));
1631     mbstate_t state          = {};
1632     const InternT* in_next   = nullptr;
1633     char* out_next           = nullptr;
1634     codecvt_base::result res = codecvt_base::ok;
1635 
1636     res = cvt.out(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
1637     assert(res == cvt.ok);
1638     assert(in_next == in + t.in_size);
1639     assert(out_next == out + t.out_size);
1640     assert(char_traits<char>::compare(out, exp, t.out_size) == 0);
1641     if (t.out_size < array_size(out))
1642       assert(out[t.out_size] == 0);
1643   }
1644 }
1645 
1646 template <class InternT>
utf32_to_utf16_out_partial(const std::codecvt<InternT,char,mbstate_t> & cvt,utf16_endianess endianess)1647 void utf32_to_utf16_out_partial(const std::codecvt<InternT, char, mbstate_t>& cvt, utf16_endianess endianess) {
1648   const char32_t input[]    = {'b', 0x0448, 0xAAAA, 0x10AAAA, 0};
1649   const char16_t expected[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
1650   static_assert(array_size(input) == 5, "");
1651   static_assert(array_size(expected) == 6, "");
1652 
1653   InternT in[array_size(input)];
1654   char exp[array_size(expected) * 2];
1655   copy(begin(input), end(input), begin(in));
1656   utf16_to_bytes(begin(expected), end(expected), begin(exp), endianess);
1657 
1658   test_offsets_partial offsets[] = {
1659       {1, 0, 0, 0}, // no space for first CP
1660       {1, 1, 0, 0}, // no space for first CP
1661 
1662       {2, 2, 1, 2}, // no space for second CP
1663       {2, 3, 1, 2}, // no space for second CP
1664 
1665       {3, 4, 2, 4}, // no space for third CP
1666       {3, 5, 2, 4}, // no space for third CP
1667 
1668       {4, 6, 3, 6}, // no space for fourth CP
1669       {4, 7, 3, 6}, // no space for fourth CP
1670       {4, 8, 3, 6}, // no space for fourth CP
1671       {4, 9, 3, 6}, // no space for fourth CP
1672   };
1673   for (test_offsets_partial* it = begin(offsets); it != end(offsets); ++it) {
1674     test_offsets_partial t        = *it;
1675     char out[array_size(exp) - 2] = {};
1676     assert(t.in_size <= array_size(in));
1677     assert(t.out_size <= array_size(out));
1678     assert(t.expected_in_next <= t.in_size);
1679     assert(t.expected_out_next <= t.out_size);
1680     mbstate_t state          = {};
1681     const InternT* in_next   = nullptr;
1682     char* out_next           = nullptr;
1683     codecvt_base::result res = codecvt_base::ok;
1684 
1685     res = cvt.out(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
1686     assert(res == cvt.partial);
1687     assert(in_next == in + t.expected_in_next);
1688     assert(out_next == out + t.expected_out_next);
1689     assert(char_traits<char>::compare(out, exp, t.expected_out_next) == 0);
1690     if (t.expected_out_next < array_size(out))
1691       assert(out[t.expected_out_next] == 0);
1692   }
1693 }
1694 
1695 template <class InternT>
utf32_to_utf16_out_error(const std::codecvt<InternT,char,mbstate_t> & cvt,utf16_endianess endianess)1696 void utf32_to_utf16_out_error(const std::codecvt<InternT, char, mbstate_t>& cvt, utf16_endianess endianess) {
1697   const char32_t input[]    = {'b', 0x0448, 0xAAAA, 0x10AAAA, 0};
1698   const char16_t expected[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
1699   static_assert(array_size(input) == 5, "");
1700   static_assert(array_size(expected) == 6, "");
1701 
1702   InternT in[array_size(input)];
1703   char exp[array_size(expected) * 2];
1704   copy(begin(input), end(input), begin(in));
1705   utf16_to_bytes(begin(expected), end(expected), begin(exp), endianess);
1706 
1707   test_offsets_error<InternT> offsets[] = {
1708 
1709       // Surrogate CP
1710       {4, 10, 0, 0, 0xD800, 0},
1711       {4, 10, 1, 2, 0xDBFF, 1},
1712       {4, 10, 2, 4, 0xDC00, 2},
1713       {4, 10, 3, 6, 0xDFFF, 3},
1714 
1715       // CP out of range
1716       {4, 10, 0, 0, 0x00110000, 0},
1717       {4, 10, 1, 2, 0x00110000, 1},
1718       {4, 10, 2, 4, 0x00110000, 2},
1719       {4, 10, 3, 6, 0x00110000, 3}};
1720 
1721   for (test_offsets_error<InternT>* it = begin(offsets); it != end(offsets); ++it) {
1722     test_offsets_error<InternT> t = *it;
1723     char out[array_size(exp) - 2] = {};
1724     assert(t.in_size <= array_size(in));
1725     assert(t.out_size <= array_size(out));
1726     assert(t.expected_in_next <= t.in_size);
1727     assert(t.expected_out_next <= t.out_size);
1728     InternT old_char  = in[t.replace_pos];
1729     in[t.replace_pos] = t.replace_char;
1730 
1731     mbstate_t state          = {};
1732     const InternT* in_next   = nullptr;
1733     char* out_next           = nullptr;
1734     codecvt_base::result res = codecvt_base::ok;
1735 
1736     res = cvt.out(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
1737     assert(res == cvt.error);
1738     assert(in_next == in + t.expected_in_next);
1739     assert(out_next == out + t.expected_out_next);
1740     assert(char_traits<char>::compare(out, exp, t.expected_out_next) == 0);
1741     if (t.expected_out_next < array_size(out))
1742       assert(out[t.expected_out_next] == 0);
1743 
1744     in[t.replace_pos] = old_char;
1745   }
1746 }
1747 
1748 template <class InternT>
test_utf16_utf32_cvt(const std::codecvt<InternT,char,mbstate_t> & cvt,utf16_endianess endianess)1749 void test_utf16_utf32_cvt(const std::codecvt<InternT, char, mbstate_t>& cvt, utf16_endianess endianess) {
1750   utf16_to_utf32_in_ok(cvt, endianess);
1751   utf16_to_utf32_in_partial(cvt, endianess);
1752   utf16_to_utf32_in_error(cvt, endianess);
1753   utf32_to_utf16_out_ok(cvt, endianess);
1754   utf32_to_utf16_out_partial(cvt, endianess);
1755   utf32_to_utf16_out_error(cvt, endianess);
1756 }
1757 
1758 template <class InternT>
utf16_to_ucs2_in_ok(const std::codecvt<InternT,char,mbstate_t> & cvt,utf16_endianess endianess)1759 void utf16_to_ucs2_in_ok(const std::codecvt<InternT, char, mbstate_t>& cvt, utf16_endianess endianess) {
1760   const char16_t input[]    = {'b', 0x0448, 0xAAAA, 0};
1761   const char16_t expected[] = {'b', 0x0448, 0xAAAA, 0};
1762   static_assert(array_size(input) == 4, "");
1763   static_assert(array_size(expected) == 4, "");
1764 
1765   char in[array_size(input) * 2];
1766   InternT exp[array_size(expected)];
1767   utf16_to_bytes(begin(input), end(input), begin(in), endianess);
1768   copy(begin(expected), end(expected), begin(exp));
1769 
1770   test_offsets_ok offsets[] = {{0, 0}, {2, 1}, {4, 2}, {6, 3}};
1771   for (test_offsets_ok* it = begin(offsets); it != end(offsets); ++it) {
1772     test_offsets_ok t                = *it;
1773     InternT out[array_size(exp) - 1] = {};
1774     assert(t.in_size <= array_size(in));
1775     assert(t.out_size <= array_size(out));
1776     mbstate_t state          = {};
1777     const char* in_next      = nullptr;
1778     InternT* out_next        = nullptr;
1779     codecvt_base::result res = codecvt_base::ok;
1780 
1781     res = cvt.in(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
1782     assert(res == cvt.ok);
1783     assert(in_next == in + t.in_size);
1784     assert(out_next == out + t.out_size);
1785     assert(char_traits<InternT>::compare(out, exp, t.out_size) == 0);
1786     if (t.out_size < array_size(out))
1787       assert(out[t.out_size] == 0);
1788 
1789     state   = mbstate_t();
1790     int len = cvt.length(state, in, in + t.in_size, t.out_size);
1791     assert(len >= 0);
1792     assert(static_cast<size_t>(len) == t.in_size);
1793   }
1794 
1795   for (test_offsets_ok* it = begin(offsets); it != end(offsets); ++it) {
1796     test_offsets_ok t            = *it;
1797     InternT out[array_size(exp)] = {};
1798     assert(t.in_size <= array_size(in));
1799     assert(t.out_size <= array_size(out));
1800     mbstate_t state          = {};
1801     const char* in_next      = nullptr;
1802     InternT* out_next        = nullptr;
1803     codecvt_base::result res = codecvt_base::ok;
1804 
1805     res = cvt.in(state, in, in + t.in_size, in_next, out, end(out), out_next);
1806     assert(res == cvt.ok);
1807     assert(in_next == in + t.in_size);
1808     assert(out_next == out + t.out_size);
1809     assert(char_traits<InternT>::compare(out, exp, t.out_size) == 0);
1810     if (t.out_size < array_size(out))
1811       assert(out[t.out_size] == 0);
1812 
1813     state   = mbstate_t();
1814     int len = cvt.length(state, in, in + t.in_size, array_size(out));
1815     assert(len >= 0);
1816     assert(static_cast<size_t>(len) == t.in_size);
1817   }
1818 }
1819 
1820 template <class InternT>
utf16_to_ucs2_in_partial(const std::codecvt<InternT,char,mbstate_t> & cvt,utf16_endianess endianess)1821 void utf16_to_ucs2_in_partial(const std::codecvt<InternT, char, mbstate_t>& cvt, utf16_endianess endianess) {
1822   const char16_t input[]    = {'b', 0x0448, 0xAAAA, 0};
1823   const char16_t expected[] = {'b', 0x0448, 0xAAAA, 0};
1824   static_assert(array_size(input) == 4, "");
1825   static_assert(array_size(expected) == 4, "");
1826 
1827   char in[array_size(input) * 2];
1828   InternT exp[array_size(expected)];
1829   utf16_to_bytes(begin(input), end(input), begin(in), endianess);
1830   copy(begin(expected), end(expected), begin(exp));
1831 
1832   test_offsets_partial offsets[] = {
1833       {2, 0, 0, 0}, // no space for first CP
1834       {1, 1, 0, 0}, // incomplete first CP
1835       {1, 0, 0, 0}, // incomplete first CP, and no space for it
1836 
1837       {4, 1, 2, 1}, // no space for second CP
1838       {3, 2, 2, 1}, // incomplete second CP
1839       {3, 1, 2, 1}, // incomplete second CP, and no space for it
1840 
1841       {6, 2, 4, 2}, // no space for third CP
1842       {5, 3, 4, 2}, // incomplete third CP
1843       {5, 2, 4, 2}, // incomplete third CP, and no space for it
1844   };
1845 
1846   for (test_offsets_partial* it = begin(offsets); it != end(offsets); ++it) {
1847     test_offsets_partial t           = *it;
1848     InternT out[array_size(exp) - 1] = {};
1849     assert(t.in_size <= array_size(in));
1850     assert(t.out_size <= array_size(out));
1851     assert(t.expected_in_next <= t.in_size);
1852     assert(t.expected_out_next <= t.out_size);
1853     mbstate_t state          = {};
1854     const char* in_next      = nullptr;
1855     InternT* out_next        = nullptr;
1856     codecvt_base::result res = codecvt_base::ok;
1857 
1858     res = cvt.in(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
1859     assert(res == cvt.partial);
1860     assert(in_next == in + t.expected_in_next);
1861     assert(out_next == out + t.expected_out_next);
1862     assert(char_traits<InternT>::compare(out, exp, t.expected_out_next) == 0);
1863     if (t.expected_out_next < array_size(out))
1864       assert(out[t.expected_out_next] == 0);
1865 
1866     state   = mbstate_t();
1867     int len = cvt.length(state, in, in + t.in_size, t.out_size);
1868     assert(len >= 0);
1869     assert(static_cast<size_t>(len) == t.expected_in_next);
1870   }
1871 }
1872 
1873 template <class InternT>
utf16_to_ucs2_in_error(const std::codecvt<InternT,char,mbstate_t> & cvt,utf16_endianess endianess)1874 void utf16_to_ucs2_in_error(const std::codecvt<InternT, char, mbstate_t>& cvt, utf16_endianess endianess) {
1875   char16_t input[]          = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
1876   const char16_t expected[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
1877   static_assert(array_size(input) == 6, "");
1878   static_assert(array_size(expected) == 6, "");
1879 
1880   InternT exp[array_size(expected)];
1881   copy(begin(expected), end(expected), begin(exp));
1882 
1883   // The only possible error in UTF-16 is unpaired surrogate code units.
1884   // Additionally, because the target encoding is UCS-2, a proper pair of
1885   // surrogates is also error. Simply, any surrogate CU is error.
1886   test_offsets_error<char16_t> offsets[] = {
1887       {6, 3, 0, 0, 0xD800, 0},
1888       {6, 3, 0, 0, 0xDBFF, 0},
1889       {6, 3, 0, 0, 0xDC00, 0},
1890       {6, 3, 0, 0, 0xDFFF, 0},
1891 
1892       {6, 3, 2, 1, 0xD800, 1},
1893       {6, 3, 2, 1, 0xDBFF, 1},
1894       {6, 3, 2, 1, 0xDC00, 1},
1895       {6, 3, 2, 1, 0xDFFF, 1},
1896 
1897       {6, 3, 4, 2, 0xD800, 2},
1898       {6, 3, 4, 2, 0xDBFF, 2},
1899       {6, 3, 4, 2, 0xDC00, 2},
1900       {6, 3, 4, 2, 0xDFFF, 2},
1901 
1902       // make the leading surrogate a trailing one
1903       {10, 5, 6, 3, 0xDC00, 3},
1904       {10, 5, 6, 3, 0xDFFF, 3},
1905 
1906       // make the trailing surrogate a leading one
1907       {10, 5, 6, 3, 0xD800, 4},
1908       {10, 5, 6, 3, 0xDBFF, 4},
1909 
1910       // make the trailing surrogate a BMP char
1911       {10, 5, 6, 3, 'z', 4},
1912 
1913       // don't replace anything in the test cases bellow, just show the surrogate
1914       // pair (fourth CP) fully or partially (just the first surrogate)
1915       {10, 5, 6, 3, 'b', 0},
1916       {8, 5, 6, 3, 'b', 0},
1917       {9, 5, 6, 3, 'b', 0},
1918 
1919       {10, 4, 6, 3, 'b', 0},
1920       {8, 4, 6, 3, 'b', 0},
1921       {9, 4, 6, 3, 'b', 0},
1922   };
1923 
1924   for (test_offsets_error<char16_t>* it = begin(offsets); it != end(offsets); ++it) {
1925     test_offsets_error<char16_t> t = *it;
1926     char in[array_size(input) * 2];
1927     InternT out[array_size(exp) - 1] = {};
1928     assert(t.in_size <= array_size(in));
1929     assert(t.out_size <= array_size(out));
1930     assert(t.expected_in_next <= t.in_size);
1931     assert(t.expected_out_next <= t.out_size);
1932     char16_t old_char    = input[t.replace_pos];
1933     input[t.replace_pos] = t.replace_char; // replace in input, not in in
1934     utf16_to_bytes(begin(input), end(input), begin(in), endianess);
1935 
1936     mbstate_t state          = {};
1937     const char* in_next      = nullptr;
1938     InternT* out_next        = nullptr;
1939     codecvt_base::result res = codecvt_base::ok;
1940 
1941     res = cvt.in(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
1942     assert(res == cvt.error);
1943     assert(in_next == in + t.expected_in_next);
1944     assert(out_next == out + t.expected_out_next);
1945     assert(char_traits<InternT>::compare(out, exp, t.expected_out_next) == 0);
1946     if (t.expected_out_next < array_size(out))
1947       assert(out[t.expected_out_next] == 0);
1948 
1949     state   = mbstate_t();
1950     int len = cvt.length(state, in, in + t.in_size, t.out_size);
1951     assert(len >= 0);
1952     assert(static_cast<size_t>(len) == t.expected_in_next);
1953 
1954     input[t.replace_pos] = old_char;
1955   }
1956 }
1957 
1958 template <class InternT>
ucs2_to_utf16_out_ok(const std::codecvt<InternT,char,mbstate_t> & cvt,utf16_endianess endianess)1959 void ucs2_to_utf16_out_ok(const std::codecvt<InternT, char, mbstate_t>& cvt, utf16_endianess endianess) {
1960   const char16_t input[]    = {'b', 0x0448, 0xAAAA, 0};
1961   const char16_t expected[] = {'b', 0x0448, 0xAAAA, 0};
1962   static_assert(array_size(input) == 4, "");
1963   static_assert(array_size(expected) == 4, "");
1964 
1965   InternT in[array_size(input)];
1966   char exp[array_size(expected) * 2];
1967   copy(begin(input), end(input), begin(in));
1968   utf16_to_bytes(begin(expected), end(expected), begin(exp), endianess);
1969 
1970   test_offsets_ok offsets[] = {{0, 0}, {1, 2}, {2, 4}, {3, 6}};
1971   for (test_offsets_ok* it = begin(offsets); it != end(offsets); ++it) {
1972     test_offsets_ok t             = *it;
1973     char out[array_size(exp) - 2] = {};
1974     assert(t.in_size <= array_size(in));
1975     assert(t.out_size <= array_size(out));
1976     mbstate_t state          = {};
1977     const InternT* in_next   = nullptr;
1978     char* out_next           = nullptr;
1979     codecvt_base::result res = codecvt_base::ok;
1980 
1981     res = cvt.out(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
1982     assert(res == cvt.ok);
1983     assert(in_next == in + t.in_size);
1984     assert(out_next == out + t.out_size);
1985     assert(char_traits<char>::compare(out, exp, t.out_size) == 0);
1986     if (t.out_size < array_size(out))
1987       assert(out[t.out_size] == 0);
1988   }
1989 }
1990 
1991 template <class InternT>
ucs2_to_utf16_out_partial(const std::codecvt<InternT,char,mbstate_t> & cvt,utf16_endianess endianess)1992 void ucs2_to_utf16_out_partial(const std::codecvt<InternT, char, mbstate_t>& cvt, utf16_endianess endianess) {
1993   const char16_t input[]    = {'b', 0x0448, 0xAAAA, 0};
1994   const char16_t expected[] = {'b', 0x0448, 0xAAAA, 0};
1995   static_assert(array_size(input) == 4, "");
1996   static_assert(array_size(expected) == 4, "");
1997 
1998   InternT in[array_size(input)];
1999   char exp[array_size(expected) * 2];
2000   copy(begin(input), end(input), begin(in));
2001   utf16_to_bytes(begin(expected), end(expected), begin(exp), endianess);
2002 
2003   test_offsets_partial offsets[] = {
2004       {1, 0, 0, 0}, // no space for first CP
2005       {1, 1, 0, 0}, // no space for first CP
2006 
2007       {2, 2, 1, 2}, // no space for second CP
2008       {2, 3, 1, 2}, // no space for second CP
2009 
2010       {3, 4, 2, 4}, // no space for third CP
2011       {3, 5, 2, 4}, // no space for third CP
2012   };
2013   for (test_offsets_partial* it = begin(offsets); it != end(offsets); ++it) {
2014     test_offsets_partial t        = *it;
2015     char out[array_size(exp) - 2] = {};
2016     assert(t.in_size <= array_size(in));
2017     assert(t.out_size <= array_size(out));
2018     assert(t.expected_in_next <= t.in_size);
2019     assert(t.expected_out_next <= t.out_size);
2020     mbstate_t state          = {};
2021     const InternT* in_next   = nullptr;
2022     char* out_next           = nullptr;
2023     codecvt_base::result res = codecvt_base::ok;
2024 
2025     res = cvt.out(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
2026     assert(res == cvt.partial);
2027     assert(in_next == in + t.expected_in_next);
2028     assert(out_next == out + t.expected_out_next);
2029     assert(char_traits<char>::compare(out, exp, t.expected_out_next) == 0);
2030     if (t.expected_out_next < array_size(out))
2031       assert(out[t.expected_out_next] == 0);
2032   }
2033 }
2034 
2035 template <class InternT>
ucs2_to_utf16_out_error(const std::codecvt<InternT,char,mbstate_t> & cvt,utf16_endianess endianess)2036 void ucs2_to_utf16_out_error(const std::codecvt<InternT, char, mbstate_t>& cvt, utf16_endianess endianess) {
2037   const char16_t input[]    = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
2038   const char16_t expected[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
2039   static_assert(array_size(input) == 6, "");
2040   static_assert(array_size(expected) == 6, "");
2041 
2042   InternT in[array_size(input)];
2043   char exp[array_size(expected) * 2];
2044   copy(begin(input), end(input), begin(in));
2045   utf16_to_bytes(begin(expected), end(expected), begin(exp), endianess);
2046 
2047   test_offsets_error<InternT> offsets[] = {
2048       {3, 6, 0, 0, 0xD800, 0},
2049       {3, 6, 0, 0, 0xDBFF, 0},
2050       {3, 6, 0, 0, 0xDC00, 0},
2051       {3, 6, 0, 0, 0xDFFF, 0},
2052 
2053       {3, 6, 1, 2, 0xD800, 1},
2054       {3, 6, 1, 2, 0xDBFF, 1},
2055       {3, 6, 1, 2, 0xDC00, 1},
2056       {3, 6, 1, 2, 0xDFFF, 1},
2057 
2058       {3, 6, 2, 4, 0xD800, 2},
2059       {3, 6, 2, 4, 0xDBFF, 2},
2060       {3, 6, 2, 4, 0xDC00, 2},
2061       {3, 6, 2, 4, 0xDFFF, 2},
2062 
2063       // make the leading surrogate a trailing one
2064       {5, 10, 3, 6, 0xDC00, 3},
2065       {5, 10, 3, 6, 0xDFFF, 3},
2066 
2067       // make the trailing surrogate a leading one
2068       {5, 10, 3, 6, 0xD800, 4},
2069       {5, 10, 3, 6, 0xDBFF, 4},
2070 
2071       // make the trailing surrogate a BMP char
2072       {5, 10, 3, 6, 'z', 4},
2073 
2074       // don't replace anything in the test cases bellow, just show the surrogate
2075       // pair (fourth CP) fully or partially (just the first surrogate)
2076       {5, 10, 3, 6, 'b', 0},
2077       {5, 8, 3, 6, 'b', 0},
2078       {5, 9, 3, 6, 'b', 0},
2079 
2080       {4, 10, 3, 6, 'b', 0},
2081       {4, 8, 3, 6, 'b', 0},
2082       {4, 9, 3, 6, 'b', 0},
2083   };
2084 
2085   for (test_offsets_error<InternT>* it = begin(offsets); it != end(offsets); ++it) {
2086     test_offsets_error<InternT> t = *it;
2087     char out[array_size(exp) - 2] = {};
2088     assert(t.in_size <= array_size(in));
2089     assert(t.out_size <= array_size(out));
2090     assert(t.expected_in_next <= t.in_size);
2091     assert(t.expected_out_next <= t.out_size);
2092     InternT old_char  = in[t.replace_pos];
2093     in[t.replace_pos] = t.replace_char;
2094 
2095     mbstate_t state          = {};
2096     const InternT* in_next   = nullptr;
2097     char* out_next           = nullptr;
2098     codecvt_base::result res = codecvt_base::ok;
2099 
2100     res = cvt.out(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
2101     assert(res == cvt.error);
2102     assert(in_next == in + t.expected_in_next);
2103     assert(out_next == out + t.expected_out_next);
2104     assert(char_traits<char>::compare(out, exp, t.expected_out_next) == 0);
2105     if (t.expected_out_next < array_size(out))
2106       assert(out[t.expected_out_next] == 0);
2107 
2108     in[t.replace_pos] = old_char;
2109   }
2110 }
2111 
2112 template <class InternT>
test_utf16_ucs2_cvt(const std::codecvt<InternT,char,mbstate_t> & cvt,utf16_endianess endianess)2113 void test_utf16_ucs2_cvt(const std::codecvt<InternT, char, mbstate_t>& cvt, utf16_endianess endianess) {
2114   utf16_to_ucs2_in_ok(cvt, endianess);
2115   utf16_to_ucs2_in_partial(cvt, endianess);
2116   utf16_to_ucs2_in_error(cvt, endianess);
2117   ucs2_to_utf16_out_ok(cvt, endianess);
2118   ucs2_to_utf16_out_partial(cvt, endianess);
2119   ucs2_to_utf16_out_error(cvt, endianess);
2120 }
2121 
2122 using std::codecvt;
2123 using std::codecvt_utf16;
2124 using std::codecvt_utf8;
2125 using std::codecvt_utf8_utf16;
2126 using std::has_facet;
2127 using std::locale;
2128 using std::use_facet;
2129 
test_utf8_utf32_codecvts()2130 void test_utf8_utf32_codecvts() {
2131   typedef codecvt<char32_t, char, mbstate_t> codecvt_c32;
2132   const locale& loc_c = locale::classic();
2133   assert(has_facet<codecvt_c32>(loc_c));
2134 
2135   const codecvt_c32& cvt = use_facet<codecvt_c32>(loc_c);
2136   test_utf8_utf32_cvt(cvt);
2137 
2138   codecvt_utf8<char32_t> cvt2;
2139   test_utf8_utf32_cvt(cvt2);
2140 
2141 #if !defined(TEST_HAS_NO_WIDE_CHARACTERS) && !defined(TEST_SHORT_WCHAR)
2142   codecvt_utf8<wchar_t> cvt3;
2143   test_utf8_utf32_cvt(cvt3);
2144 #endif
2145 
2146 #ifndef TEST_HAS_NO_CHAR8_T
2147   typedef codecvt<char32_t, char8_t, mbstate_t> codecvt_c32_c8;
2148   assert(has_facet<codecvt_c32_c8>(loc_c));
2149   const codecvt_c32_c8& cvt4 = use_facet<codecvt_c32_c8>(loc_c);
2150   test_utf8_utf32_cvt(cvt4);
2151 #endif
2152 }
2153 
test_utf8_utf16_codecvts()2154 void test_utf8_utf16_codecvts() {
2155   typedef codecvt<char16_t, char, mbstate_t> codecvt_c16;
2156   const locale& loc_c = locale::classic();
2157   assert(has_facet<codecvt_c16>(loc_c));
2158 
2159   const codecvt_c16& cvt = use_facet<codecvt_c16>(loc_c);
2160   test_utf8_utf16_cvt(cvt);
2161 
2162   codecvt_utf8_utf16<char16_t> cvt2;
2163   test_utf8_utf16_cvt(cvt2);
2164 
2165   codecvt_utf8_utf16<char32_t> cvt3;
2166   test_utf8_utf16_cvt(cvt3);
2167 
2168 #ifndef TEST_HAS_NO_WIDE_CHARACTERS
2169   codecvt_utf8_utf16<wchar_t> cvt4;
2170   test_utf8_utf16_cvt(cvt4);
2171 #endif
2172 
2173 #ifndef TEST_HAS_NO_CHAR8_T
2174   typedef codecvt<char16_t, char8_t, mbstate_t> codecvt_c16_c8;
2175   assert(has_facet<codecvt_c16_c8>(loc_c));
2176   const codecvt_c16_c8& cvt5 = use_facet<codecvt_c16_c8>(loc_c);
2177   test_utf8_utf16_cvt(cvt5);
2178 #endif
2179 }
2180 
test_utf8_ucs2_codecvts()2181 void test_utf8_ucs2_codecvts() {
2182   codecvt_utf8<char16_t> cvt;
2183   test_utf8_ucs2_cvt(cvt);
2184 
2185 #if !defined(TEST_HAS_NO_WIDE_CHARACTERS) && defined(TEST_SHORT_WCHAR)
2186   codecvt_utf8<wchar_t> cvt2;
2187   test_utf8_ucs2_cvt(cvt2);
2188 #endif
2189 }
2190 
test_utf16_utf32_codecvts()2191 void test_utf16_utf32_codecvts() {
2192   codecvt_utf16<char32_t> cvt;
2193   test_utf16_utf32_cvt(cvt, utf16_big_endian);
2194 
2195   codecvt_utf16<char32_t, 0x10FFFF, std::little_endian> cvt2;
2196   test_utf16_utf32_cvt(cvt2, utf16_little_endian);
2197 
2198 #if !defined(TEST_HAS_NO_WIDE_CHARACTERS) && !defined(TEST_SHORT_WCHAR)
2199   codecvt_utf16<wchar_t> cvt3;
2200   test_utf16_utf32_cvt(cvt3, utf16_big_endian);
2201 
2202   codecvt_utf16<wchar_t, 0x10FFFF, std::little_endian> cvt4;
2203   test_utf16_utf32_cvt(cvt4, utf16_little_endian);
2204 #endif
2205 }
2206 
test_utf16_ucs2_codecvts()2207 void test_utf16_ucs2_codecvts() {
2208   codecvt_utf16<char16_t> cvt;
2209   test_utf16_ucs2_cvt(cvt, utf16_big_endian);
2210 
2211   codecvt_utf16<char16_t, 0x10FFFF, std::little_endian> cvt2;
2212   test_utf16_ucs2_cvt(cvt2, utf16_little_endian);
2213 
2214 #if !defined(TEST_HAS_NO_WIDE_CHARACTERS) && defined(TEST_SHORT_WCHAR)
2215   codecvt_utf16<wchar_t> cvt3;
2216   test_utf16_ucs2_cvt(cvt3, utf16_big_endian);
2217 
2218   codecvt_utf16<wchar_t, 0x10FFFF, std::little_endian> cvt4;
2219   test_utf16_ucs2_cvt(cvt4, utf16_little_endian);
2220 #endif
2221 }
2222 
main()2223 int main() {
2224   test_utf8_utf32_codecvts();
2225   test_utf8_utf16_codecvts();
2226   test_utf8_ucs2_codecvts();
2227   test_utf16_utf32_codecvts();
2228   test_utf16_ucs2_codecvts();
2229 }
2230