1 //
2 // Copyright (c) 2015 Artyom Beilis (Tonkikh)
3 //
4 // Distributed under the Boost Software License, Version 1.0. (See
5 // accompanying file LICENSE or copy at
6 // http://www.boost.org/LICENSE_1_0.txt)
7 //
8
9 #include <boost/nowide/utf8_codecvt.hpp>
10
11 #include <boost/nowide/convert.hpp>
12 #include <cstring>
13 #include <iomanip>
14 #include <iostream>
15 #include <locale>
16 #include <vector>
17
18 #include "test.hpp"
19 #include "test_sets.hpp"
20
21 static const char* utf8_name =
22 "\xf0\x9d\x92\x9e-\xD0\xBF\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82-\xE3\x82\x84\xE3\x81\x82.txt";
23 static const std::wstring wide_name_str = boost::nowide::widen(utf8_name);
24 static const wchar_t* wide_name = wide_name_str.c_str();
25
26 typedef std::codecvt<wchar_t, char, std::mbstate_t> cvt_type;
27
test_codecvt_in_n_m(const cvt_type & cvt,size_t n,size_t m)28 void test_codecvt_in_n_m(const cvt_type& cvt, size_t n, size_t m)
29 {
30 const wchar_t* wptr = wide_name;
31 size_t wlen = std::wcslen(wide_name);
32 size_t u8len = std::strlen(utf8_name);
33 const char* from = utf8_name;
34 const char* end = from;
35 const char* real_end = utf8_name + u8len;
36 const char* from_next = from;
37 std::mbstate_t mb = std::mbstate_t();
38 while(from_next < real_end)
39 {
40 if(from == end)
41 {
42 end = from + n;
43 if(end > real_end)
44 end = real_end;
45 }
46
47 wchar_t buf[128];
48 wchar_t* to = buf;
49 wchar_t* to_end = to + m;
50 wchar_t* to_next = to;
51
52 std::mbstate_t mb2 = mb;
53 std::codecvt_base::result r = cvt.in(mb, from, end, from_next, to, to_end, to_next);
54
55 int count = cvt.length(mb2, from, end, to_end - to);
56 #ifndef BOOST_NOWIDE_DO_LENGTH_MBSTATE_CONST
57 TEST(std::memcmp(&mb, &mb2, sizeof(mb)) == 0);
58 if(count != from_next - from)
59 {
60 std::cout << count << " " << from_next - from << std::endl;
61 }
62 TEST(count == from_next - from);
63 #else
64 TEST(count == to_next - to);
65 #endif
66
67 if(r == cvt_type::partial)
68 {
69 end += n;
70 if(end > real_end)
71 end = real_end;
72 } else
73 TEST(r == cvt_type::ok);
74 while(to != to_next)
75 {
76 TEST(*wptr == *to);
77 wptr++;
78 to++;
79 }
80 to = to_next;
81 from = from_next;
82 }
83 TEST(wptr == wide_name + wlen);
84 TEST(from == real_end);
85 }
86
test_codecvt_out_n_m(const cvt_type & cvt,size_t n,size_t m)87 void test_codecvt_out_n_m(const cvt_type& cvt, size_t n, size_t m)
88 {
89 const char* nptr = utf8_name;
90 size_t wlen = std::wcslen(wide_name);
91 size_t u8len = std::strlen(utf8_name);
92
93 std::mbstate_t mb = std::mbstate_t();
94
95 const wchar_t* from_next = wide_name;
96 const wchar_t* real_from_end = wide_name + wlen;
97
98 char buf[256];
99 char* to = buf;
100 char* to_next = to;
101 char* to_end = to + n;
102 char* real_to_end = buf + sizeof(buf);
103
104 while(from_next < real_from_end)
105 {
106 const wchar_t* from = from_next;
107 const wchar_t* from_end = from + m;
108 if(from_end > real_from_end)
109 from_end = real_from_end;
110 if(to_end == to)
111 {
112 to_end = to + n;
113 }
114
115 std::codecvt_base::result r = cvt.out(mb, from, from_end, from_next, to, to_end, to_next);
116 if(r == cvt_type::partial)
117 {
118 // If those are equal, then "partial" probably means: Need more input
119 // Otherwise "Need more output"
120 if(from_next != from_end)
121 {
122 TEST(to_end - to_next < cvt.max_length());
123 to_end += n;
124 if(to_end > real_to_end)
125 to_end = real_to_end;
126 }
127 } else
128 {
129 TEST(r == cvt_type::ok);
130 }
131
132 while(to != to_next)
133 {
134 TEST(*nptr == *to);
135 nptr++;
136 to++;
137 }
138 from = from_next;
139 }
140 TEST(nptr == utf8_name + u8len);
141 TEST(from_next == real_from_end);
142 TEST(cvt.unshift(mb, to, to + n, to_next) == cvt_type::ok);
143 TEST(to_next == to);
144 }
145
test_codecvt_conv()146 void test_codecvt_conv()
147 {
148 std::cout << "Conversions " << std::endl;
149 std::locale l(std::locale::classic(), new boost::nowide::utf8_codecvt<wchar_t>());
150
151 const cvt_type& cvt = std::use_facet<cvt_type>(l);
152 const size_t utf8_len = std::strlen(utf8_name);
153 const size_t wide_len = std::wcslen(wide_name);
154
155 for(size_t i = 1; i <= utf8_len + 1; i++)
156 {
157 for(size_t j = 1; j <= wide_len + 1; j++)
158 {
159 try
160 {
161 test_codecvt_in_n_m(cvt, i, j);
162 test_codecvt_out_n_m(cvt, i, j);
163 } catch(...)
164 {
165 std::cerr << "Wlen=" << j << " Nlen=" << i << std::endl;
166 throw;
167 }
168 }
169 }
170 }
171
test_codecvt_err()172 void test_codecvt_err()
173 {
174 std::cout << "Errors " << std::endl;
175 std::locale l(std::locale::classic(), new boost::nowide::utf8_codecvt<wchar_t>());
176
177 const cvt_type& cvt = std::use_facet<cvt_type>(l);
178
179 std::cout << "- UTF-8" << std::endl;
180 {
181 {
182 wchar_t buf[4];
183 wchar_t* const to = buf;
184 wchar_t* const to_end = buf + 4;
185 const char* err_utf = "1\xFF\xFF\xd7\xa9";
186 std::mbstate_t mb = std::mbstate_t();
187 const char* from = err_utf;
188 const char* from_end = from + std::strlen(from);
189 const char* from_next = from;
190 wchar_t* to_next = to;
191 TEST(cvt.in(mb, from, from_end, from_next, to, to_end, to_next) == cvt_type::ok);
192 TEST(from_next == from + 5);
193 TEST(to_next == to + 4);
194 TEST(std::wstring(to, to_end) == boost::nowide::widen(err_utf));
195 }
196 {
197 wchar_t buf[4];
198 wchar_t* const to = buf;
199 wchar_t* const to_end = buf + 4;
200 const char* err_utf = "1\xd7"; // 1 valid, 1 incomplete UTF-8 char
201 std::mbstate_t mb = std::mbstate_t();
202 const char* from = err_utf;
203 const char* from_end = from + std::strlen(from);
204 const char* from_next = from;
205 wchar_t* to_next = to;
206 TEST(cvt.in(mb, from, from_end, from_next, to, to_end, to_next) == cvt_type::partial);
207 TEST(from_next == from + 1);
208 TEST(to_next == to + 1);
209 TEST(std::wstring(to, to_next) == std::wstring(L"1"));
210 }
211 {
212 char buf[4] = {};
213 char* const to = buf;
214 char* const to_end = buf + 4;
215 char* to_next = to;
216 const wchar_t* err_utf = L"\xD800"; // Trailing UTF-16 surrogate
217 std::mbstate_t mb = std::mbstate_t();
218 const wchar_t* from = err_utf;
219 const wchar_t* from_end = from + 1;
220 const wchar_t* from_next = from;
221 cvt_type::result res = cvt.out(mb, from, from_end, from_next, to, to_end, to_next);
222 #ifdef BOOST_MSVC
223 #pragma warning(disable : 4127) // Constant expression detected
224 #endif
225 if(sizeof(wchar_t) == 2)
226 {
227 TEST(res == cvt_type::partial);
228 TEST(from_next == from_end);
229 TEST(to_next == to);
230 TEST(buf[0] == 0);
231 } else
232 {
233 TEST(res == cvt_type::ok);
234 TEST(from_next == from_end);
235 TEST(to_next == to + 3);
236 // surrogate is invalid
237 TEST(std::string(to, to_next) == boost::nowide::narrow(wreplacement_str));
238 }
239 }
240 }
241
242 std::cout << "- UTF-16/32" << std::endl;
243 {
244 char buf[32];
245 char* to = buf;
246 char* to_end = buf + 32;
247 char* to_next = to;
248 wchar_t err_buf[3] = {'1', 0xDC9E, 0}; // second surrogate not works both for UTF-16 and 32
249 const wchar_t* err_utf = err_buf;
250 {
251 std::mbstate_t mb = std::mbstate_t();
252 const wchar_t* from = err_utf;
253 const wchar_t* from_end = from + std::wcslen(from);
254 const wchar_t* from_next = from;
255 TEST(cvt.out(mb, from, from_end, from_next, to, to_end, to_next) == cvt_type::ok);
256 TEST(from_next == from + 2);
257 TEST(to_next == to + 4);
258 TEST(std::string(to, to_next) == "1" + boost::nowide::narrow(wreplacement_str));
259 }
260 }
261 }
262
codecvt_to_wide(const std::string & s)263 std::wstring codecvt_to_wide(const std::string& s)
264 {
265 std::locale l(std::locale::classic(), new boost::nowide::utf8_codecvt<wchar_t>());
266
267 const cvt_type& cvt = std::use_facet<cvt_type>(l);
268
269 std::mbstate_t mb = std::mbstate_t();
270 const char* const from = s.c_str();
271 const char* const from_end = from + s.size();
272 const char* from_next = from;
273
274 std::vector<wchar_t> buf(s.size() + 2); // +1 for possible incomplete char, +1 for NULL
275 wchar_t* const to = &buf[0];
276 wchar_t* const to_end = to + buf.size();
277 wchar_t* to_next = to;
278
279 cvt_type::result res = cvt.in(mb, from, from_end, from_next, to, to_end, to_next);
280 if(res == cvt_type::partial)
281 {
282 TEST(to_next < to_end);
283 *(to_next++) = BOOST_NOWIDE_REPLACEMENT_CHARACTER;
284 } else
285 TEST(res == cvt_type::ok);
286
287 return std::wstring(to, to_next);
288 }
289
codecvt_to_narrow(const std::wstring & s)290 std::string codecvt_to_narrow(const std::wstring& s)
291 {
292 std::locale l(std::locale::classic(), new boost::nowide::utf8_codecvt<wchar_t>());
293
294 const cvt_type& cvt = std::use_facet<cvt_type>(l);
295
296 std::mbstate_t mb = std::mbstate_t();
297 const wchar_t* const from = s.c_str();
298 const wchar_t* const from_end = from + s.size();
299 const wchar_t* from_next = from;
300
301 std::vector<char> buf((s.size() + 1) * 4 + 1); // +1 for possible incomplete char, +1 for NULL
302 char* const to = &buf[0];
303 char* const to_end = to + buf.size();
304 char* to_next = to;
305
306 cvt_type::result res = cvt.out(mb, from, from_end, from_next, to, to_end, to_next);
307 if(res == cvt_type::partial)
308 {
309 TEST(to_next < to_end);
310 return std::string(to, to_next) + boost::nowide::narrow(wreplacement_str);
311 } else
312 TEST(res == cvt_type::ok);
313
314 return std::string(to, to_next);
315 }
316
test_codecvt_subst()317 void test_codecvt_subst()
318 {
319 std::cout << "Substitutions " << std::endl;
320 run_all(codecvt_to_wide, codecvt_to_narrow);
321 }
322
test_main(int,char **,char **)323 void test_main(int, char**, char**)
324 {
325 test_codecvt_conv();
326 test_codecvt_err();
327 test_codecvt_subst();
328 }
329