• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 //
2 //  Copyright (c) 2015 Artyom Beilis (Tonkikh)
3 //
4 //  Distributed under the Boost Software License, Version 1.0. (See
5 //  accompanying file LICENSE or copy at
6 //  http://www.boost.org/LICENSE_1_0.txt)
7 //
8 
9 #include <boost/nowide/utf8_codecvt.hpp>
10 
11 #include <boost/nowide/convert.hpp>
12 #include <cstring>
13 #include <iomanip>
14 #include <iostream>
15 #include <locale>
16 #include <vector>
17 
18 #include "test.hpp"
19 #include "test_sets.hpp"
20 
21 static const char* utf8_name =
22   "\xf0\x9d\x92\x9e-\xD0\xBF\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82-\xE3\x82\x84\xE3\x81\x82.txt";
23 static const std::wstring wide_name_str = boost::nowide::widen(utf8_name);
24 static const wchar_t* wide_name = wide_name_str.c_str();
25 
26 typedef std::codecvt<wchar_t, char, std::mbstate_t> cvt_type;
27 
test_codecvt_in_n_m(const cvt_type & cvt,size_t n,size_t m)28 void test_codecvt_in_n_m(const cvt_type& cvt, size_t n, size_t m)
29 {
30     const wchar_t* wptr = wide_name;
31     size_t wlen = std::wcslen(wide_name);
32     size_t u8len = std::strlen(utf8_name);
33     const char* from = utf8_name;
34     const char* end = from;
35     const char* real_end = utf8_name + u8len;
36     const char* from_next = from;
37     std::mbstate_t mb = std::mbstate_t();
38     while(from_next < real_end)
39     {
40         if(from == end)
41         {
42             end = from + n;
43             if(end > real_end)
44                 end = real_end;
45         }
46 
47         wchar_t buf[128];
48         wchar_t* to = buf;
49         wchar_t* to_end = to + m;
50         wchar_t* to_next = to;
51 
52         std::mbstate_t mb2 = mb;
53         std::codecvt_base::result r = cvt.in(mb, from, end, from_next, to, to_end, to_next);
54 
55         int count = cvt.length(mb2, from, end, to_end - to);
56 #ifndef BOOST_NOWIDE_DO_LENGTH_MBSTATE_CONST
57         TEST(std::memcmp(&mb, &mb2, sizeof(mb)) == 0);
58         if(count != from_next - from)
59         {
60             std::cout << count << " " << from_next - from << std::endl;
61         }
62         TEST(count == from_next - from);
63 #else
64         TEST(count == to_next - to);
65 #endif
66 
67         if(r == cvt_type::partial)
68         {
69             end += n;
70             if(end > real_end)
71                 end = real_end;
72         } else
73             TEST(r == cvt_type::ok);
74         while(to != to_next)
75         {
76             TEST(*wptr == *to);
77             wptr++;
78             to++;
79         }
80         to = to_next;
81         from = from_next;
82     }
83     TEST(wptr == wide_name + wlen);
84     TEST(from == real_end);
85 }
86 
test_codecvt_out_n_m(const cvt_type & cvt,size_t n,size_t m)87 void test_codecvt_out_n_m(const cvt_type& cvt, size_t n, size_t m)
88 {
89     const char* nptr = utf8_name;
90     size_t wlen = std::wcslen(wide_name);
91     size_t u8len = std::strlen(utf8_name);
92 
93     std::mbstate_t mb = std::mbstate_t();
94 
95     const wchar_t* from_next = wide_name;
96     const wchar_t* real_from_end = wide_name + wlen;
97 
98     char buf[256];
99     char* to = buf;
100     char* to_next = to;
101     char* to_end = to + n;
102     char* real_to_end = buf + sizeof(buf);
103 
104     while(from_next < real_from_end)
105     {
106         const wchar_t* from = from_next;
107         const wchar_t* from_end = from + m;
108         if(from_end > real_from_end)
109             from_end = real_from_end;
110         if(to_end == to)
111         {
112             to_end = to + n;
113         }
114 
115         std::codecvt_base::result r = cvt.out(mb, from, from_end, from_next, to, to_end, to_next);
116         if(r == cvt_type::partial)
117         {
118             // If those are equal, then "partial" probably means: Need more input
119             // Otherwise "Need more output"
120             if(from_next != from_end)
121             {
122                 TEST(to_end - to_next < cvt.max_length());
123                 to_end += n;
124                 if(to_end > real_to_end)
125                     to_end = real_to_end;
126             }
127         } else
128         {
129             TEST(r == cvt_type::ok);
130         }
131 
132         while(to != to_next)
133         {
134             TEST(*nptr == *to);
135             nptr++;
136             to++;
137         }
138         from = from_next;
139     }
140     TEST(nptr == utf8_name + u8len);
141     TEST(from_next == real_from_end);
142     TEST(cvt.unshift(mb, to, to + n, to_next) == cvt_type::ok);
143     TEST(to_next == to);
144 }
145 
test_codecvt_conv()146 void test_codecvt_conv()
147 {
148     std::cout << "Conversions " << std::endl;
149     std::locale l(std::locale::classic(), new boost::nowide::utf8_codecvt<wchar_t>());
150 
151     const cvt_type& cvt = std::use_facet<cvt_type>(l);
152     const size_t utf8_len = std::strlen(utf8_name);
153     const size_t wide_len = std::wcslen(wide_name);
154 
155     for(size_t i = 1; i <= utf8_len + 1; i++)
156     {
157         for(size_t j = 1; j <= wide_len + 1; j++)
158         {
159             try
160             {
161                 test_codecvt_in_n_m(cvt, i, j);
162                 test_codecvt_out_n_m(cvt, i, j);
163             } catch(...)
164             {
165                 std::cerr << "Wlen=" << j << " Nlen=" << i << std::endl;
166                 throw;
167             }
168         }
169     }
170 }
171 
test_codecvt_err()172 void test_codecvt_err()
173 {
174     std::cout << "Errors " << std::endl;
175     std::locale l(std::locale::classic(), new boost::nowide::utf8_codecvt<wchar_t>());
176 
177     const cvt_type& cvt = std::use_facet<cvt_type>(l);
178 
179     std::cout << "- UTF-8" << std::endl;
180     {
181         {
182             wchar_t buf[4];
183             wchar_t* const to = buf;
184             wchar_t* const to_end = buf + 4;
185             const char* err_utf = "1\xFF\xFF\xd7\xa9";
186             std::mbstate_t mb = std::mbstate_t();
187             const char* from = err_utf;
188             const char* from_end = from + std::strlen(from);
189             const char* from_next = from;
190             wchar_t* to_next = to;
191             TEST(cvt.in(mb, from, from_end, from_next, to, to_end, to_next) == cvt_type::ok);
192             TEST(from_next == from + 5);
193             TEST(to_next == to + 4);
194             TEST(std::wstring(to, to_end) == boost::nowide::widen(err_utf));
195         }
196         {
197             wchar_t buf[4];
198             wchar_t* const to = buf;
199             wchar_t* const to_end = buf + 4;
200             const char* err_utf = "1\xd7"; // 1 valid, 1 incomplete UTF-8 char
201             std::mbstate_t mb = std::mbstate_t();
202             const char* from = err_utf;
203             const char* from_end = from + std::strlen(from);
204             const char* from_next = from;
205             wchar_t* to_next = to;
206             TEST(cvt.in(mb, from, from_end, from_next, to, to_end, to_next) == cvt_type::partial);
207             TEST(from_next == from + 1);
208             TEST(to_next == to + 1);
209             TEST(std::wstring(to, to_next) == std::wstring(L"1"));
210         }
211         {
212             char buf[4] = {};
213             char* const to = buf;
214             char* const to_end = buf + 4;
215             char* to_next = to;
216             const wchar_t* err_utf = L"\xD800"; // Trailing UTF-16 surrogate
217             std::mbstate_t mb = std::mbstate_t();
218             const wchar_t* from = err_utf;
219             const wchar_t* from_end = from + 1;
220             const wchar_t* from_next = from;
221             cvt_type::result res = cvt.out(mb, from, from_end, from_next, to, to_end, to_next);
222 #ifdef BOOST_MSVC
223 #pragma warning(disable : 4127) // Constant expression detected
224 #endif
225             if(sizeof(wchar_t) == 2)
226             {
227                 TEST(res == cvt_type::partial);
228                 TEST(from_next == from_end);
229                 TEST(to_next == to);
230                 TEST(buf[0] == 0);
231             } else
232             {
233                 TEST(res == cvt_type::ok);
234                 TEST(from_next == from_end);
235                 TEST(to_next == to + 3);
236                 // surrogate is invalid
237                 TEST(std::string(to, to_next) == boost::nowide::narrow(wreplacement_str));
238             }
239         }
240     }
241 
242     std::cout << "- UTF-16/32" << std::endl;
243     {
244         char buf[32];
245         char* to = buf;
246         char* to_end = buf + 32;
247         char* to_next = to;
248         wchar_t err_buf[3] = {'1', 0xDC9E, 0}; // second surrogate not works both for UTF-16 and 32
249         const wchar_t* err_utf = err_buf;
250         {
251             std::mbstate_t mb = std::mbstate_t();
252             const wchar_t* from = err_utf;
253             const wchar_t* from_end = from + std::wcslen(from);
254             const wchar_t* from_next = from;
255             TEST(cvt.out(mb, from, from_end, from_next, to, to_end, to_next) == cvt_type::ok);
256             TEST(from_next == from + 2);
257             TEST(to_next == to + 4);
258             TEST(std::string(to, to_next) == "1" + boost::nowide::narrow(wreplacement_str));
259         }
260     }
261 }
262 
codecvt_to_wide(const std::string & s)263 std::wstring codecvt_to_wide(const std::string& s)
264 {
265     std::locale l(std::locale::classic(), new boost::nowide::utf8_codecvt<wchar_t>());
266 
267     const cvt_type& cvt = std::use_facet<cvt_type>(l);
268 
269     std::mbstate_t mb = std::mbstate_t();
270     const char* const from = s.c_str();
271     const char* const from_end = from + s.size();
272     const char* from_next = from;
273 
274     std::vector<wchar_t> buf(s.size() + 2); // +1 for possible incomplete char, +1 for NULL
275     wchar_t* const to = &buf[0];
276     wchar_t* const to_end = to + buf.size();
277     wchar_t* to_next = to;
278 
279     cvt_type::result res = cvt.in(mb, from, from_end, from_next, to, to_end, to_next);
280     if(res == cvt_type::partial)
281     {
282         TEST(to_next < to_end);
283         *(to_next++) = BOOST_NOWIDE_REPLACEMENT_CHARACTER;
284     } else
285         TEST(res == cvt_type::ok);
286 
287     return std::wstring(to, to_next);
288 }
289 
codecvt_to_narrow(const std::wstring & s)290 std::string codecvt_to_narrow(const std::wstring& s)
291 {
292     std::locale l(std::locale::classic(), new boost::nowide::utf8_codecvt<wchar_t>());
293 
294     const cvt_type& cvt = std::use_facet<cvt_type>(l);
295 
296     std::mbstate_t mb = std::mbstate_t();
297     const wchar_t* const from = s.c_str();
298     const wchar_t* const from_end = from + s.size();
299     const wchar_t* from_next = from;
300 
301     std::vector<char> buf((s.size() + 1) * 4 + 1); // +1 for possible incomplete char, +1 for NULL
302     char* const to = &buf[0];
303     char* const to_end = to + buf.size();
304     char* to_next = to;
305 
306     cvt_type::result res = cvt.out(mb, from, from_end, from_next, to, to_end, to_next);
307     if(res == cvt_type::partial)
308     {
309         TEST(to_next < to_end);
310         return std::string(to, to_next) + boost::nowide::narrow(wreplacement_str);
311     } else
312         TEST(res == cvt_type::ok);
313 
314     return std::string(to, to_next);
315 }
316 
test_codecvt_subst()317 void test_codecvt_subst()
318 {
319     std::cout << "Substitutions " << std::endl;
320     run_all(codecvt_to_wide, codecvt_to_narrow);
321 }
322 
test_main(int,char **,char **)323 void test_main(int, char**, char**)
324 {
325     test_codecvt_conv();
326     test_codecvt_err();
327     test_codecvt_subst();
328 }
329