1 /////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8
2 // utf8_codecvt_facet.cpp
3
4 // Copyright (c) 2001 Ronald Garcia, Indiana University (garcia@osl.iu.edu)
5 // Andrew Lumsdaine, Indiana University (lums@osl.iu.edu).
6 // Distributed under the Boost Software License, Version 1.0. (See accompany-
7 // ing file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
8
9 // See http://www.boost.org/libs/iostreams for documentation.
10
11 //#include <cstdlib> // for multi-byte conversion routines
12
13 // Jonathan Turkanis:
14 // - Replaced test for BOOST_NO_STD_WSTREAMBUF with test for
15 // BOOST_IOSTREAMS_NO_WIDE_STREAMS;
16 // - Derived from codecvt_helper instead of codecvt.
17
18 #include <boost/config.hpp>
19 #include <boost/iostreams/detail/config/wide_streams.hpp>
20 #include <boost/numeric/conversion/cast.hpp>
21 #ifdef BOOST_IOSTREAMS_NO_LOCALES
22 # error "C++ locales not supported on this platform"
23 #else
24
25 #include <cassert>
26 #include <cstddef>
27
28 #include <boost/detail/workaround.hpp>
29 #include "./utf8_codecvt_facet.hpp"
30
31 #if BOOST_WORKAROUND(__BORLANDC__, <= 0x600)
32 # pragma warn -sig // Conversion may lose significant digits
33 # pragma warn -rng // Constant is out of range in comparison
34 #endif
35
36 /////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8
37 // implementation for wchar_t
38
39 // Translate incoming UTF-8 into UCS-4
do_in(std::mbstate_t &,const char * from,const char * from_end,const char * & from_next,wchar_t * to,wchar_t * to_end,wchar_t * & to_next) const40 std::codecvt_base::result utf8_codecvt_facet_wchar_t::do_in(
41 std::mbstate_t&,
42 const char * from,
43 const char * from_end,
44 const char * & from_next,
45 wchar_t * to,
46 wchar_t * to_end,
47 wchar_t * & to_next
48 ) const {
49 // Basic algorithm: The first octet determines how many
50 // octets total make up the UCS-4 character. The remaining
51 // "continuing octets" all begin with "10". To convert, subtract
52 // the amount that specifies the number of octets from the first
53 // octet. Subtract 0x80 (1000 0000) from each continuing octet,
54 // then mash the whole lot together. Note that each continuing
55 // octet only uses 6 bits as unique values, so only shift by
56 // multiples of 6 to combine.
57 while (from != from_end && to != to_end) {
58
59 // Error checking on the first octet
60 if (invalid_leading_octet(*from)){
61 from_next = from;
62 to_next = to;
63 return std::codecvt_base::error;
64 }
65
66 // The first octet is adjusted by a value dependent upon
67 // the number of "continuing octets" encoding the character
68 const int cont_octet_count = get_cont_octet_count(*from);
69 const wchar_t octet1_modifier_table[] = {
70 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc
71 };
72
73 // The unsigned char conversion is necessary in case char is
74 // signed (I learned this the hard way)
75 wchar_t ucs_result =
76 (unsigned char)(*from++) - octet1_modifier_table[cont_octet_count];
77
78 // Invariants :
79 // 1) At the start of the loop, 'i' continuing characters have been
80 // processed
81 // 2) *from points to the next continuing character to be processed.
82 int i = 0;
83 while(i != cont_octet_count && from != from_end) {
84
85 // Error checking on continuing characters
86 if (invalid_continuing_octet(*from)) {
87 from_next = from;
88 to_next = to;
89 return std::codecvt_base::error;
90 }
91
92 ucs_result *= (1 << 6);
93
94 // each continuing character has an extra (10xxxxxx)b attached to
95 // it that must be removed.
96 ucs_result += (unsigned char)(*from++) - 0x80;
97 ++i;
98 }
99
100 // If the buffer ends with an incomplete unicode character...
101 if (from == from_end && i != cont_octet_count) {
102 // rewind "from" to before the current character translation
103 from_next = from - (i+1);
104 to_next = to;
105 return std::codecvt_base::partial;
106 }
107 *to++ = ucs_result;
108 }
109 from_next = from;
110 to_next = to;
111
112 // Were we done converting or did we run out of destination space?
113 if(from == from_end) return std::codecvt_base::ok;
114 else return std::codecvt_base::partial;
115 }
116
do_out(std::mbstate_t &,const wchar_t * from,const wchar_t * from_end,const wchar_t * & from_next,char * to,char * to_end,char * & to_next) const117 std::codecvt_base::result utf8_codecvt_facet_wchar_t::do_out(
118 std::mbstate_t &,
119 const wchar_t * from,
120 const wchar_t * from_end,
121 const wchar_t * & from_next,
122 char * to,
123 char * to_end,
124 char * & to_next
125 ) const
126 {
127 // RG - consider merging this table with the other one
128 const wchar_t octet1_modifier_table[] = {
129 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc
130 };
131
132 while (from != from_end && to != to_end) {
133
134 #define BOOST_NULL // Prevent macro expansion
135 // Check for invalid UCS-4 character
136 if (*from > std::numeric_limits<wchar_t>::max BOOST_NULL ()) {
137 from_next = from;
138 to_next = to;
139 return std::codecvt_base::error;
140 }
141 #undef BOOST_NULL
142
143 int cont_octet_count = get_cont_octet_out_count(*from);
144
145 // RG - comment this formula better
146 int shift_exponent = (cont_octet_count) * 6;
147
148 // Process the first character
149 *to++ = octet1_modifier_table[cont_octet_count] +
150 (unsigned char)(*from / (1 << shift_exponent));
151
152 // Process the continuation characters
153 // Invariants: At the start of the loop:
154 // 1) 'i' continuing octets have been generated
155 // 2) '*to' points to the next location to place an octet
156 // 3) shift_exponent is 6 more than needed for the next octet
157 int i = 0;
158 while (i != cont_octet_count && to != to_end) {
159 shift_exponent -= 6;
160 *to++ = 0x80 + ((*from / (1 << shift_exponent)) % (1 << 6));
161 ++i;
162 }
163 // If we filled up the out buffer before encoding the character
164 if(to == to_end && i != cont_octet_count) {
165 from_next = from;
166 to_next = to - (i+1);
167 return std::codecvt_base::partial;
168 }
169 ++from;
170 }
171 from_next = from;
172 to_next = to;
173 // Were we done or did we run out of destination space
174 if(from == from_end) return std::codecvt_base::ok;
175 else return std::codecvt_base::partial;
176 }
177
178 // How many char objects can I process to get <= max_limit
179 // wchar_t objects?
do_length(BOOST_IOSTREAMS_CODECVT_CV_QUALIFIER std::mbstate_t &,const char * from,const char * from_end,std::size_t max_limit) const180 int utf8_codecvt_facet_wchar_t::do_length(
181 BOOST_IOSTREAMS_CODECVT_CV_QUALIFIER std::mbstate_t &,
182 const char * from,
183 const char * from_end,
184 std::size_t max_limit
185 ) const throw()
186 {
187 // RG - this code is confusing! I need a better way to express it.
188 // and test cases.
189
190 // Invariants:
191 // 1) last_octet_count has the size of the last measured character
192 // 2) char_count holds the number of characters shown to fit
193 // within the bounds so far (no greater than max_limit)
194 // 3) from_next points to the octet 'last_octet_count' before the
195 // last measured character.
196 int last_octet_count=0;
197 std::size_t char_count = 0;
198 const char* from_next = from;
199 // Use "<" because the buffer may represent incomplete characters
200 while (from_next+last_octet_count <= from_end && char_count <= max_limit) {
201 from_next += last_octet_count;
202 last_octet_count = (get_octet_count(*from_next));
203 ++char_count;
204 }
205 return boost::numeric_cast<int>(from_next - from_end);
206 }
207
get_octet_count(unsigned char lead_octet)208 unsigned int utf8_codecvt_facet_wchar_t::get_octet_count(
209 unsigned char lead_octet
210 ){
211 // if the 0-bit (MSB) is 0, then 1 character
212 if (lead_octet <= 0x7f) return 1;
213
214 // Otherwise the count number of consecutive 1 bits starting at MSB
215 assert(0xc0 <= lead_octet && lead_octet <= 0xfd);
216
217 if (0xc0 <= lead_octet && lead_octet <= 0xdf) return 2;
218 else if (0xe0 <= lead_octet && lead_octet <= 0xef) return 3;
219 else if (0xf0 <= lead_octet && lead_octet <= 0xf7) return 4;
220 else if (0xf8 <= lead_octet && lead_octet <= 0xfb) return 5;
221 else return 6;
222 }
223
namespace {

// Number of continuing octets needed to encode 'word' in UTF-8, selected by
// the template argument 's'. This primary template serves every value of 's'
// other than 4 and never reports more than two continuing octets.
template<std::size_t s>
int get_cont_octet_out_count_impl(wchar_t word){
    // One more continuing octet is needed for each threshold reached.
    int count = 0;
    if (word >= 0x80)  ++count;
    if (word >= 0x800) ++count;
    return count;
}

// Specialization for s == 4, covering sequences with up to five continuing
// octets.
// Note: on platforms where wchar_t is defined as UCS2 this code may generate
// warnings. They are superfluous, because the specialization is never
// instantiated with such compilers.
template<>
int get_cont_octet_out_count_impl<4>(wchar_t word)
{
    // One more continuing octet is needed for each threshold reached.
    int count = 0;
    if (word >= 0x80)      ++count;
    if (word >= 0x800)     ++count;
    if (word >= 0x10000)   ++count;
    if (word >= 0x200000)  ++count;
    if (word >= 0x4000000) ++count;
    return count;
}

} // namespace anonymous
261
262 // How many "continuing octets" will be needed for this word
263 // == total octets - 1.
get_cont_octet_out_count(wchar_t word) const264 int utf8_codecvt_facet_wchar_t::get_cont_octet_out_count(
265 wchar_t word
266 ) const {
267 return get_cont_octet_out_count_impl<sizeof(wchar_t)>(word);
268 }
269
270 #if 0 // not used?
271 /////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8
272 // implementation for char
273
// Disabled code (inside the "#if 0" region): char-to-char conversion for
// utf8_codecvt_facet_char.
// Each pass asks base_class::do_in for a single wchar_t (presumably decoding
// UTF-8 - confirm what base_class is) and then writes that wide character at
// to_next with std::wctomb.
// Returns the first result from base_class::do_in that is not ok, otherwise
// ok once from_next has reached from_end.
// NOTE(review): from_next and to_next are read before this function assigns
// them, and 'from' is never advanced between passes - confirm the intended
// starting values before re-enabling.
// NOTE(review): to_end is never consulted, so nothing here bounds the
// std::wctomb writes - confirm.
274 std::codecvt_base::result utf8_codecvt_facet_char::do_in(
275 std::mbstate_t & state,
276 const char * from,
277 const char * from_end,
278 const char * & from_next,
279 char * to,
280 char * to_end,
281 char * & to_next
282 ) const
283 {
284 while(from_next < from_end){
// One-element output buffer for the base-class conversion.
285 wchar_t w;
286 wchar_t *wnext = & w;
287 utf8_codecvt_facet_wchar_t::result ucs4_result;
288 ucs4_result = base_class::do_in(
289 state,
290 from, from_end, from_next,
291 wnext, wnext + 1, wnext
292 );
// Any result other than ok (error or partial) is handed straight back.
293 if(codecvt_base::ok != ucs4_result)
294 return ucs4_result;
295 // if the conversion succeeds.
// std::wctomb returns the number of chars written, or -1 when the wide
// character cannot be represented.
296 int length = std::wctomb(to_next, w);
297 assert(-1 != length);
298 to_next += length;
299 }
300 return codecvt_base::ok;
301 }
302
// Disabled code (inside the "#if 0" region): char-to-char conversion for
// utf8_codecvt_facet_char in the opposite direction.
// Each pass reads one multibyte character at from_next with std::mbtowc and
// hands the resulting wchar_t to base_class::do_out (presumably encoding it
// as UTF-8 - confirm what base_class is), which writes at to_next.
// Returns the first result from base_class::do_out that is not ok, otherwise
// ok once from_next has reached from_end.
// NOTE(review): from_next and to_next are read before this function assigns
// them, and the 'from' and 'to' parameters are never used - confirm the
// intended starting values before re-enabling.
// NOTE(review): MB_LENGTH_MAX is not a name defined by the standard library
// (the <climits> macro is MB_LEN_MAX) - presumably a typo; the value passed
// also lets std::mbtowc look past from_end - confirm.
303 std::codecvt_base::result utf8_codecvt_facet_char::do_out(
304 mbstate_t & state,
305 const char * from,
306 const char * from_end,
307 const char * & from_next,
308 char * to,
309 char * to_end,
310 char * & to_next
311 ) const
312 {
313 while(from_next < from_end){
314 wchar_t w;
// std::mbtowc returns the number of chars consumed, or -1 on an invalid
// sequence.
315 int result = std::mbtowc(&w, from_next, MB_LENGTH_MAX);
316 assert(-1 != result);
317 from_next += result;
318 utf8_codecvt_facet_wchar_t::result ucs4_result;
319
// One-element input range for the base-class conversion; wptr also serves
// as its from_next output argument.
320 const wchar_t *wptr = & w;
321 ucs4_result = base_class::do_out(
322 state,
323 wptr, wptr+1, wptr,
324 to_next, to_end, to_next
325 );
// Any result other than ok (error or partial) is handed straight back.
326 if(codecvt_base::ok != ucs4_result)
327 return ucs4_result;
328 }
329 return codecvt_base::ok;
330 }
331
332 // How many bytes objects can I process to get <= max_limit
333 // char objects?
// Disabled code (inside the "#if 0" region).
// Works on a copy of initial_state. Each pass converts one character with
// base_class::do_in, measures its std::wctomb length, and stops at the first
// character whose length exceeds the remaining max_limit or whose
// conversion does not return ok.
// Returns the distance from the initial from_next to the end of the last
// character that was accepted.
// NOTE(review): MB_LENGTH_MAX is not a name defined by the standard library
// (the <climits> macro is MB_LEN_MAX) - presumably a typo; confirm.
334 int utf8_codecvt_facet_char::do_length(
335 // it seems that the standard doesn't use const so these libraries
336 // would be in error
337 BOOST_IOSTREAMS_CODECVT_CV_QUALIFIER
338 utf8_codecvt_facet_wchar_t::mbstate_t & initial_state,
339 const char * from_next,
340 const char * from_end,
341 std::size_t max_limit
342 ) const
343 {
344 int total_length = 0;
// Remember the starting position; from_next itself is advanced by the
// base_class::do_in calls below.
345 const char *from = from_next;
346 mbstate_t state = initial_state;
347 while(from_next < from_end){
// One-element output buffer for the base-class conversion.
348 wchar_t w;
349 wchar_t *wnext = & w;
350 utf8_codecvt_facet_wchar_t::result ucs4_result;
351 ucs4_result = base_class::do_in(
352 state,
353 from_next, from_end, from_next,
354 wnext, wnext + 1, wnext
355 );
356
357 if(codecvt_base::ok != ucs4_result)
358 break;
359
// A std::wctomb failure (-1) becomes a very large std::size_t here, so it
// also leaves the loop through the check below.
360 char carray[MB_LENGTH_MAX];
361 std::size_t count = wctomb(carray, w);
362 if(count > max_limit)
363 break;
364
365 max_limit -= count;
366 total_length = from_next - from;
367 }
368 return total_length;
369 }
370 #endif
371
372 #endif // BOOST_IOSTREAMS_NO_LOCALES
373