1/////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8 2// utf8_codecvt_facet.ipp 3 4// Copyright (c) 2001 Ronald Garcia, Indiana University (garcia@osl.iu.edu) 5// Andrew Lumsdaine, Indiana University (lums@osl.iu.edu). 6// Use, modification and distribution is subject to the Boost Software 7// License, Version 1.0. (See accompanying file LICENSE_1_0.txt or copy at 8// http://www.boost.org/LICENSE_1_0.txt) 9 10// Please see the comments in <boost/detail/utf8_codecvt_facet.hpp> to 11// learn how this file should be used. 12 13#include <boost/detail/utf8_codecvt_facet.hpp> 14 15#include <cstdlib> // for multi-byte converson routines 16#include <cassert> 17 18#include <boost/limits.hpp> 19#include <boost/config.hpp> 20 21// If we don't have wstring, then Unicode support 22// is not available anyway, so we don't need to even 23// compiler this file. This also fixes the problem 24// with mingw, which can compile this file, but will 25// generate link error when building DLL. 26#ifndef BOOST_NO_STD_WSTRING 27 28BOOST_UTF8_BEGIN_NAMESPACE 29 30/////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8 31// implementation for wchar_t 32 33utf8_codecvt_facet::utf8_codecvt_facet( 34 std::size_t no_locale_manage 35) : 36 std::codecvt<wchar_t, char, std::mbstate_t>(no_locale_manage) 37{} 38 39utf8_codecvt_facet::~utf8_codecvt_facet() 40{} 41 42// Translate incoming UTF-8 into UCS-4 43std::codecvt_base::result utf8_codecvt_facet::do_in( 44 std::mbstate_t& /*state*/, 45 const char * from, 46 const char * from_end, 47 const char * & from_next, 48 wchar_t * to, 49 wchar_t * to_end, 50 wchar_t * & to_next 51) const { 52 // Basic algorithm: The first octet determines how many 53 // octets total make up the UCS-4 character. The remaining 54 // "continuing octets" all begin with "10". To convert, subtract 55 // the amount that specifies the number of octets from the first 56 // octet. Subtract 0x80 (1000 0000) from each continuing octet, 57 // then mash the whole lot together. Note that each continuing 58 // octet only uses 6 bits as unique values, so only shift by 59 // multiples of 6 to combine. 60 while (from != from_end && to != to_end) { 61 62 // Error checking on the first octet 63 if (invalid_leading_octet(*from)){ 64 from_next = from; 65 to_next = to; 66 return std::codecvt_base::error; 67 } 68 69 // The first octet is adjusted by a value dependent upon 70 // the number of "continuing octets" encoding the character 71 const int cont_octet_count = get_cont_octet_count(*from); 72 const wchar_t octet1_modifier_table[] = { 73 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc 74 }; 75 76 // The unsigned char conversion is necessary in case char is 77 // signed (I learned this the hard way) 78 wchar_t ucs_result = 79 (unsigned char)(*from++) - octet1_modifier_table[cont_octet_count]; 80 81 // Invariants : 82 // 1) At the start of the loop, 'i' continuing characters have been 83 // processed 84 // 2) *from points to the next continuing character to be processed. 85 int i = 0; 86 while(i != cont_octet_count && from != from_end) { 87 88 // Error checking on continuing characters 89 if (invalid_continuing_octet(*from)) { 90 from_next = from; 91 to_next = to; 92 return std::codecvt_base::error; 93 } 94 95 ucs_result *= (1 << 6); 96 97 // each continuing character has an extra (10xxxxxx)b attached to 98 // it that must be removed. 99 ucs_result += (unsigned char)(*from++) - 0x80; 100 ++i; 101 } 102 103 // If the buffer ends with an incomplete unicode character... 104 if (from == from_end && i != cont_octet_count) { 105 // rewind "from" to before the current character translation 106 from_next = from - (i+1); 107 to_next = to; 108 return std::codecvt_base::partial; 109 } 110 *to++ = ucs_result; 111 } 112 from_next = from; 113 to_next = to; 114 115 // Were we done converting or did we run out of destination space? 116 if(from == from_end) return std::codecvt_base::ok; 117 else return std::codecvt_base::partial; 118} 119 120std::codecvt_base::result utf8_codecvt_facet::do_out( 121 std::mbstate_t& /*state*/, 122 const wchar_t * from, 123 const wchar_t * from_end, 124 const wchar_t * & from_next, 125 char * to, 126 char * to_end, 127 char * & to_next 128) const 129{ 130 // RG - consider merging this table with the other one 131 const wchar_t octet1_modifier_table[] = { 132 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc 133 }; 134 135 wchar_t max_wchar = (std::numeric_limits<wchar_t>::max)(); 136 while (from != from_end && to != to_end) { 137 138 // Check for invalid UCS-4 character 139 if (*from > max_wchar) { 140 from_next = from; 141 to_next = to; 142 return std::codecvt_base::error; 143 } 144 145 int cont_octet_count = get_cont_octet_out_count(*from); 146 147 // RG - comment this formula better 148 int shift_exponent = (cont_octet_count) * 6; 149 150 // Process the first character 151 *to++ = static_cast<char>(octet1_modifier_table[cont_octet_count] + 152 (unsigned char)(*from / (1 << shift_exponent))); 153 154 // Process the continuation characters 155 // Invariants: At the start of the loop: 156 // 1) 'i' continuing octets have been generated 157 // 2) '*to' points to the next location to place an octet 158 // 3) shift_exponent is 6 more than needed for the next octet 159 int i = 0; 160 while (i != cont_octet_count && to != to_end) { 161 shift_exponent -= 6; 162 *to++ = static_cast<char>(0x80 + ((*from / (1 << shift_exponent)) % (1 << 6))); 163 ++i; 164 } 165 // If we filled up the out buffer before encoding the character 166 if(to == to_end && i != cont_octet_count) { 167 from_next = from; 168 to_next = to - (i+1); 169 return std::codecvt_base::partial; 170 } 171 ++from; 172 } 173 from_next = from; 174 to_next = to; 175 // Were we done or did we run out of destination space 176 if(from == from_end) return std::codecvt_base::ok; 177 else return std::codecvt_base::partial; 178} 179 180// How many char objects can I process to get <= max_limit 181// wchar_t objects? 182int utf8_codecvt_facet::do_length( 183 std::mbstate_t &, 184 const char * from, 185 const char * from_end, 186 std::size_t max_limit 187) const 188#if BOOST_WORKAROUND(__IBMCPP__, BOOST_TESTED_AT(600)) 189 throw() 190#endif 191{ 192 // RG - this code is confusing! I need a better way to express it. 193 // and test cases. 194 195 // Invariants: 196 // 1) last_octet_count has the size of the last measured character 197 // 2) char_count holds the number of characters shown to fit 198 // within the bounds so far (no greater than max_limit) 199 // 3) from_next points to the octet 'last_octet_count' before the 200 // last measured character. 201 int last_octet_count=0; 202 std::size_t char_count = 0; 203 const char* from_next = from; 204 // Use "<" because the buffer may represent incomplete characters 205 while (from_next+last_octet_count <= from_end && char_count <= max_limit) { 206 from_next += last_octet_count; 207 last_octet_count = (get_octet_count(*from_next)); 208 ++char_count; 209 } 210 return static_cast<int>(from_next-from); 211} 212 213unsigned int utf8_codecvt_facet::get_octet_count( 214 unsigned char lead_octet 215){ 216 // if the 0-bit (MSB) is 0, then 1 character 217 if (lead_octet <= 0x7f) return 1; 218 219 // Otherwise the count number of consecutive 1 bits starting at MSB 220// assert(0xc0 <= lead_octet && lead_octet <= 0xfd); 221 222 if (0xc0 <= lead_octet && lead_octet <= 0xdf) return 2; 223 else if (0xe0 <= lead_octet && lead_octet <= 0xef) return 3; 224 else if (0xf0 <= lead_octet && lead_octet <= 0xf7) return 4; 225 else if (0xf8 <= lead_octet && lead_octet <= 0xfb) return 5; 226 else return 6; 227} 228 229namespace detail { 230 231template<std::size_t s> 232int get_cont_octet_out_count_impl(wchar_t word){ 233 if (word < 0x80) { 234 return 0; 235 } 236 if (word < 0x800) { 237 return 1; 238 } 239 return 2; 240} 241 242template<> 243int get_cont_octet_out_count_impl<4>(wchar_t word){ 244 if (word < 0x80) { 245 return 0; 246 } 247 if (word < 0x800) { 248 return 1; 249 } 250 251 // Note that the following code will generate warnings on some platforms 252 // where wchar_t is defined as UCS2. The warnings are superfluous as the 253 // specialization is never instantitiated with such compilers, but this 254 // can cause problems if warnings are being treated as errors, so we guard 255 // against that. Including <boost/detail/utf8_codecvt_facet.hpp> as we do 256 // should be enough to get WCHAR_MAX defined. 257#if !defined(WCHAR_MAX) 258# error WCHAR_MAX not defined! 259#endif 260 // cope with VC++ 7.1 or earlier having invalid WCHAR_MAX 261#if defined(_MSC_VER) && _MSC_VER <= 1310 // 7.1 or earlier 262 return 2; 263#elif WCHAR_MAX > 0x10000 264 265 if (word < 0x10000) { 266 return 2; 267 } 268 if (word < 0x200000) { 269 return 3; 270 } 271 if (word < 0x4000000) { 272 return 4; 273 } 274 return 5; 275 276#else 277 return 2; 278#endif 279} 280 281} // namespace detail 282 283// How many "continuing octets" will be needed for this word 284// == total octets - 1. 285int utf8_codecvt_facet::get_cont_octet_out_count( 286 wchar_t word 287) const { 288 return detail::get_cont_octet_out_count_impl<sizeof(wchar_t)>(word); 289} 290BOOST_UTF8_END_NAMESPACE 291 292#endif 293