1 // Copyright (c) 2001 Ronald Garcia, Indiana University (garcia@osl.iu.edu) 2 // Andrew Lumsdaine, Indiana University (lums@osl.iu.edu). 3 // Distributed under the Boost Software License, Version 1.0. (See accompany- 4 // ing file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) 5 6 #ifndef BOOST_UTF8_CODECVT_FACET_HPP 7 #define BOOST_UTF8_CODECVT_FACET_HPP 8 9 // MS compatible compilers support #pragma once 10 #if defined(_MSC_VER) && (_MSC_VER >= 1020) 11 # pragma once 12 #endif 13 14 /////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8 15 // utf8_codecvt_facet.hpp 16 17 // This header defines class utf8_codecvt_facet, derived from 18 // std::codecvt<wchar_t, char>, which can be used to convert utf8 data in 19 // files into wchar_t strings in the application. 20 // 21 // The header is NOT STANDALONE, and is not to be included by the USER. 22 // There are at least two libraries which want to use this functionality, and 23 // we want to avoid code duplication. It would be possible to create utf8 24 // library, but: 25 // - this requires review process first 26 // - in the case, when linking the a library which uses utf8 27 // (say 'program_options'), user should also link to the utf8 library. 28 // This seems inconvenient, and asking a user to link to an unrevieved 29 // library is strange. 30 // Until the above points are fixed, a library which wants to use utf8 must: 31 // - include this header in one of it's headers or sources 32 // - include the corresponding boost/detail/utf8_codecvt_facet.ipp file in one 33 // of its sources 34 // - before including either file, the library must define 35 // - BOOST_UTF8_BEGIN_NAMESPACE to the namespace declaration that must be used 36 // - BOOST_UTF8_END_NAMESPACE to the code to close the previous namespace 37 // declaration. 38 // - BOOST_UTF8_DECL -- to the code which must be used for all 'exportable' 39 // symbols. 40 // 41 // For example, program_options library might contain: 42 // #define BOOST_UTF8_BEGIN_NAMESPACE <backslash character> 43 // namespace boost { namespace program_options { 44 // #define BOOST_UTF8_END_NAMESPACE }} 45 // #define BOOST_UTF8_DECL BOOST_PROGRAM_OPTIONS_DECL 46 // #include <boost/detail/utf8_codecvt_facet.ipp> 47 // 48 // Essentially, each library will have its own copy of utf8 code, in 49 // different namespaces. 50 51 // Note:(Robert Ramey). I have made the following alterations in the original 52 // code. 53 // a) Rendered utf8_codecvt<wchar_t, char> with using templates 54 // b) Move longer functions outside class definition to prevent inlining 55 // and make code smaller 56 // c) added on a derived class to permit translation to/from current 57 // locale to utf8 58 59 // See http://www.boost.org for updates, documentation, and revision history. 60 61 // archives stored as text - note these ar templated on the basic 62 // stream templates to accommodate wide (and other?) kind of characters 63 // 64 // note the fact that on libraries without wide characters, ostream is 65 // is not a specialization of basic_ostream which in fact is not defined 66 // in such cases. So we can't use basic_ostream<OStream::char_type> but rather 67 // use two template parameters 68 // 69 // utf8_codecvt_facet 70 // This is an implementation of a std::codecvt facet for translating 71 // from UTF-8 externally to UCS-4. Note that this is not tied to 72 // any specific types in order to allow customization on platforms 73 // where wchar_t is not big enough. 74 // 75 // NOTES: The current implementation jumps through some unpleasant hoops in 76 // order to deal with signed character types. As a std::codecvt_base::result, 77 // it is necessary for the ExternType to be convertible to unsigned char. 78 // I chose not to tie the extern_type explicitly to char. But if any combination 79 // of types other than <wchar_t,char_t> is used, then std::codecvt must be 80 // specialized on those types for this to work. 81 82 #include <locale> 83 #include <cwchar> // for mbstate_t 84 #include <cstddef> // for std::size_t 85 86 #include <boost/config.hpp> 87 #include <boost/detail/workaround.hpp> 88 89 #if defined(BOOST_NO_STDC_NAMESPACE) 90 namespace std { 91 using ::mbstate_t; 92 using ::size_t; 93 } 94 #endif 95 96 // maximum lenght of a multibyte string 97 #define MB_LENGTH_MAX 8 98 99 BOOST_UTF8_BEGIN_NAMESPACE 100 101 //----------------------------------------------------------------------------// 102 // // 103 // utf8_codecvt_facet // 104 // // 105 // See utf8_codecvt_facet.ipp for the implementation. // 106 //----------------------------------------------------------------------------// 107 108 #ifndef BOOST_UTF8_DECL 109 #define BOOST_UTF8_DECL 110 #endif 111 112 struct BOOST_UTF8_DECL utf8_codecvt_facet : 113 public std::codecvt<wchar_t, char, std::mbstate_t> 114 { 115 public: 116 explicit utf8_codecvt_facet(std::size_t no_locale_manage=0); 117 virtual ~utf8_codecvt_facet(); 118 protected: 119 virtual std::codecvt_base::result do_in( 120 std::mbstate_t& state, 121 const char * from, 122 const char * from_end, 123 const char * & from_next, 124 wchar_t * to, 125 wchar_t * to_end, 126 wchar_t*& to_next 127 ) const; 128 129 virtual std::codecvt_base::result do_out( 130 std::mbstate_t & state, 131 const wchar_t * from, 132 const wchar_t * from_end, 133 const wchar_t* & from_next, 134 char * to, 135 char * to_end, 136 char * & to_next 137 ) const; 138 invalid_continuing_octetutf8_codecvt_facet139 bool invalid_continuing_octet(unsigned char octet_1) const { 140 return (octet_1 < 0x80|| 0xbf< octet_1); 141 } 142 invalid_leading_octetutf8_codecvt_facet143 bool invalid_leading_octet(unsigned char octet_1) const { 144 return (0x7f < octet_1 && octet_1 < 0xc0) || 145 (octet_1 > 0xfd); 146 } 147 148 // continuing octets = octets except for the leading octet get_cont_octet_countutf8_codecvt_facet149 static unsigned int get_cont_octet_count(unsigned char lead_octet) { 150 return get_octet_count(lead_octet) - 1; 151 } 152 153 static unsigned int get_octet_count(unsigned char lead_octet); 154 155 // How many "continuing octets" will be needed for this word 156 // == total octets - 1. 157 int get_cont_octet_out_count(wchar_t word) const ; 158 do_always_noconvutf8_codecvt_facet159 virtual bool do_always_noconv() const BOOST_NOEXCEPT_OR_NOTHROW { 160 return false; 161 } 162 163 // UTF-8 isn't really stateful since we rewind on partial conversions do_unshiftutf8_codecvt_facet164 virtual std::codecvt_base::result do_unshift( 165 std::mbstate_t&, 166 char * from, 167 char * /*to*/, 168 char * & next 169 ) const { 170 next = from; 171 return ok; 172 } 173 do_encodingutf8_codecvt_facet174 virtual int do_encoding() const BOOST_NOEXCEPT_OR_NOTHROW { 175 const int variable_byte_external_encoding=0; 176 return variable_byte_external_encoding; 177 } 178 179 // How many char objects can I process to get <= max_limit 180 // wchar_t objects? 181 virtual int do_length( 182 std::mbstate_t &, 183 const char * from, 184 const char * from_end, 185 std::size_t max_limit 186 ) const 187 #if BOOST_WORKAROUND(__IBMCPP__, BOOST_TESTED_AT(600)) 188 throw() 189 #endif 190 ; 191 192 // Nonstandard override do_lengthutf8_codecvt_facet193 virtual int do_length( 194 const std::mbstate_t & s, 195 const char * from, 196 const char * from_end, 197 std::size_t max_limit 198 ) const 199 #if BOOST_WORKAROUND(__IBMCPP__, BOOST_TESTED_AT(600)) 200 throw() 201 #endif 202 { 203 return do_length( 204 const_cast<std::mbstate_t &>(s), 205 from, 206 from_end, 207 max_limit 208 ); 209 } 210 211 // Largest possible value do_length(state,from,from_end,1) could return. do_max_lengthutf8_codecvt_facet212 virtual int do_max_length() const BOOST_NOEXCEPT_OR_NOTHROW { 213 return 6; // largest UTF-8 encoding of a UCS-4 character 214 } 215 }; 216 217 BOOST_UTF8_END_NAMESPACE 218 219 #endif // BOOST_UTF8_CODECVT_FACET_HPP 220