1 // 2 // Copyright (c) 2009-2011 Artyom Beilis (Tonkikh) 3 // 4 // Distributed under the Boost Software License, Version 1.0. (See 5 // accompanying file LICENSE_1_0.txt or copy at 6 // http://www.boost.org/LICENSE_1_0.txt) 7 // 8 #define BOOST_LOCALE_SOURCE 9 #include <boost/locale/generator.hpp> 10 #include <boost/locale/encoding.hpp> 11 #include <boost/locale/utf8_codecvt.hpp> 12 13 #include "../encoding/conv.hpp" 14 15 #include <boost/locale/util.hpp> 16 17 #ifdef BOOST_MSVC 18 # pragma warning(disable : 4244 4996) // loose data 19 #endif 20 21 #include <cstddef> 22 #include <string.h> 23 #include <vector> 24 #include <algorithm> 25 26 //#define DEBUG_CODECVT 27 28 #ifdef DEBUG_CODECVT 29 #include <iostream> 30 #endif 31 32 namespace boost { 33 namespace locale { 34 namespace util { 35 36 class utf8_converter : public base_converter { 37 public: max_len() const38 virtual int max_len() const 39 { 40 return 4; 41 } 42 clone() const43 virtual utf8_converter *clone() const 44 { 45 return new utf8_converter(); 46 } 47 is_thread_safe() const48 bool is_thread_safe() const 49 { 50 return true; 51 } 52 to_unicode(char const * & begin,char const * end)53 virtual uint32_t to_unicode(char const *&begin,char const *end) 54 { 55 char const *p=begin; 56 57 utf::code_point c = utf::utf_traits<char>::decode(p,end); 58 59 if(c==utf::illegal) 60 return illegal; 61 62 if(c==utf::incomplete) 63 return incomplete; 64 65 begin = p; 66 return c; 67 } 68 from_unicode(uint32_t u,char * begin,char const * end)69 virtual uint32_t from_unicode(uint32_t u,char *begin,char const *end) 70 { 71 if(!utf::is_valid_codepoint(u)) 72 return illegal; 73 int width = utf::utf_traits<char>::width(u); 74 std::ptrdiff_t d=end-begin; 75 if(d < width) 76 return incomplete; 77 utf::utf_traits<char>::encode(u,begin); 78 return width; 79 } 80 }; // utf8_converter 81 82 class simple_converter_impl { 83 public: 84 85 static const int hash_table_size = 1024; 86 simple_converter_impl(std::string const & encoding)87 simple_converter_impl(std::string const &encoding) 88 { 89 for(unsigned i=0;i<128;i++) 90 to_unicode_tbl_[i]=i; 91 for(unsigned i=128;i<256;i++) { 92 char buf[2] = { char(i) , 0 }; 93 uint32_t uchar=utf::illegal; 94 try { 95 std::wstring const tmp = conv::to_utf<wchar_t>(buf,buf+1,encoding,conv::stop); 96 if(tmp.size() == 1) { 97 uchar = tmp[0]; 98 } 99 else { 100 uchar = utf::illegal; 101 } 102 } 103 catch(conv::conversion_error const &/*e*/) { 104 uchar = utf::illegal; 105 } 106 to_unicode_tbl_[i]=uchar; 107 } 108 for(int i=0;i<hash_table_size;i++) 109 from_unicode_tbl_[i]=0; 110 for(unsigned i=1;i<256;i++) { 111 if(to_unicode_tbl_[i]!=utf::illegal) { 112 unsigned pos = to_unicode_tbl_[i] % hash_table_size; 113 while(from_unicode_tbl_[pos]!=0) 114 pos = (pos + 1) % hash_table_size; 115 from_unicode_tbl_[pos] = i; 116 } 117 } 118 } 119 to_unicode(char const * & begin,char const * end) const120 uint32_t to_unicode(char const *&begin,char const *end) const 121 { 122 if(begin==end) 123 return utf::incomplete; 124 unsigned char c = *begin++; 125 return to_unicode_tbl_[c]; 126 } from_unicode(uint32_t u,char * begin,char const * end) const127 uint32_t from_unicode(uint32_t u,char *begin,char const *end) const 128 { 129 if(begin==end) 130 return utf::incomplete; 131 if(u==0) { 132 *begin = 0; 133 return 1; 134 } 135 unsigned pos = u % hash_table_size; 136 unsigned char c; 137 while((c=from_unicode_tbl_[pos])!=0 && to_unicode_tbl_[c]!=u) 138 pos = (pos + 1) % hash_table_size; 139 if(c==0) 140 return utf::illegal; 141 *begin = c; 142 return 1; 143 } 144 private: 145 uint32_t to_unicode_tbl_[256]; 146 unsigned char from_unicode_tbl_[hash_table_size]; 147 }; 148 149 class simple_converter : public base_converter { 150 public: 151 ~simple_converter()152 virtual ~simple_converter() 153 { 154 } 155 simple_converter(std::string const & encoding)156 simple_converter(std::string const &encoding) : 157 cvt_(encoding) 158 { 159 } 160 max_len() const161 virtual int max_len() const 162 { 163 return 1; 164 } 165 is_thread_safe() const166 virtual bool is_thread_safe() const 167 { 168 return true; 169 } clone() const170 virtual base_converter *clone() const 171 { 172 return new simple_converter(*this); 173 } 174 to_unicode(char const * & begin,char const * end)175 virtual uint32_t to_unicode(char const *&begin,char const *end) 176 { 177 return cvt_.to_unicode(begin,end); 178 } from_unicode(uint32_t u,char * begin,char const * end)179 virtual uint32_t from_unicode(uint32_t u,char *begin,char const *end) 180 { 181 return cvt_.from_unicode(u,begin,end); 182 } 183 private: 184 simple_converter_impl cvt_; 185 }; 186 187 template<typename CharType> 188 class simple_codecvt : public generic_codecvt<CharType,simple_codecvt<CharType> > 189 { 190 public: 191 simple_codecvt(std::string const & encoding,size_t refs=0)192 simple_codecvt(std::string const &encoding,size_t refs = 0) : 193 generic_codecvt<CharType,simple_codecvt<CharType> >(refs), 194 cvt_(encoding) 195 { 196 } 197 198 struct state_type {}; initial_state(generic_codecvt_base::initial_convertion_state)199 static state_type initial_state(generic_codecvt_base::initial_convertion_state /* unused */) 200 { 201 return state_type(); 202 } max_encoding_length()203 static int max_encoding_length() 204 { 205 return 1; 206 } 207 to_unicode(state_type &,char const * & begin,char const * end) const208 utf::code_point to_unicode(state_type &,char const *&begin,char const *end) const 209 { 210 return cvt_.to_unicode(begin,end); 211 } 212 from_unicode(state_type &,utf::code_point u,char * begin,char const * end) const213 utf::code_point from_unicode(state_type &,utf::code_point u,char *begin,char const *end) const 214 { 215 return cvt_.from_unicode(u,begin,end); 216 } 217 private: 218 simple_converter_impl cvt_; 219 220 }; 221 222 namespace { 223 char const *simple_encoding_table[] = { 224 "cp1250", 225 "cp1251", 226 "cp1252", 227 "cp1253", 228 "cp1254", 229 "cp1255", 230 "cp1256", 231 "cp1257", 232 "iso88591", 233 "iso885913", 234 "iso885915", 235 "iso88592", 236 "iso88593", 237 "iso88594", 238 "iso88595", 239 "iso88596", 240 "iso88597", 241 "iso88598", 242 "iso88599", 243 "koi8r", 244 "koi8u", 245 "usascii", 246 "windows1250", 247 "windows1251", 248 "windows1252", 249 "windows1253", 250 "windows1254", 251 "windows1255", 252 "windows1256", 253 "windows1257" 254 }; 255 compare_strings(char const * l,char const * r)256 bool compare_strings(char const *l,char const *r) 257 { 258 return strcmp(l,r) < 0; 259 } 260 } 261 check_is_simple_encoding(std::string const & encoding)262 bool check_is_simple_encoding(std::string const &encoding) 263 { 264 std::string norm = conv::impl::normalize_encoding(encoding.c_str()); 265 return std::binary_search<char const **>( simple_encoding_table, 266 simple_encoding_table + sizeof(simple_encoding_table)/sizeof(char const *), 267 norm.c_str(), 268 compare_strings); 269 return 0; 270 } 271 272 #if !defined(BOOST_LOCALE_HIDE_AUTO_PTR) && !defined(BOOST_NO_AUTO_PTR) create_utf8_converter()273 std::auto_ptr<base_converter> create_utf8_converter() 274 { 275 std::auto_ptr<base_converter> res(create_utf8_converter_new_ptr()); 276 return res; 277 } create_simple_converter(std::string const & encoding)278 std::auto_ptr<base_converter> create_simple_converter(std::string const &encoding) 279 { 280 std::auto_ptr<base_converter> res(create_simple_converter_new_ptr(encoding)); 281 return res; 282 } create_codecvt(std::locale const & in,std::auto_ptr<base_converter> cvt,character_facet_type type)283 std::locale create_codecvt(std::locale const &in,std::auto_ptr<base_converter> cvt,character_facet_type type) 284 { 285 return create_codecvt_from_pointer(in,cvt.release(),type); 286 } 287 #endif 288 #ifndef BOOST_NO_CXX11_SMART_PTR create_utf8_converter_unique_ptr()289 std::unique_ptr<base_converter> create_utf8_converter_unique_ptr() 290 { 291 std::unique_ptr<base_converter> res(create_utf8_converter_new_ptr()); 292 return res; 293 } create_simple_converter_unique_ptr(std::string const & encoding)294 std::unique_ptr<base_converter> create_simple_converter_unique_ptr(std::string const &encoding) 295 { 296 std::unique_ptr<base_converter> res(create_simple_converter_new_ptr(encoding)); 297 return res; 298 } create_codecvt(std::locale const & in,std::unique_ptr<base_converter> cvt,character_facet_type type)299 std::locale create_codecvt(std::locale const &in,std::unique_ptr<base_converter> cvt,character_facet_type type) 300 { 301 return create_codecvt_from_pointer(in,cvt.release(),type); 302 } 303 #endif 304 create_simple_converter_new_ptr(std::string const & encoding)305 base_converter *create_simple_converter_new_ptr(std::string const &encoding) 306 { 307 if(check_is_simple_encoding(encoding)) 308 return new simple_converter(encoding); 309 return 0; 310 } 311 create_utf8_converter_new_ptr()312 base_converter *create_utf8_converter_new_ptr() 313 { 314 return new utf8_converter(); 315 } 316 317 template<typename CharType> 318 class code_converter : public generic_codecvt<CharType,code_converter<CharType> > 319 { 320 public: 321 #ifndef BOOST_NO_CXX11_SMART_PTR 322 typedef std::unique_ptr<base_converter> base_converter_ptr; 323 #define PTR_TRANS(x) std::move((x)) 324 #else 325 typedef std::auto_ptr<base_converter> base_converter_ptr; 326 #define PTR_TRANS(x) (x) 327 #endif 328 typedef base_converter_ptr state_type; 329 code_converter(base_converter_ptr cvt,size_t refs=0)330 code_converter(base_converter_ptr cvt,size_t refs = 0) : 331 generic_codecvt<CharType,code_converter<CharType> >(refs), 332 cvt_(PTR_TRANS(cvt)) 333 { 334 max_len_ = cvt_->max_len(); 335 thread_safe_ = cvt_->is_thread_safe(); 336 } 337 338 max_encoding_length() const339 int max_encoding_length() const 340 { 341 return max_len_; 342 } 343 initial_state(generic_codecvt_base::initial_convertion_state) const344 base_converter_ptr initial_state(generic_codecvt_base::initial_convertion_state /* unused */) const 345 { 346 base_converter_ptr r; 347 if(!thread_safe_) 348 r.reset(cvt_->clone()); 349 return r; 350 } 351 to_unicode(base_converter_ptr & ptr,char const * & begin,char const * end) const352 utf::code_point to_unicode(base_converter_ptr &ptr,char const *&begin,char const *end) const 353 { 354 if(thread_safe_) 355 return cvt_->to_unicode(begin,end); 356 else 357 return ptr->to_unicode(begin,end); 358 } 359 from_unicode(base_converter_ptr & ptr,utf::code_point u,char * begin,char const * end) const360 utf::code_point from_unicode(base_converter_ptr &ptr,utf::code_point u,char *begin,char const *end) const 361 { 362 if(thread_safe_) 363 return cvt_->from_unicode(u,begin,end); 364 else 365 return ptr->from_unicode(u,begin,end); 366 } 367 368 private: 369 base_converter_ptr cvt_; 370 int max_len_; 371 bool thread_safe_; 372 }; 373 374 create_codecvt_from_pointer(std::locale const & in,base_converter * pcvt,character_facet_type type)375 std::locale create_codecvt_from_pointer(std::locale const &in,base_converter *pcvt,character_facet_type type) 376 { 377 code_converter<char>::base_converter_ptr cvt(pcvt); 378 if(!cvt.get()) 379 cvt.reset(new base_converter()); 380 switch(type) { 381 case char_facet: 382 return std::locale(in,new code_converter<char>(PTR_TRANS(cvt))); 383 case wchar_t_facet: 384 return std::locale(in,new code_converter<wchar_t>(PTR_TRANS(cvt))); 385 #if defined(BOOST_LOCALE_ENABLE_CHAR16_T) && !defined(BOOST_NO_CHAR16_T_CODECVT) 386 case char16_t_facet: 387 return std::locale(in,new code_converter<char16_t>(PTR_TRANS(cvt))); 388 #endif 389 #if defined(BOOST_LOCALE_ENABLE_CHAR32_T) && !defined(BOOST_NO_CHAR32_T_CODECVT) 390 case char32_t_facet: 391 return std::locale(in,new code_converter<char32_t>(PTR_TRANS(cvt))); 392 #endif 393 default: 394 return in; 395 } 396 } 397 398 399 /// 400 /// Install utf8 codecvt to UTF-16 or UTF-32 into locale \a in and return 401 /// new locale that is based on \a in and uses new facet. 402 /// create_utf8_codecvt(std::locale const & in,character_facet_type type)403 std::locale create_utf8_codecvt(std::locale const &in,character_facet_type type) 404 { 405 switch(type) { 406 case char_facet: 407 return std::locale(in,new utf8_codecvt<char>()); 408 case wchar_t_facet: 409 return std::locale(in,new utf8_codecvt<wchar_t>()); 410 #if defined(BOOST_LOCALE_ENABLE_CHAR16_T) && !defined(BOOST_NO_CHAR16_T_CODECVT) 411 case char16_t_facet: 412 return std::locale(in,new utf8_codecvt<char16_t>()); 413 #endif 414 #if defined(BOOST_LOCALE_ENABLE_CHAR32_T) && !defined(BOOST_NO_CHAR32_T_CODECVT) 415 case char32_t_facet: 416 return std::locale(in,new utf8_codecvt<char32_t>()); 417 #endif 418 default: 419 return in; 420 } 421 } 422 423 /// 424 /// This function installs codecvt that can be used for conversion between single byte 425 /// character encodings like ISO-8859-1, koi8-r, windows-1255 and Unicode code points, 426 /// 427 /// Throws invalid_charset_error if the chacater set is not supported or isn't single byte character 428 /// set create_simple_codecvt(std::locale const & in,std::string const & encoding,character_facet_type type)429 std::locale create_simple_codecvt(std::locale const &in,std::string const &encoding,character_facet_type type) 430 { 431 if(!check_is_simple_encoding(encoding)) 432 throw boost::locale::conv::invalid_charset_error("Invalid simple encoding " + encoding); 433 434 switch(type) { 435 case char_facet: 436 return std::locale(in,new simple_codecvt<char>(encoding)); 437 case wchar_t_facet: 438 return std::locale(in,new simple_codecvt<wchar_t>(encoding)); 439 #if defined(BOOST_LOCALE_ENABLE_CHAR16_T) && !defined(BOOST_NO_CHAR16_T_CODECVT) 440 case char16_t_facet: 441 return std::locale(in,new simple_codecvt<char16_t>(encoding)); 442 #endif 443 #if defined(BOOST_LOCALE_ENABLE_CHAR32_T) && !defined(BOOST_NO_CHAR32_T_CODECVT) 444 case char32_t_facet: 445 return std::locale(in,new simple_codecvt<char32_t>(encoding)); 446 #endif 447 default: 448 return in; 449 } 450 } 451 452 453 454 } // util 455 } // locale 456 } // boost 457 458 // vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4 459