1[/ 2 Copyright 2006-2007 John Maddock. 3 Distributed under the Boost Software License, Version 1.0. 4 (See accompanying file LICENSE_1_0.txt or copy at 5 http://www.boost.org/LICENSE_1_0.txt). 6] 7 8 9[section:icu Working With Unicode and ICU String Types] 10 11[section:intro Introduction to using Regex with ICU] 12 13The header: 14 15 <boost/regex/icu.hpp> 16 17contains the data types and algorithms necessary for working with regular 18expressions in a Unicode aware environment. 19 20In order to use this header you will need the 21[@http://www.ibm.com/software/globalization/icu/ ICU library], and you will need 22to have built the Boost.Regex library with 23[link boost_regex.install.building_with_unicode_and_icu_su ICU support enabled]. 24 25The header will enable you to: 26 27* Create regular expressions that treat Unicode strings as sequences of UTF-32 code points. 28* Create regular expressions that support various Unicode data properties, including character classification. 29* Transparently search Unicode strings that are encoded as either UTF-8, UTF-16 or UTF-32. 30 31[endsect] 32 33[section:unicode_types Unicode regular expression types] 34 35Header `<boost/regex/icu.hpp>` provides a regular expression traits class that 36handles UTF-32 characters: 37 38 class icu_regex_traits; 39 40and a regular expression type based upon that: 41 42 typedef basic_regex<UChar32,icu_regex_traits> u32regex; 43 44The type `u32regex` is the regular expression type to use for all Unicode 45regular expressions; internally it uses UTF-32 code points, but can be 46created from, and used to search, either UTF-8 or UTF-16 encoded strings 47as well as UTF-32 ones. 
48 49The constructors, and assign member functions of `u32regex`, require UTF-32 50encoded strings, but there are a series of overloaded algorithms called 51`make_u32regex` which allow regular expressions to be created from 52UTF-8, UTF-16, or UTF-32 encoded strings: 53 54 template <class InputIterator> 55 u32regex make_u32regex(InputIterator i, 56 InputIterator j, 57 boost::regex_constants::syntax_option_type opt); 58 59[*Effects]: Creates a regular expression object from the iterator sequence \[i,j). 60The character encoding of the sequence is determined based upon sizeof(*i): 611 implies UTF-8, 2 implies UTF-16, and 4 implies UTF-32. 62 63 u32regex make_u32regex(const char* p, 64 boost::regex_constants::syntax_option_type opt 65 = boost::regex_constants::perl); 66 67[*Effects]: Creates a regular expression object from the Null-terminated 68UTF-8 character sequence /p/. 69 70 u32regex make_u32regex(const unsigned char* p, 71 boost::regex_constants::syntax_option_type opt 72 = boost::regex_constants::perl); 73 74[*Effects]: Creates a regular expression object from the Null-terminated UTF-8 character sequence p. 75 76 u32regex make_u32regex(const wchar_t* p, 77 boost::regex_constants::syntax_option_type opt 78 = boost::regex_constants::perl); 79 80[*Effects]: Creates a regular expression object from the Null-terminated character sequence p. The character encoding of the sequence is determined based upon sizeof(wchar_t): 1 implies UTF-8, 2 implies UTF-16, and 4 implies UTF-32. 81 82 u32regex make_u32regex(const UChar* p, 83 boost::regex_constants::syntax_option_type opt 84 = boost::regex_constants::perl); 85 86[*Effects]: Creates a regular expression object from the Null-terminated UTF-16 character sequence p. 
87 88 template<class C, class T, class A> 89 u32regex make_u32regex(const std::basic_string<C, T, A>& s, 90 boost::regex_constants::syntax_option_type opt 91 = boost::regex_constants::perl); 92 93[*Effects]: Creates a regular expression object from the string s. The character encoding of the string is determined based upon sizeof(C): 1 implies UTF-8, 2 implies UTF-16, and 4 implies UTF-32. 94 95 u32regex make_u32regex(const UnicodeString& s, 96 boost::regex_constants::syntax_option_type opt 97 = boost::regex_constants::perl); 98 99[*Effects]: Creates a regular expression object from the UTF-16 encoded string s. 100 101[endsect] 102 103[section:unicode_algo Unicode Regular Expression Algorithms] 104 105The regular expression algorithms [regex_match], [regex_search] and [regex_replace] 106all expect that the character sequence upon which they operate 107is encoded in the same character encoding as the regular expression object 108with which they are used. For Unicode regular expressions that behavior is 109undesirable: while we may want to process the data in UTF-32 "chunks", the 110actual data is much more likely to be encoded as either UTF-8 or UTF-16. 111Therefore the header `<boost/regex/icu.hpp>` provides a series of thin wrappers 112around these algorithms, called `u32regex_match`, `u32regex_search`, and 113`u32regex_replace`. These wrappers use iterator-adapters internally to 114make external UTF-8 or UTF-16 data look as though it's really a UTF-32 sequence, 115that can then be passed on to the "real" algorithm. 116 117[h4 u32regex_match] 118 119For each [regex_match] algorithm defined by `<boost/regex.hpp>`, then 120`<boost/regex/icu.hpp>` defines an overloaded algorithm that takes the 121same arguments, but which is called `u32regex_match`, and which will 122accept UTF-8, UTF-16 or UTF-32 encoded data, as well as an 123ICU UnicodeString as input. 
124 125Example: match a password, encoded in a UTF-16 UnicodeString: 126 127 // 128 // Find out if *password* meets our password requirements, 129 // as defined by the regular expression *requirements*. 130 // 131 bool is_valid_password(const UnicodeString& password, const UnicodeString& requirements) 132 { 133 return boost::u32regex_match(password, boost::make_u32regex(requirements)); 134 } 135 136Example: match a UTF-8 encoded filename: 137 138 // 139 // Extract filename part of a path from a UTF-8 encoded std::string and return the result 140 // as another std::string: 141 // 142 std::string get_filename(const std::string& path) 143 { 144 boost::u32regex r = boost::make_u32regex("(?:\\A|.*\\\\)([^\\\\]+)"); 145 boost::smatch what; 146 if(boost::u32regex_match(path, what, r)) 147 { 148 // extract $1 as a std::string: 149 return what.str(1); 150 } 151 else 152 { 153 throw std::runtime_error("Invalid pathname"); 154 } 155 } 156 157[h4 u32regex_search] 158 159For each [regex_search] algorithm defined by `<boost/regex.hpp>`, then 160`<boost/regex/icu.hpp>` defines an overloaded algorithm that takes the 161same arguments, but which is called `u32regex_search`, and which will 162accept UTF-8, UTF-16 or UTF-32 encoded data, as well as an ICU 163UnicodeString as input. 164 165Example: search for a character sequence in a specific language block: 166 167 UnicodeString extract_greek(const UnicodeString& text) 168 { 169 // searches through some UTF-16 encoded text for a block encoded in Greek, 170 // this expression is imperfect, but the best we can do for now - searching 171 // for specific scripts is actually pretty hard to do right. 172 // 173 // Here we search for a character sequence that begins with a Greek letter, 174 // and continues with characters that are either not-letters ( [^[:L*:]] ) 175 // or are characters in the Greek character block ( [\\x{370}-\\x{3FF}] ). 
176 // 177 boost::u32regex r = boost::make_u32regex( 178 L"[\\x{370}-\\x{3FF}](?:[^[:L*:]]|[\\x{370}-\\x{3FF}])*"); 179 boost::u16match what; 180 if(boost::u32regex_search(text, what, r)) 181 { 182 // extract $0 as a UnicodeString: 183 return UnicodeString(what[0].first, what.length(0)); 184 } 185 else 186 { 187 throw std::runtime_error("No Greek found!"); 188 } 189 } 190 191[h4 u32regex_replace] 192 193For each [regex_replace] algorithm defined by `<boost/regex.hpp>`, then 194`<boost/regex/icu.hpp>` defines an overloaded algorithm that takes 195the same arguments, but which is called `u32regex_replace`, and which will 196accept UTF-8, UTF-16 or UTF-32 encoded data, as well as an ICU 197UnicodeString as input. The input sequence and the format string specifier 198passed to the algorithm can be encoded independently (for example one can 199be in UTF-8, the other in UTF-16), but the result string / output iterator 200argument must use the same character encoding as the text being searched. 201 202Example: Credit card number reformatting: 203 204 // 205 // Take a credit card number as a string of digits, 206 // and reformat it as a human readable string with "-" 207 // separating each group of four digits; 208 // note that we're mixing a UTF-32 regex, with a UTF-16 209 // string and a UTF-8 format specifier, and it still all 210 // just works: 211 // 212 const boost::u32regex e = boost::make_u32regex( 213 "\\A(\\d{3,4})[- ]?(\\d{4})[- ]?(\\d{4})[- ]?(\\d{4})\\z"); 214 const char* human_format = "$1-$2-$3-$4"; 215 216 UnicodeString human_readable_card_number(const UnicodeString& s) 217 { 218 return boost::u32regex_replace(s, e, human_format); 219 } 220 221[endsect] 222[section:unicode_iter Unicode Aware Regex Iterators] 223 224[h4 u32regex_iterator] 225 226Type `u32regex_iterator` is in all respects the same as [regex_iterator] 227except that since the regular expression type is always `u32regex` 228it only takes one template parameter (the iterator type). 
It also calls 229`u32regex_search` internally, allowing it to interface correctly with 230UTF-8, UTF-16, and UTF-32 data: 231 232 template <class BidirectionalIterator> 233 class u32regex_iterator 234 { 235 // for members see regex_iterator 236 }; 237 238 typedef u32regex_iterator<const char*> utf8regex_iterator; 239 typedef u32regex_iterator<const UChar*> utf16regex_iterator; 240 typedef u32regex_iterator<const UChar32*> utf32regex_iterator; 241 242In order to simplify the construction of a `u32regex_iterator` from a string, 243there are a series of non-member helper functions called make_u32regex_iterator: 244 245 u32regex_iterator<const char*> 246 make_u32regex_iterator(const char* s, 247 const u32regex& e, 248 regex_constants::match_flag_type m = regex_constants::match_default); 249 250 u32regex_iterator<const wchar_t*> 251 make_u32regex_iterator(const wchar_t* s, 252 const u32regex& e, 253 regex_constants::match_flag_type m = regex_constants::match_default); 254 255 u32regex_iterator<const UChar*> 256 make_u32regex_iterator(const UChar* s, 257 const u32regex& e, 258 regex_constants::match_flag_type m = regex_constants::match_default); 259 260 template <class charT, class Traits, class Alloc> 261 u32regex_iterator<typename std::basic_string<charT, Traits, Alloc>::const_iterator> 262 make_u32regex_iterator(const std::basic_string<charT, Traits, Alloc>& s, 263 const u32regex& e, 264 regex_constants::match_flag_type m = regex_constants::match_default); 265 266 u32regex_iterator<const UChar*> 267 make_u32regex_iterator(const UnicodeString& s, 268 const u32regex& e, 269 regex_constants::match_flag_type m = regex_constants::match_default); 270 271Each of these overloads returns an iterator that enumerates all occurrences 272of expression /e/, in text /s/, using match_flags /m/. 
273 274Example: search for international currency symbols, along with their associated numeric value: 275 276 void enumerate_currencies(const std::string& text) 277 { 278 // enumerate and print all the currency symbols, along 279 // with any associated numeric values: 280 const char* re = 281 "([[:Sc:]][[:Cf:][:Cc:][:Z*:]]*)?" 282 "([[:Nd:]]+(?:[[:Po:]][[:Nd:]]+)?)?" 283 "(?(1)" 284 "|(?(2)" 285 "[[:Cf:][:Cc:][:Z*:]]*" 286 ")" 287 "[[:Sc:]]" 288 ")"; 289 boost::u32regex r = boost::make_u32regex(re); 290 boost::u32regex_iterator<std::string::const_iterator> 291 i(boost::make_u32regex_iterator(text, r)), j; 292 while(i != j) 293 { 294 std::cout << (*i)[0] << std::endl; 295 ++i; 296 } 297 } 298 299Calling 300 301[/this doesn't format correctly as code:] 302[pre enumerate_currencies(" $100.23 or '''£'''198.12 ");] 303 304Yields the output: 305 306[pre 307$100.23 308'''£'''198.12 309] 310 311Provided of course that the input is encoded as UTF-8. 312 313[h4 u32regex_token_iterator] 314 315Type `u32regex_token_iterator` is in all respects the same as [regex_token_iterator] 316except that since the regular expression type is always `u32regex` it only 317takes one template parameter (the iterator type). 
It also calls 318`u32regex_search` internally, allowing it to interface correctly with UTF-8, 319UTF-16, and UTF-32 data: 320 321 template <class BidirectionalIterator> 322 class u32regex_token_iterator 323 { 324 // for members see regex_token_iterator 325 }; 326 327 typedef u32regex_token_iterator<const char*> utf8regex_token_iterator; 328 typedef u32regex_token_iterator<const UChar*> utf16regex_token_iterator; 329 typedef u32regex_token_iterator<const UChar32*> utf32regex_token_iterator; 330 331In order to simplify the construction of a `u32regex_token_iterator` from a string, 332there are a series of non-member helper functions called `make_u32regex_token_iterator`: 333 334 u32regex_token_iterator<const char*> 335 make_u32regex_token_iterator( 336 const char* s, 337 const u32regex& e, 338 int sub, 339 regex_constants::match_flag_type m = regex_constants::match_default); 340 341 u32regex_token_iterator<const wchar_t*> 342 make_u32regex_token_iterator( 343 const wchar_t* s, 344 const u32regex& e, 345 int sub, 346 regex_constants::match_flag_type m = regex_constants::match_default); 347 348 u32regex_token_iterator<const UChar*> 349 make_u32regex_token_iterator( 350 const UChar* s, 351 const u32regex& e, 352 int sub, 353 regex_constants::match_flag_type m = regex_constants::match_default); 354 355 template <class charT, class Traits, class Alloc> 356 u32regex_token_iterator<typename std::basic_string<charT, Traits, Alloc>::const_iterator> 357 make_u32regex_token_iterator( 358 const std::basic_string<charT, Traits, Alloc>& s, 359 const u32regex& e, 360 int sub, 361 regex_constants::match_flag_type m = regex_constants::match_default); 362 363 u32regex_token_iterator<const UChar*> 364 make_u32regex_token_iterator( 365 const UnicodeString& s, 366 const u32regex& e, 367 int sub, 368 regex_constants::match_flag_type m = regex_constants::match_default); 369 370Each of these overloads returns an iterator that enumerates all occurrences of 371marked sub-expression sub in 
regular expression /e/, found in text /s/, using 372match_flags /m/. 373 374 template <std::size_t N> 375 u32regex_token_iterator<const char*> 376 make_u32regex_token_iterator( 377 const char* p, 378 const u32regex& e, 379 const int (&submatch)[N], 380 regex_constants::match_flag_type m = regex_constants::match_default); 381 382 template <std::size_t N> 383 u32regex_token_iterator<const wchar_t*> 384 make_u32regex_token_iterator( 385 const wchar_t* p, 386 const u32regex& e, 387 const int (&submatch)[N], 388 regex_constants::match_flag_type m = regex_constants::match_default); 389 390 template <std::size_t N> 391 u32regex_token_iterator<const UChar*> 392 make_u32regex_token_iterator( 393 const UChar* p, 394 const u32regex& e, 395 const int (&submatch)[N], 396 regex_constants::match_flag_type m = regex_constants::match_default); 397 398 template <class charT, class Traits, class Alloc, std::size_t N> 399 u32regex_token_iterator<typename std::basic_string<charT, Traits, Alloc>::const_iterator> 400 make_u32regex_token_iterator( 401 const std::basic_string<charT, Traits, Alloc>& p, 402 const u32regex& e, 403 const int (&submatch)[N], 404 regex_constants::match_flag_type m = regex_constants::match_default); 405 406 template <std::size_t N> 407 u32regex_token_iterator<const UChar*> 408 make_u32regex_token_iterator( 409 const UnicodeString& s, 410 const u32regex& e, 411 const int (&submatch)[N], 412 regex_constants::match_flag_type m = regex_constants::match_default); 413 414Each of these overloads returns an iterator that enumerates one sub-expression 415for each submatch in regular expression /e/, found in text /s/, using match_flags /m/. 
416 417 u32regex_token_iterator<const char*> 418 make_u32regex_token_iterator( 419 const char* p, 420 const u32regex& e, 421 const std::vector<int>& submatch, 422 regex_constants::match_flag_type m = regex_constants::match_default); 423 424 u32regex_token_iterator<const wchar_t*> 425 make_u32regex_token_iterator( 426 const wchar_t* p, 427 const u32regex& e, 428 const std::vector<int>& submatch, 429 regex_constants::match_flag_type m = regex_constants::match_default); 430 431 u32regex_token_iterator<const UChar*> 432 make_u32regex_token_iterator( 433 const UChar* p, 434 const u32regex& e, 435 const std::vector<int>& submatch, 436 regex_constants::match_flag_type m = regex_constants::match_default); 437 438 template <class charT, class Traits, class Alloc> 439 u32regex_token_iterator<typename std::basic_string<charT, Traits, Alloc>::const_iterator> 440 make_u32regex_token_iterator( 441 const std::basic_string<charT, Traits, Alloc>& p, 442 const u32regex& e, 443 const std::vector<int>& submatch, 444 regex_constants::match_flag_type m = regex_constants::match_default); 445 446 u32regex_token_iterator<const UChar*> 447 make_u32regex_token_iterator( 448 const UnicodeString& s, 449 const u32regex& e, 450 const std::vector<int>& submatch, 451 regex_constants::match_flag_type m = regex_constants::match_default); 452 453Each of these overloads returns an iterator that enumerates one sub-expression for 454each submatch in regular expression /e/, found in text /s/, using match_flags /m/. 455 456Example: search for international currency symbols, along with their associated numeric value: 457 458 void enumerate_currencies2(const std::string& text) 459 { 460 // enumerate and print all the currency symbols, along 461 // with any associated numeric values: 462 const char* re = 463 "([[:Sc:]][[:Cf:][:Cc:][:Z*:]]*)?" 464 "([[:Nd:]]+(?:[[:Po:]][[:Nd:]]+)?)?" 
465 "(?(1)" 466 "|(?(2)" 467 "[[:Cf:][:Cc:][:Z*:]]*" 468 ")" 469 "[[:Sc:]]" 470 ")"; 471 boost::u32regex r = boost::make_u32regex(re); 472 boost::u32regex_token_iterator<std::string::const_iterator> 473 i(boost::make_u32regex_token_iterator(text, r, 1)), j; 474 while(i != j) 475 { 476 std::cout << *i << std::endl; 477 ++i; 478 } 479 } 480 481[endsect] 482 483[endsect] 484 485