///////////////////////////////////////////////////////////////////////////////
// detail/dynamic/parser_traits.hpp
//
//  Copyright 2008 Eric Niebler. Distributed under the Boost
//  Software License, Version 1.0. (See accompanying file
//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)

#ifndef BOOST_XPRESSIVE_DETAIL_DYNAMIC_PARSER_TRAITS_HPP_EAN_10_04_2005
#define BOOST_XPRESSIVE_DETAIL_DYNAMIC_PARSER_TRAITS_HPP_EAN_10_04_2005

// MS compatible compilers support #pragma once
#if defined(_MSC_VER)
# pragma once
#endif

#include <string>
#include <climits>
#include <limits>
#include <boost/config.hpp>
#include <boost/assert.hpp>
#include <boost/throw_exception.hpp>
#include <boost/xpressive/regex_error.hpp>
#include <boost/xpressive/regex_traits.hpp>
#include <boost/xpressive/detail/detail_fwd.hpp>
#include <boost/xpressive/detail/dynamic/matchable.hpp>
#include <boost/xpressive/detail/dynamic/parser_enum.hpp>
#include <boost/xpressive/detail/utility/literals.hpp>
#include <boost/xpressive/detail/utility/algorithm.hpp>

namespace boost { namespace xpressive
{

///////////////////////////////////////////////////////////////////////////////
// compiler_traits
//   this works for char and wchar_t. it must be specialized for anything else.
35 // 36 template<typename RegexTraits> 37 struct compiler_traits 38 { 39 typedef RegexTraits regex_traits; 40 typedef typename regex_traits::char_type char_type; 41 typedef typename regex_traits::string_type string_type; 42 typedef typename regex_traits::locale_type locale_type; 43 44 /////////////////////////////////////////////////////////////////////////////// 45 // constructor compiler_traitsboost::xpressive::compiler_traits46 explicit compiler_traits(RegexTraits const &traits = RegexTraits()) 47 : traits_(traits) 48 , flags_(regex_constants::ECMAScript) 49 , space_(lookup_classname(traits_, "space")) 50 , alnum_(lookup_classname(traits_, "alnum")) 51 { 52 } 53 54 /////////////////////////////////////////////////////////////////////////////// 55 // flags flagsboost::xpressive::compiler_traits56 regex_constants::syntax_option_type flags() const 57 { 58 return this->flags_; 59 } 60 61 /////////////////////////////////////////////////////////////////////////////// 62 // flags flagsboost::xpressive::compiler_traits63 void flags(regex_constants::syntax_option_type flags) 64 { 65 this->flags_ = flags; 66 } 67 68 /////////////////////////////////////////////////////////////////////////////// 69 // traits traitsboost::xpressive::compiler_traits70 regex_traits &traits() 71 { 72 return this->traits_; 73 } 74 traitsboost::xpressive::compiler_traits75 regex_traits const &traits() const 76 { 77 return this->traits_; 78 } 79 80 /////////////////////////////////////////////////////////////////////////////// 81 // imbue imbueboost::xpressive::compiler_traits82 locale_type imbue(locale_type const &loc) 83 { 84 locale_type oldloc = this->traits().imbue(loc); 85 this->space_ = lookup_classname(this->traits(), "space"); 86 this->alnum_ = lookup_classname(this->traits(), "alnum"); 87 return oldloc; 88 } 89 90 /////////////////////////////////////////////////////////////////////////////// 91 // getloc getlocboost::xpressive::compiler_traits92 locale_type getloc() const 93 { 94 
return this->traits().getloc(); 95 } 96 97 /////////////////////////////////////////////////////////////////////////////// 98 // get_token 99 // get a token and advance the iterator 100 template<typename FwdIter> get_tokenboost::xpressive::compiler_traits101 regex_constants::compiler_token_type get_token(FwdIter &begin, FwdIter end) 102 { 103 using namespace regex_constants; 104 if(this->eat_ws_(begin, end) == end) 105 { 106 return regex_constants::token_end_of_pattern; 107 } 108 109 switch(*begin) 110 { 111 case BOOST_XPR_CHAR_(char_type, '\\'): return this->get_escape_token(++begin, end); 112 case BOOST_XPR_CHAR_(char_type, '.'): ++begin; return token_any; 113 case BOOST_XPR_CHAR_(char_type, '^'): ++begin; return token_assert_begin_line; 114 case BOOST_XPR_CHAR_(char_type, '$'): ++begin; return token_assert_end_line; 115 case BOOST_XPR_CHAR_(char_type, '('): ++begin; return token_group_begin; 116 case BOOST_XPR_CHAR_(char_type, ')'): ++begin; return token_group_end; 117 case BOOST_XPR_CHAR_(char_type, '|'): ++begin; return token_alternate; 118 case BOOST_XPR_CHAR_(char_type, '['): ++begin; return token_charset_begin; 119 120 case BOOST_XPR_CHAR_(char_type, '*'): 121 case BOOST_XPR_CHAR_(char_type, '+'): 122 case BOOST_XPR_CHAR_(char_type, '?'): 123 return token_invalid_quantifier; 124 125 case BOOST_XPR_CHAR_(char_type, ']'): 126 case BOOST_XPR_CHAR_(char_type, '{'): 127 default: 128 return token_literal; 129 } 130 } 131 132 /////////////////////////////////////////////////////////////////////////////// 133 // get_quant_spec 134 template<typename FwdIter> get_quant_specboost::xpressive::compiler_traits135 bool get_quant_spec(FwdIter &begin, FwdIter end, detail::quant_spec &spec) 136 { 137 using namespace regex_constants; 138 FwdIter old_begin; 139 140 if(this->eat_ws_(begin, end) == end) 141 { 142 return false; 143 } 144 145 switch(*begin) 146 { 147 case BOOST_XPR_CHAR_(char_type, '*'): 148 spec.min_ = 0; 149 spec.max_ = (std::numeric_limits<unsigned 
int>::max)(); 150 break; 151 152 case BOOST_XPR_CHAR_(char_type, '+'): 153 spec.min_ = 1; 154 spec.max_ = (std::numeric_limits<unsigned int>::max)(); 155 break; 156 157 case BOOST_XPR_CHAR_(char_type, '?'): 158 spec.min_ = 0; 159 spec.max_ = 1; 160 break; 161 162 case BOOST_XPR_CHAR_(char_type, '{'): 163 old_begin = this->eat_ws_(++begin, end); 164 spec.min_ = spec.max_ = detail::toi(begin, end, this->traits()); 165 BOOST_XPR_ENSURE_ 166 ( 167 begin != old_begin && begin != end, error_brace, "invalid quantifier" 168 ); 169 170 if(*begin == BOOST_XPR_CHAR_(char_type, ',')) 171 { 172 old_begin = this->eat_ws_(++begin, end); 173 spec.max_ = detail::toi(begin, end, this->traits()); 174 BOOST_XPR_ENSURE_ 175 ( 176 begin != end && BOOST_XPR_CHAR_(char_type, '}') == *begin 177 , error_brace, "invalid quantifier" 178 ); 179 180 if(begin == old_begin) 181 { 182 spec.max_ = (std::numeric_limits<unsigned int>::max)(); 183 } 184 else 185 { 186 BOOST_XPR_ENSURE_ 187 ( 188 spec.min_ <= spec.max_, error_badbrace, "invalid quantification range" 189 ); 190 } 191 } 192 else 193 { 194 BOOST_XPR_ENSURE_ 195 ( 196 BOOST_XPR_CHAR_(char_type, '}') == *begin, error_brace, "invalid quantifier" 197 ); 198 } 199 break; 200 201 default: 202 return false; 203 } 204 205 spec.greedy_ = true; 206 if(this->eat_ws_(++begin, end) != end && BOOST_XPR_CHAR_(char_type, '?') == *begin) 207 { 208 ++begin; 209 spec.greedy_ = false; 210 } 211 212 return true; 213 } 214 215 /////////////////////////////////////////////////////////////////////////// 216 // get_group_type 217 template<typename FwdIter> get_group_typeboost::xpressive::compiler_traits218 regex_constants::compiler_token_type get_group_type(FwdIter &begin, FwdIter end, string_type &name) 219 { 220 using namespace regex_constants; 221 if(this->eat_ws_(begin, end) != end && BOOST_XPR_CHAR_(char_type, '?') == *begin) 222 { 223 this->eat_ws_(++begin, end); 224 BOOST_XPR_ENSURE_(begin != end, error_paren, "incomplete extension"); 225 226 
switch(*begin) 227 { 228 case BOOST_XPR_CHAR_(char_type, ':'): ++begin; return token_no_mark; 229 case BOOST_XPR_CHAR_(char_type, '>'): ++begin; return token_independent_sub_expression; 230 case BOOST_XPR_CHAR_(char_type, '#'): ++begin; return token_comment; 231 case BOOST_XPR_CHAR_(char_type, '='): ++begin; return token_positive_lookahead; 232 case BOOST_XPR_CHAR_(char_type, '!'): ++begin; return token_negative_lookahead; 233 case BOOST_XPR_CHAR_(char_type, 'R'): ++begin; return token_recurse; 234 case BOOST_XPR_CHAR_(char_type, '$'): 235 this->get_name_(++begin, end, name); 236 BOOST_XPR_ENSURE_(begin != end, error_paren, "incomplete extension"); 237 if(BOOST_XPR_CHAR_(char_type, '=') == *begin) 238 { 239 ++begin; 240 return token_rule_assign; 241 } 242 return token_rule_ref; 243 244 case BOOST_XPR_CHAR_(char_type, '<'): 245 this->eat_ws_(++begin, end); 246 BOOST_XPR_ENSURE_(begin != end, error_paren, "incomplete extension"); 247 switch(*begin) 248 { 249 case BOOST_XPR_CHAR_(char_type, '='): ++begin; return token_positive_lookbehind; 250 case BOOST_XPR_CHAR_(char_type, '!'): ++begin; return token_negative_lookbehind; 251 default: 252 BOOST_THROW_EXCEPTION(regex_error(error_badbrace, "unrecognized extension")); 253 } 254 255 case BOOST_XPR_CHAR_(char_type, 'P'): 256 this->eat_ws_(++begin, end); 257 BOOST_XPR_ENSURE_(begin != end, error_paren, "incomplete extension"); 258 switch(*begin) 259 { 260 case BOOST_XPR_CHAR_(char_type, '<'): 261 this->get_name_(++begin, end, name); 262 BOOST_XPR_ENSURE_(begin != end && BOOST_XPR_CHAR_(char_type, '>') == *begin++, error_paren, "incomplete extension"); 263 return token_named_mark; 264 case BOOST_XPR_CHAR_(char_type, '='): 265 this->get_name_(++begin, end, name); 266 BOOST_XPR_ENSURE_(begin != end, error_paren, "incomplete extension"); 267 return token_named_mark_ref; 268 default: 269 BOOST_THROW_EXCEPTION(regex_error(error_badbrace, "unrecognized extension")); 270 } 271 272 case BOOST_XPR_CHAR_(char_type, 'i'): 273 case 
BOOST_XPR_CHAR_(char_type, 'm'): 274 case BOOST_XPR_CHAR_(char_type, 's'): 275 case BOOST_XPR_CHAR_(char_type, 'x'): 276 case BOOST_XPR_CHAR_(char_type, '-'): 277 return this->parse_mods_(begin, end); 278 279 default: 280 BOOST_THROW_EXCEPTION(regex_error(error_badbrace, "unrecognized extension")); 281 } 282 } 283 284 return token_literal; 285 } 286 287 ////////////////////////////////////////////////////////////////////////// 288 // get_charset_token 289 // NOTE: white-space is *never* ignored in a charset. 290 template<typename FwdIter> get_charset_tokenboost::xpressive::compiler_traits291 regex_constants::compiler_token_type get_charset_token(FwdIter &begin, FwdIter end) 292 { 293 using namespace regex_constants; 294 BOOST_ASSERT(begin != end); 295 switch(*begin) 296 { 297 case BOOST_XPR_CHAR_(char_type, '^'): ++begin; return token_charset_invert; 298 case BOOST_XPR_CHAR_(char_type, '-'): ++begin; return token_charset_hyphen; 299 case BOOST_XPR_CHAR_(char_type, ']'): ++begin; return token_charset_end; 300 case BOOST_XPR_CHAR_(char_type, '['): 301 { 302 FwdIter next = begin; ++next; 303 if(next != end) 304 { 305 BOOST_XPR_ENSURE_( 306 *next != BOOST_XPR_CHAR_(char_type, '=') 307 , error_collate 308 , "equivalence classes are not yet supported" 309 ); 310 311 BOOST_XPR_ENSURE_( 312 *next != BOOST_XPR_CHAR_(char_type, '.') 313 , error_collate 314 , "collation sequences are not yet supported" 315 ); 316 317 if(*next == BOOST_XPR_CHAR_(char_type, ':')) 318 { 319 begin = ++next; 320 return token_posix_charset_begin; 321 } 322 } 323 } 324 break; 325 case BOOST_XPR_CHAR_(char_type, ':'): 326 { 327 FwdIter next = begin; ++next; 328 if(next != end && *next == BOOST_XPR_CHAR_(char_type, ']')) 329 { 330 begin = ++next; 331 return token_posix_charset_end; 332 } 333 } 334 break; 335 case BOOST_XPR_CHAR_(char_type, '\\'): 336 if(++begin != end) 337 { 338 switch(*begin) 339 { 340 case BOOST_XPR_CHAR_(char_type, 'b'): ++begin; return token_charset_backspace; 341 default:; 342 } 
343 } 344 return token_escape; 345 default:; 346 } 347 return token_literal; 348 } 349 350 ////////////////////////////////////////////////////////////////////////// 351 // get_escape_token 352 template<typename FwdIter> get_escape_tokenboost::xpressive::compiler_traits353 regex_constants::compiler_token_type get_escape_token(FwdIter &begin, FwdIter end) 354 { 355 using namespace regex_constants; 356 if(begin != end) 357 { 358 switch(*begin) 359 { 360 //case BOOST_XPR_CHAR_(char_type, 'a'): ++begin; return token_escape_bell; 361 //case BOOST_XPR_CHAR_(char_type, 'c'): ++begin; return token_escape_control; 362 //case BOOST_XPR_CHAR_(char_type, 'e'): ++begin; return token_escape_escape; 363 //case BOOST_XPR_CHAR_(char_type, 'f'): ++begin; return token_escape_formfeed; 364 //case BOOST_XPR_CHAR_(char_type, 'n'): ++begin; return token_escape_newline; 365 //case BOOST_XPR_CHAR_(char_type, 't'): ++begin; return token_escape_horizontal_tab; 366 //case BOOST_XPR_CHAR_(char_type, 'v'): ++begin; return token_escape_vertical_tab; 367 case BOOST_XPR_CHAR_(char_type, 'A'): ++begin; return token_assert_begin_sequence; 368 case BOOST_XPR_CHAR_(char_type, 'b'): ++begin; return token_assert_word_boundary; 369 case BOOST_XPR_CHAR_(char_type, 'B'): ++begin; return token_assert_not_word_boundary; 370 case BOOST_XPR_CHAR_(char_type, 'E'): ++begin; return token_quote_meta_end; 371 case BOOST_XPR_CHAR_(char_type, 'Q'): ++begin; return token_quote_meta_begin; 372 case BOOST_XPR_CHAR_(char_type, 'Z'): ++begin; return token_assert_end_sequence; 373 // Non-standard extension to ECMAScript syntax 374 case BOOST_XPR_CHAR_(char_type, '<'): ++begin; return token_assert_word_begin; 375 case BOOST_XPR_CHAR_(char_type, '>'): ++begin; return token_assert_word_end; 376 default:; // fall-through 377 } 378 } 379 380 return token_escape; 381 } 382 383 private: 384 385 ////////////////////////////////////////////////////////////////////////// 386 // parse_mods_ 387 template<typename FwdIter> 
parse_mods_boost::xpressive::compiler_traits388 regex_constants::compiler_token_type parse_mods_(FwdIter &begin, FwdIter end) 389 { 390 using namespace regex_constants; 391 bool set = true; 392 do switch(*begin) 393 { 394 case BOOST_XPR_CHAR_(char_type, 'i'): this->flag_(set, icase_); break; 395 case BOOST_XPR_CHAR_(char_type, 'm'): this->flag_(!set, single_line); break; 396 case BOOST_XPR_CHAR_(char_type, 's'): this->flag_(!set, not_dot_newline); break; 397 case BOOST_XPR_CHAR_(char_type, 'x'): this->flag_(set, ignore_white_space); break; 398 case BOOST_XPR_CHAR_(char_type, ':'): ++begin; BOOST_FALLTHROUGH; 399 case BOOST_XPR_CHAR_(char_type, ')'): return token_no_mark; 400 case BOOST_XPR_CHAR_(char_type, '-'): if(false == (set = !set)) break; BOOST_FALLTHROUGH; 401 default: BOOST_THROW_EXCEPTION(regex_error(error_paren, "unknown pattern modifier")); 402 } 403 while(BOOST_XPR_ENSURE_(++begin != end, error_paren, "incomplete extension")); 404 // this return is technically unreachable, but this must 405 // be here to work around a bug in gcc 4.0 406 return token_no_mark; 407 } 408 409 /////////////////////////////////////////////////////////////////////////////// 410 // flag_ flag_boost::xpressive::compiler_traits411 void flag_(bool set, regex_constants::syntax_option_type flag) 412 { 413 this->flags_ = set ? 
(this->flags_ | flag) : (this->flags_ & ~flag); 414 } 415 416 /////////////////////////////////////////////////////////////////////////// 417 // is_space_ is_space_boost::xpressive::compiler_traits418 bool is_space_(char_type ch) const 419 { 420 return 0 != this->space_ && this->traits().isctype(ch, this->space_); 421 } 422 423 /////////////////////////////////////////////////////////////////////////// 424 // is_alnum_ is_alnum_boost::xpressive::compiler_traits425 bool is_alnum_(char_type ch) const 426 { 427 return 0 != this->alnum_ && this->traits().isctype(ch, this->alnum_); 428 } 429 430 /////////////////////////////////////////////////////////////////////////// 431 // get_name_ 432 template<typename FwdIter> get_name_boost::xpressive::compiler_traits433 void get_name_(FwdIter &begin, FwdIter end, string_type &name) 434 { 435 this->eat_ws_(begin, end); 436 for(name.clear(); begin != end && this->is_alnum_(*begin); ++begin) 437 { 438 name.push_back(*begin); 439 } 440 this->eat_ws_(begin, end); 441 BOOST_XPR_ENSURE_(!name.empty(), regex_constants::error_paren, "incomplete extension"); 442 } 443 444 /////////////////////////////////////////////////////////////////////////////// 445 // eat_ws_ 446 template<typename FwdIter> eat_ws_boost::xpressive::compiler_traits447 FwdIter &eat_ws_(FwdIter &begin, FwdIter end) 448 { 449 if(0 != (regex_constants::ignore_white_space & this->flags())) 450 { 451 while(end != begin && (BOOST_XPR_CHAR_(char_type, '#') == *begin || this->is_space_(*begin))) 452 { 453 if(BOOST_XPR_CHAR_(char_type, '#') == *begin++) 454 { 455 while(end != begin && BOOST_XPR_CHAR_(char_type, '\n') != *begin++) {} 456 } 457 else 458 { 459 for(; end != begin && this->is_space_(*begin); ++begin) {} 460 } 461 } 462 } 463 464 return begin; 465 } 466 467 regex_traits traits_; 468 regex_constants::syntax_option_type flags_; 469 typename regex_traits::char_class_type space_; 470 typename regex_traits::char_class_type alnum_; 471 }; 472 473 }} // namespace 
boost::xpressive 474 475 #endif 476