• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
///////////////////////////////////////////////////////////////////////////////
// detail/dynamic/parser_traits.hpp
//
//  Copyright 2008 Eric Niebler. Distributed under the Boost
//  Software License, Version 1.0. (See accompanying file
//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
7 
8 #ifndef BOOST_XPRESSIVE_DETAIL_DYNAMIC_PARSER_TRAITS_HPP_EAN_10_04_2005
9 #define BOOST_XPRESSIVE_DETAIL_DYNAMIC_PARSER_TRAITS_HPP_EAN_10_04_2005
10 
11 // MS compatible compilers support #pragma once
12 #if defined(_MSC_VER)
13 # pragma once
14 #endif
15 
#include <climits>
#include <limits>
#include <string>
#include <boost/config.hpp>
#include <boost/assert.hpp>
#include <boost/throw_exception.hpp>
#include <boost/xpressive/regex_error.hpp>
#include <boost/xpressive/regex_traits.hpp>
#include <boost/xpressive/detail/detail_fwd.hpp>
#include <boost/xpressive/detail/dynamic/matchable.hpp>
#include <boost/xpressive/detail/dynamic/parser_enum.hpp>
#include <boost/xpressive/detail/utility/literals.hpp>
#include <boost/xpressive/detail/utility/algorithm.hpp>
28 
29 namespace boost { namespace xpressive
30 {
31 
32 ///////////////////////////////////////////////////////////////////////////////
33 // compiler_traits
34 //  this works for char and wchar_t. it must be specialized for anything else.
35 //
36 template<typename RegexTraits>
37 struct compiler_traits
38 {
39     typedef RegexTraits regex_traits;
40     typedef typename regex_traits::char_type char_type;
41     typedef typename regex_traits::string_type string_type;
42     typedef typename regex_traits::locale_type locale_type;
43 
44     ///////////////////////////////////////////////////////////////////////////////
45     // constructor
compiler_traitsboost::xpressive::compiler_traits46     explicit compiler_traits(RegexTraits const &traits = RegexTraits())
47       : traits_(traits)
48       , flags_(regex_constants::ECMAScript)
49       , space_(lookup_classname(traits_, "space"))
50       , alnum_(lookup_classname(traits_, "alnum"))
51     {
52     }
53 
54     ///////////////////////////////////////////////////////////////////////////////
55     // flags
flagsboost::xpressive::compiler_traits56     regex_constants::syntax_option_type flags() const
57     {
58         return this->flags_;
59     }
60 
61     ///////////////////////////////////////////////////////////////////////////////
62     // flags
flagsboost::xpressive::compiler_traits63     void flags(regex_constants::syntax_option_type flags)
64     {
65         this->flags_ = flags;
66     }
67 
68     ///////////////////////////////////////////////////////////////////////////////
69     // traits
traitsboost::xpressive::compiler_traits70     regex_traits &traits()
71     {
72         return this->traits_;
73     }
74 
traitsboost::xpressive::compiler_traits75     regex_traits const &traits() const
76     {
77         return this->traits_;
78     }
79 
80     ///////////////////////////////////////////////////////////////////////////////
81     // imbue
imbueboost::xpressive::compiler_traits82     locale_type imbue(locale_type const &loc)
83     {
84         locale_type oldloc = this->traits().imbue(loc);
85         this->space_ = lookup_classname(this->traits(), "space");
86         this->alnum_ = lookup_classname(this->traits(), "alnum");
87         return oldloc;
88     }
89 
90     ///////////////////////////////////////////////////////////////////////////////
91     // getloc
getlocboost::xpressive::compiler_traits92     locale_type getloc() const
93     {
94         return this->traits().getloc();
95     }
96 
97     ///////////////////////////////////////////////////////////////////////////////
98     // get_token
99     //  get a token and advance the iterator
100     template<typename FwdIter>
get_tokenboost::xpressive::compiler_traits101     regex_constants::compiler_token_type get_token(FwdIter &begin, FwdIter end)
102     {
103         using namespace regex_constants;
104         if(this->eat_ws_(begin, end) == end)
105         {
106             return regex_constants::token_end_of_pattern;
107         }
108 
109         switch(*begin)
110         {
111         case BOOST_XPR_CHAR_(char_type, '\\'): return this->get_escape_token(++begin, end);
112         case BOOST_XPR_CHAR_(char_type, '.'): ++begin; return token_any;
113         case BOOST_XPR_CHAR_(char_type, '^'): ++begin; return token_assert_begin_line;
114         case BOOST_XPR_CHAR_(char_type, '$'): ++begin; return token_assert_end_line;
115         case BOOST_XPR_CHAR_(char_type, '('): ++begin; return token_group_begin;
116         case BOOST_XPR_CHAR_(char_type, ')'): ++begin; return token_group_end;
117         case BOOST_XPR_CHAR_(char_type, '|'): ++begin; return token_alternate;
118         case BOOST_XPR_CHAR_(char_type, '['): ++begin; return token_charset_begin;
119 
120         case BOOST_XPR_CHAR_(char_type, '*'):
121         case BOOST_XPR_CHAR_(char_type, '+'):
122         case BOOST_XPR_CHAR_(char_type, '?'):
123             return token_invalid_quantifier;
124 
125         case BOOST_XPR_CHAR_(char_type, ']'):
126         case BOOST_XPR_CHAR_(char_type, '{'):
127         default:
128             return token_literal;
129         }
130     }
131 
132     ///////////////////////////////////////////////////////////////////////////////
133     // get_quant_spec
134     template<typename FwdIter>
get_quant_specboost::xpressive::compiler_traits135     bool get_quant_spec(FwdIter &begin, FwdIter end, detail::quant_spec &spec)
136     {
137         using namespace regex_constants;
138         FwdIter old_begin;
139 
140         if(this->eat_ws_(begin, end) == end)
141         {
142             return false;
143         }
144 
145         switch(*begin)
146         {
147         case BOOST_XPR_CHAR_(char_type, '*'):
148             spec.min_ = 0;
149             spec.max_ = (std::numeric_limits<unsigned int>::max)();
150             break;
151 
152         case BOOST_XPR_CHAR_(char_type, '+'):
153             spec.min_ = 1;
154             spec.max_ = (std::numeric_limits<unsigned int>::max)();
155             break;
156 
157         case BOOST_XPR_CHAR_(char_type, '?'):
158             spec.min_ = 0;
159             spec.max_ = 1;
160             break;
161 
162         case BOOST_XPR_CHAR_(char_type, '{'):
163             old_begin = this->eat_ws_(++begin, end);
164             spec.min_ = spec.max_ = detail::toi(begin, end, this->traits());
165             BOOST_XPR_ENSURE_
166             (
167                 begin != old_begin && begin != end, error_brace, "invalid quantifier"
168             );
169 
170             if(*begin == BOOST_XPR_CHAR_(char_type, ','))
171             {
172                 old_begin = this->eat_ws_(++begin, end);
173                 spec.max_ = detail::toi(begin, end, this->traits());
174                 BOOST_XPR_ENSURE_
175                 (
176                     begin != end && BOOST_XPR_CHAR_(char_type, '}') == *begin
177                   , error_brace, "invalid quantifier"
178                 );
179 
180                 if(begin == old_begin)
181                 {
182                     spec.max_ = (std::numeric_limits<unsigned int>::max)();
183                 }
184                 else
185                 {
186                     BOOST_XPR_ENSURE_
187                     (
188                         spec.min_ <= spec.max_, error_badbrace, "invalid quantification range"
189                     );
190                 }
191             }
192             else
193             {
194                 BOOST_XPR_ENSURE_
195                 (
196                     BOOST_XPR_CHAR_(char_type, '}') == *begin, error_brace, "invalid quantifier"
197                 );
198             }
199             break;
200 
201         default:
202             return false;
203         }
204 
205         spec.greedy_ = true;
206         if(this->eat_ws_(++begin, end) != end && BOOST_XPR_CHAR_(char_type, '?') == *begin)
207         {
208             ++begin;
209             spec.greedy_ = false;
210         }
211 
212         return true;
213     }
214 
215     ///////////////////////////////////////////////////////////////////////////
216     // get_group_type
217     template<typename FwdIter>
get_group_typeboost::xpressive::compiler_traits218     regex_constants::compiler_token_type get_group_type(FwdIter &begin, FwdIter end, string_type &name)
219     {
220         using namespace regex_constants;
221         if(this->eat_ws_(begin, end) != end && BOOST_XPR_CHAR_(char_type, '?') == *begin)
222         {
223             this->eat_ws_(++begin, end);
224             BOOST_XPR_ENSURE_(begin != end, error_paren, "incomplete extension");
225 
226             switch(*begin)
227             {
228             case BOOST_XPR_CHAR_(char_type, ':'): ++begin; return token_no_mark;
229             case BOOST_XPR_CHAR_(char_type, '>'): ++begin; return token_independent_sub_expression;
230             case BOOST_XPR_CHAR_(char_type, '#'): ++begin; return token_comment;
231             case BOOST_XPR_CHAR_(char_type, '='): ++begin; return token_positive_lookahead;
232             case BOOST_XPR_CHAR_(char_type, '!'): ++begin; return token_negative_lookahead;
233             case BOOST_XPR_CHAR_(char_type, 'R'): ++begin; return token_recurse;
234             case BOOST_XPR_CHAR_(char_type, '$'):
235                 this->get_name_(++begin, end, name);
236                 BOOST_XPR_ENSURE_(begin != end, error_paren, "incomplete extension");
237                 if(BOOST_XPR_CHAR_(char_type, '=') == *begin)
238                 {
239                     ++begin;
240                     return token_rule_assign;
241                 }
242                 return token_rule_ref;
243 
244             case BOOST_XPR_CHAR_(char_type, '<'):
245                 this->eat_ws_(++begin, end);
246                 BOOST_XPR_ENSURE_(begin != end, error_paren, "incomplete extension");
247                 switch(*begin)
248                 {
249                 case BOOST_XPR_CHAR_(char_type, '='): ++begin; return token_positive_lookbehind;
250                 case BOOST_XPR_CHAR_(char_type, '!'): ++begin; return token_negative_lookbehind;
251                 default:
252                     BOOST_THROW_EXCEPTION(regex_error(error_badbrace, "unrecognized extension"));
253                 }
254 
255             case BOOST_XPR_CHAR_(char_type, 'P'):
256                 this->eat_ws_(++begin, end);
257                 BOOST_XPR_ENSURE_(begin != end, error_paren, "incomplete extension");
258                 switch(*begin)
259                 {
260                 case BOOST_XPR_CHAR_(char_type, '<'):
261                     this->get_name_(++begin, end, name);
262                     BOOST_XPR_ENSURE_(begin != end && BOOST_XPR_CHAR_(char_type, '>') == *begin++, error_paren, "incomplete extension");
263                     return token_named_mark;
264                 case BOOST_XPR_CHAR_(char_type, '='):
265                     this->get_name_(++begin, end, name);
266                     BOOST_XPR_ENSURE_(begin != end, error_paren, "incomplete extension");
267                     return token_named_mark_ref;
268                 default:
269                     BOOST_THROW_EXCEPTION(regex_error(error_badbrace, "unrecognized extension"));
270                 }
271 
272             case BOOST_XPR_CHAR_(char_type, 'i'):
273             case BOOST_XPR_CHAR_(char_type, 'm'):
274             case BOOST_XPR_CHAR_(char_type, 's'):
275             case BOOST_XPR_CHAR_(char_type, 'x'):
276             case BOOST_XPR_CHAR_(char_type, '-'):
277                 return this->parse_mods_(begin, end);
278 
279             default:
280                 BOOST_THROW_EXCEPTION(regex_error(error_badbrace, "unrecognized extension"));
281             }
282         }
283 
284         return token_literal;
285     }
286 
287     //////////////////////////////////////////////////////////////////////////
288     // get_charset_token
289     //  NOTE: white-space is *never* ignored in a charset.
290     template<typename FwdIter>
get_charset_tokenboost::xpressive::compiler_traits291     regex_constants::compiler_token_type get_charset_token(FwdIter &begin, FwdIter end)
292     {
293         using namespace regex_constants;
294         BOOST_ASSERT(begin != end);
295         switch(*begin)
296         {
297         case BOOST_XPR_CHAR_(char_type, '^'): ++begin; return token_charset_invert;
298         case BOOST_XPR_CHAR_(char_type, '-'): ++begin; return token_charset_hyphen;
299         case BOOST_XPR_CHAR_(char_type, ']'): ++begin; return token_charset_end;
300         case BOOST_XPR_CHAR_(char_type, '['):
301             {
302                 FwdIter next = begin; ++next;
303                 if(next != end)
304                 {
305                     BOOST_XPR_ENSURE_(
306                         *next != BOOST_XPR_CHAR_(char_type, '=')
307                       , error_collate
308                       , "equivalence classes are not yet supported"
309                     );
310 
311                     BOOST_XPR_ENSURE_(
312                         *next != BOOST_XPR_CHAR_(char_type, '.')
313                       , error_collate
314                       , "collation sequences are not yet supported"
315                     );
316 
317                     if(*next == BOOST_XPR_CHAR_(char_type, ':'))
318                     {
319                         begin = ++next;
320                         return token_posix_charset_begin;
321                     }
322                 }
323             }
324             break;
325         case BOOST_XPR_CHAR_(char_type, ':'):
326             {
327                 FwdIter next = begin; ++next;
328                 if(next != end && *next == BOOST_XPR_CHAR_(char_type, ']'))
329                 {
330                     begin = ++next;
331                     return token_posix_charset_end;
332                 }
333             }
334             break;
335         case BOOST_XPR_CHAR_(char_type, '\\'):
336             if(++begin != end)
337             {
338                 switch(*begin)
339                 {
340                 case BOOST_XPR_CHAR_(char_type, 'b'): ++begin; return token_charset_backspace;
341                 default:;
342                 }
343             }
344             return token_escape;
345         default:;
346         }
347         return token_literal;
348     }
349 
350     //////////////////////////////////////////////////////////////////////////
351     // get_escape_token
352     template<typename FwdIter>
get_escape_tokenboost::xpressive::compiler_traits353     regex_constants::compiler_token_type get_escape_token(FwdIter &begin, FwdIter end)
354     {
355         using namespace regex_constants;
356         if(begin != end)
357         {
358             switch(*begin)
359             {
360             //case BOOST_XPR_CHAR_(char_type, 'a'): ++begin; return token_escape_bell;
361             //case BOOST_XPR_CHAR_(char_type, 'c'): ++begin; return token_escape_control;
362             //case BOOST_XPR_CHAR_(char_type, 'e'): ++begin; return token_escape_escape;
363             //case BOOST_XPR_CHAR_(char_type, 'f'): ++begin; return token_escape_formfeed;
364             //case BOOST_XPR_CHAR_(char_type, 'n'): ++begin; return token_escape_newline;
365             //case BOOST_XPR_CHAR_(char_type, 't'): ++begin; return token_escape_horizontal_tab;
366             //case BOOST_XPR_CHAR_(char_type, 'v'): ++begin; return token_escape_vertical_tab;
367             case BOOST_XPR_CHAR_(char_type, 'A'): ++begin; return token_assert_begin_sequence;
368             case BOOST_XPR_CHAR_(char_type, 'b'): ++begin; return token_assert_word_boundary;
369             case BOOST_XPR_CHAR_(char_type, 'B'): ++begin; return token_assert_not_word_boundary;
370             case BOOST_XPR_CHAR_(char_type, 'E'): ++begin; return token_quote_meta_end;
371             case BOOST_XPR_CHAR_(char_type, 'Q'): ++begin; return token_quote_meta_begin;
372             case BOOST_XPR_CHAR_(char_type, 'Z'): ++begin; return token_assert_end_sequence;
373             // Non-standard extension to ECMAScript syntax
374             case BOOST_XPR_CHAR_(char_type, '<'): ++begin; return token_assert_word_begin;
375             case BOOST_XPR_CHAR_(char_type, '>'): ++begin; return token_assert_word_end;
376             default:; // fall-through
377             }
378         }
379 
380         return token_escape;
381     }
382 
383 private:
384 
385     //////////////////////////////////////////////////////////////////////////
386     // parse_mods_
387     template<typename FwdIter>
parse_mods_boost::xpressive::compiler_traits388     regex_constants::compiler_token_type parse_mods_(FwdIter &begin, FwdIter end)
389     {
390         using namespace regex_constants;
391         bool set = true;
392         do switch(*begin)
393         {
394         case BOOST_XPR_CHAR_(char_type, 'i'): this->flag_(set, icase_); break;
395         case BOOST_XPR_CHAR_(char_type, 'm'): this->flag_(!set, single_line); break;
396         case BOOST_XPR_CHAR_(char_type, 's'): this->flag_(!set, not_dot_newline); break;
397         case BOOST_XPR_CHAR_(char_type, 'x'): this->flag_(set, ignore_white_space); break;
398         case BOOST_XPR_CHAR_(char_type, ':'): ++begin; BOOST_FALLTHROUGH;
399         case BOOST_XPR_CHAR_(char_type, ')'): return token_no_mark;
400         case BOOST_XPR_CHAR_(char_type, '-'): if(false == (set = !set)) break; BOOST_FALLTHROUGH;
401         default: BOOST_THROW_EXCEPTION(regex_error(error_paren, "unknown pattern modifier"));
402         }
403         while(BOOST_XPR_ENSURE_(++begin != end, error_paren, "incomplete extension"));
404         // this return is technically unreachable, but this must
405         // be here to work around a bug in gcc 4.0
406         return token_no_mark;
407     }
408 
409     ///////////////////////////////////////////////////////////////////////////////
410     // flag_
flag_boost::xpressive::compiler_traits411     void flag_(bool set, regex_constants::syntax_option_type flag)
412     {
413         this->flags_ = set ? (this->flags_ | flag) : (this->flags_ & ~flag);
414     }
415 
416     ///////////////////////////////////////////////////////////////////////////
417     // is_space_
is_space_boost::xpressive::compiler_traits418     bool is_space_(char_type ch) const
419     {
420         return 0 != this->space_ && this->traits().isctype(ch, this->space_);
421     }
422 
423     ///////////////////////////////////////////////////////////////////////////
424     // is_alnum_
is_alnum_boost::xpressive::compiler_traits425     bool is_alnum_(char_type ch) const
426     {
427         return 0 != this->alnum_ && this->traits().isctype(ch, this->alnum_);
428     }
429 
430     ///////////////////////////////////////////////////////////////////////////
431     // get_name_
432     template<typename FwdIter>
get_name_boost::xpressive::compiler_traits433     void get_name_(FwdIter &begin, FwdIter end, string_type &name)
434     {
435         this->eat_ws_(begin, end);
436         for(name.clear(); begin != end && this->is_alnum_(*begin); ++begin)
437         {
438             name.push_back(*begin);
439         }
440         this->eat_ws_(begin, end);
441         BOOST_XPR_ENSURE_(!name.empty(), regex_constants::error_paren, "incomplete extension");
442     }
443 
444     ///////////////////////////////////////////////////////////////////////////////
445     // eat_ws_
446     template<typename FwdIter>
eat_ws_boost::xpressive::compiler_traits447     FwdIter &eat_ws_(FwdIter &begin, FwdIter end)
448     {
449         if(0 != (regex_constants::ignore_white_space & this->flags()))
450         {
451             while(end != begin && (BOOST_XPR_CHAR_(char_type, '#') == *begin || this->is_space_(*begin)))
452             {
453                 if(BOOST_XPR_CHAR_(char_type, '#') == *begin++)
454                 {
455                     while(end != begin && BOOST_XPR_CHAR_(char_type, '\n') != *begin++) {}
456                 }
457                 else
458                 {
459                     for(; end != begin && this->is_space_(*begin); ++begin) {}
460                 }
461             }
462         }
463 
464         return begin;
465     }
466 
467     regex_traits traits_;
468     regex_constants::syntax_option_type flags_;
469     typename regex_traits::char_class_type space_;
470     typename regex_traits::char_class_type alnum_;
471 };
472 
473 }} // namespace boost::xpressive
474 
475 #endif
476