• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  *
3  * Copyright (c) 2004
4  * John Maddock
5  *
6  * Use, modification and distribution are subject to the
7  * Boost Software License, Version 1.0. (See accompanying file
8  * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
9  *
10  */
11 
12  /*
13   *   LOCATION:    see http://www.boost.org for most recent version.
14   *   FILE         basic_regex_parser.hpp
15   *   VERSION      see <boost/version.hpp>
16   *   DESCRIPTION: Declares template class basic_regex_parser.
17   */
18 
19 #ifndef BOOST_REGEX_V4_BASIC_REGEX_PARSER_HPP
20 #define BOOST_REGEX_V4_BASIC_REGEX_PARSER_HPP
21 
22 #ifdef BOOST_MSVC
23 #pragma warning(push)
24 #pragma warning(disable: 4103)
25 #if BOOST_MSVC >= 1800
26 #pragma warning(disable: 26812)
27 #endif
28 #endif
29 #ifdef BOOST_HAS_ABI_HEADERS
30 #  include BOOST_ABI_PREFIX
31 #endif
32 #ifdef BOOST_MSVC
33 #pragma warning(pop)
34 #endif
35 
36 namespace boost{
37 namespace BOOST_REGEX_DETAIL_NS{
38 
39 #ifdef BOOST_MSVC
40 #pragma warning(push)
41 #pragma warning(disable:4244)
42 #if BOOST_MSVC < 1910
43 #pragma warning(disable:4800)
44 #endif
45 #endif
46 
umax(mpl::false_ const &)47 inline boost::intmax_t umax(mpl::false_ const&)
48 {
49    // Get out clause here, just in case numeric_limits is unspecialized:
50    return std::numeric_limits<boost::intmax_t>::is_specialized ? (std::numeric_limits<boost::intmax_t>::max)() : INT_MAX;
51 }
umax(mpl::true_ const &)52 inline boost::intmax_t umax(mpl::true_ const&)
53 {
54    return (std::numeric_limits<std::size_t>::max)();
55 }
56 
umax()57 inline boost::intmax_t umax()
58 {
59    return umax(mpl::bool_<std::numeric_limits<boost::intmax_t>::digits >= std::numeric_limits<std::size_t>::digits>());
60 }
61 
//
// Parser for regular expression text.  The pattern is walked one syntax
// element at a time and the corresponding states are appended through the
// basic_regex_creator base class (append_state / append_literal / append_set).
//
62 template <class charT, class traits>
63 class basic_regex_parser : public basic_regex_creator<charT, traits>
64 {
65 public:
   // Forwards data to the basic_regex_creator base; all parser state starts out zeroed.
66    basic_regex_parser(regex_data<charT, traits>* data);
   // Parses the pattern [p1, p2) using the given syntax flags; errors are reported via fail().
67    void parse(const charT* p1, const charT* p2, unsigned flags);
   // Reports error_code at the given offset using the traits' default message for that code.
68    void fail(regex_constants::error_type error_code, std::ptrdiff_t position);
   // Records the error, stops further parsing, decorates message with the text between
   // start_pos and position (plus what follows), and raises regex_error unless no_except is set.
69    void fail(regex_constants::error_type error_code, std::ptrdiff_t position, std::string message, std::ptrdiff_t start_pos);
   // As above with start_pos == position, which lets the 4-argument overload pick the context window.
fail(regex_constants::error_type error_code,std::ptrdiff_t position,const std::string & message)70    void fail(regex_constants::error_type error_code, std::ptrdiff_t position, const std::string& message)
71    {
72       fail(error_code, position, message, position);
73    }
74 
   // parse_all() repeatedly invokes m_parser_proc until it returns false or the input is consumed.
   // parse_basic / parse_extended / parse_literal are the three candidates for m_parser_proc;
   // the remaining parse_* members handle individual constructs on their behalf.
75    bool parse_all();
76    bool parse_basic();
77    bool parse_extended();
78    bool parse_literal();
79    bool parse_open_paren();
80    bool parse_basic_escape();
81    bool parse_extended_escape();
82    bool parse_match_any();
83    bool parse_repeat(std::size_t low = 0, std::size_t high = (std::numeric_limits<std::size_t>::max)());
84    bool parse_repeat_range(bool isbasic);
85    bool parse_alt();
86    bool parse_set();
87    bool parse_backref();
88    void parse_set_literal(basic_char_set<charT, traits>& char_set);
89    bool parse_inner_set(basic_char_set<charT, traits>& char_set);
90    bool parse_QE();
91    bool parse_perl_extension();
92    bool parse_perl_verb();
93    bool match_verb(const char*);
94    bool add_emacs_code(bool negate);
95    bool unwind_alts(std::ptrdiff_t last_paren_start);
96    digraph<charT> get_next_set_literal(basic_char_set<charT, traits>& char_set);
97    charT unescape_character();
98    regex_constants::syntax_option_type parse_options();
99 
100 private:
101    typedef bool (basic_regex_parser::*parser_proc_type)();
102    typedef typename traits::string_type string_type;
103    typedef typename traits::char_class_type char_class_type;
104    parser_proc_type           m_parser_proc;    // the main parser to use
105    const charT*               m_base;           // the start of the string being parsed
106    const charT*               m_end;            // the end of the string being parsed
107    const charT*               m_position;       // our current parser position
108    unsigned                   m_mark_count;     // how many sub-expressions we have
109    int                        m_mark_reset;     // used to indicate that we're inside a (?|...) block.
110    unsigned                   m_max_mark;       // largest mark count seen inside a (?|...) block.
111    std::ptrdiff_t             m_paren_start;    // where the last seen ')' began (where repeats are inserted).
112    std::ptrdiff_t             m_alt_insert_point; // where to insert the next alternative
113    bool                       m_has_case_change; // true if somewhere in the current block the case has changed
114    unsigned                   m_recursion_count; // How many times we've called parse_all.
115 #if defined(BOOST_MSVC) && defined(_M_IX86)
116    // This is an ugly warning suppression workaround (for warnings *inside* std::vector
117    // that can not otherwise be suppressed)...
118    BOOST_STATIC_ASSERT(sizeof(long) >= sizeof(void*));
119    std::vector<long>           m_alt_jumps;      // list of alternative in the current scope.
120 #else
121    std::vector<std::ptrdiff_t> m_alt_jumps;      // list of alternative in the current scope.
122 #endif
123 
   // Copy assignment and copy construction are declared private here and not defined in the class body.
124    basic_regex_parser& operator=(const basic_regex_parser&);
125    basic_regex_parser(const basic_regex_parser&);
126 };
127 
128 template <class charT, class traits>
basic_regex_parser(regex_data<charT,traits> * data)129 basic_regex_parser<charT, traits>::basic_regex_parser(regex_data<charT, traits>* data)
130    : basic_regex_creator<charT, traits>(data), m_parser_proc(), m_base(0), m_end(0), m_position(0),
131    m_mark_count(0), m_mark_reset(-1), m_max_mark(0), m_paren_start(0), m_alt_insert_point(0), m_has_case_change(false), m_recursion_count(0)
132 {
133 }
134 
135 template <class charT, class traits>
parse(const charT * p1,const charT * p2,unsigned l_flags)136 void basic_regex_parser<charT, traits>::parse(const charT* p1, const charT* p2, unsigned l_flags)
137 {
138    // pass l_flags on to base class:
139    this->init(l_flags);
140    // set up pointers:
141    m_position = m_base = p1;
142    m_end = p2;
143    // empty strings are errors:
144    if((p1 == p2) &&
145       (
146          ((l_flags & regbase::main_option_type) != regbase::perl_syntax_group)
147          || (l_flags & regbase::no_empty_expressions)
148       )
149      )
150    {
151       fail(regex_constants::error_empty, 0);
152       return;
153    }
154    // select which parser to use:
155    switch(l_flags & regbase::main_option_type)
156    {
157    case regbase::perl_syntax_group:
158       {
159          m_parser_proc = &basic_regex_parser<charT, traits>::parse_extended;
160          //
161          // Add a leading paren with index zero to give recursions a target:
162          //
163          re_brace* br = static_cast<re_brace*>(this->append_state(syntax_element_startmark, sizeof(re_brace)));
164          br->index = 0;
165          br->icase = this->flags() & regbase::icase;
166          break;
167       }
168    case regbase::basic_syntax_group:
169       m_parser_proc = &basic_regex_parser<charT, traits>::parse_basic;
170       break;
171    case regbase::literal:
172       m_parser_proc = &basic_regex_parser<charT, traits>::parse_literal;
173       break;
174    default:
175       // Ooops, someone has managed to set more than one of the main option flags,
176       // so this must be an error:
177       fail(regex_constants::error_unknown, 0, "An invalid combination of regular expression syntax flags was used.");
178       return;
179    }
180 
181    // parse all our characters:
182    bool result = parse_all();
183    //
184    // Unwind our alternatives:
185    //
186    unwind_alts(-1);
187    // reset l_flags as a global scope (?imsx) may have altered them:
188    this->flags(l_flags);
189    // if we haven't gobbled up all the characters then we must
190    // have had an unexpected ')' :
191    if(!result)
192    {
193       fail(regex_constants::error_paren, ::boost::BOOST_REGEX_DETAIL_NS::distance(m_base, m_position), "Found a closing ) with no corresponding opening parenthesis.");
194       return;
195    }
196    // if an error has been set then give up now:
197    if(this->m_pdata->m_status)
198       return;
199    // fill in our sub-expression count:
200    this->m_pdata->m_mark_count = 1u + (std::size_t)m_mark_count;
201    this->finalize(p1, p2);
202 }
203 
204 template <class charT, class traits>
fail(regex_constants::error_type error_code,std::ptrdiff_t position)205 void basic_regex_parser<charT, traits>::fail(regex_constants::error_type error_code, std::ptrdiff_t position)
206 {
207    // get the error message:
208    std::string message = this->m_pdata->m_ptraits->error_string(error_code);
209    fail(error_code, position, message);
210 }
211 
212 template <class charT, class traits>
fail(regex_constants::error_type error_code,std::ptrdiff_t position,std::string message,std::ptrdiff_t start_pos)213 void basic_regex_parser<charT, traits>::fail(regex_constants::error_type error_code, std::ptrdiff_t position, std::string message, std::ptrdiff_t start_pos)
214 {
215    if(0 == this->m_pdata->m_status) // update the error code if not already set
216       this->m_pdata->m_status = error_code;
217    m_position = m_end; // don't bother parsing anything else
218 
219 #ifndef BOOST_NO_TEMPLATED_ITERATOR_CONSTRUCTORS
220    //
221    // Augment error message with the regular expression text:
222    //
223    if(start_pos == position)
224       start_pos = (std::max)(static_cast<std::ptrdiff_t>(0), position - static_cast<std::ptrdiff_t>(10));
225    std::ptrdiff_t end_pos = (std::min)(position + static_cast<std::ptrdiff_t>(10), static_cast<std::ptrdiff_t>(m_end - m_base));
226    if(error_code != regex_constants::error_empty)
227    {
228       if((start_pos != 0) || (end_pos != (m_end - m_base)))
229          message += "  The error occurred while parsing the regular expression fragment: '";
230       else
231          message += "  The error occurred while parsing the regular expression: '";
232       if(start_pos != end_pos)
233       {
234          message += std::string(m_base + start_pos, m_base + position);
235          message += ">>>HERE>>>";
236          message += std::string(m_base + position, m_base + end_pos);
237       }
238       message += "'.";
239    }
240 #endif
241 
242 #ifndef BOOST_NO_EXCEPTIONS
243    if(0 == (this->flags() & regex_constants::no_except))
244    {
245       boost::regex_error e(message, error_code, position);
246       e.raise();
247    }
248 #else
249    (void)position; // suppress warnings.
250 #endif
251 }
252 
253 template <class charT, class traits>
parse_all()254 bool basic_regex_parser<charT, traits>::parse_all()
255 {
256    if (++m_recursion_count > 400)
257    {
258       // exceeded internal limits
259       fail(boost::regex_constants::error_complexity, m_position - m_base, "Exceeded nested brace limit.");
260    }
261    bool result = true;
262    while(result && (m_position != m_end))
263    {
264       result = (this->*m_parser_proc)();
265    }
266    --m_recursion_count;
267    return result;
268 }
269 
270 #ifdef BOOST_MSVC
271 #pragma warning(push)
272 #pragma warning(disable:4702)
273 #endif
274 template <class charT, class traits>
parse_basic()275 bool basic_regex_parser<charT, traits>::parse_basic()
276 {
277    switch(this->m_traits.syntax_type(*m_position))
278    {
279    case regex_constants::syntax_escape:
280       return parse_basic_escape();
281    case regex_constants::syntax_dot:
282       return parse_match_any();
283    case regex_constants::syntax_caret:
284       ++m_position;
285       this->append_state(syntax_element_start_line);
286       break;
287    case regex_constants::syntax_dollar:
288       ++m_position;
289       this->append_state(syntax_element_end_line);
290       break;
291    case regex_constants::syntax_star:
292       if(!(this->m_last_state) || (this->m_last_state->type == syntax_element_start_line))
293          return parse_literal();
294       else
295       {
296          ++m_position;
297          return parse_repeat();
298       }
299    case regex_constants::syntax_plus:
300       if(!(this->m_last_state) || (this->m_last_state->type == syntax_element_start_line) || !(this->flags() & regbase::emacs_ex))
301          return parse_literal();
302       else
303       {
304          ++m_position;
305          return parse_repeat(1);
306       }
307    case regex_constants::syntax_question:
308       if(!(this->m_last_state) || (this->m_last_state->type == syntax_element_start_line) || !(this->flags() & regbase::emacs_ex))
309          return parse_literal();
310       else
311       {
312          ++m_position;
313          return parse_repeat(0, 1);
314       }
315    case regex_constants::syntax_open_set:
316       return parse_set();
317    case regex_constants::syntax_newline:
318       if(this->flags() & regbase::newline_alt)
319          return parse_alt();
320       else
321          return parse_literal();
322    default:
323       return parse_literal();
324    }
325    return true;
326 }
327 
328 #ifdef BOOST_MSVC
329 #  pragma warning(push)
330 #if BOOST_MSVC >= 1800
331 #pragma warning(disable:26812)
332 #endif
333 #endif
//
// Handles the syntax element at m_position for the Perl / extended grammar
// (parse() installs this routine as m_parser_proc for perl_syntax_group).
// Returns false when an error was reported, and also when a closing
// parenthesis is met; in the latter case m_position is left on the ')' and
// the false return stops the parse_all() loop.
//
334 template <class charT, class traits>
parse_extended()335 bool basic_regex_parser<charT, traits>::parse_extended()
336 {
337    bool result = true;
338    switch(this->m_traits.syntax_type(*m_position))
339    {
340    case regex_constants::syntax_open_mark:
341       return parse_open_paren();
342    case regex_constants::syntax_close_mark:
      // not consumed here: returning false ends the enclosing parse_all() loop
343       return false;
344    case regex_constants::syntax_escape:
345       return parse_extended_escape();
346    case regex_constants::syntax_dot:
347       return parse_match_any();
348    case regex_constants::syntax_caret:
349       ++m_position;
      // with no_mod_m set, ^ is appended as a buffer-start rather than a line-start assertion
350       this->append_state(
351          (this->flags() & regex_constants::no_mod_m ? syntax_element_buffer_start : syntax_element_start_line));
352       break;
353    case regex_constants::syntax_dollar:
354       ++m_position;
      // with no_mod_m set, $ is appended as a buffer-end rather than a line-end assertion
355       this->append_state(
356          (this->flags() & regex_constants::no_mod_m ? syntax_element_buffer_end : syntax_element_end_line));
357       break;
358    case regex_constants::syntax_star:
      // a repeat operator as the very first character has nothing to apply to
359       if(m_position == this->m_base)
360       {
361          fail(regex_constants::error_badrepeat, 0, "The repeat operator \"*\" cannot start a regular expression.");
362          return false;
363       }
364       ++m_position;
365       return parse_repeat();
366    case regex_constants::syntax_question:
367       if(m_position == this->m_base)
368       {
369          fail(regex_constants::error_badrepeat, 0, "The repeat operator \"?\" cannot start a regular expression.");
370          return false;
371       }
372       ++m_position;
373       return parse_repeat(0,1);
374    case regex_constants::syntax_plus:
375       if(m_position == this->m_base)
376       {
377          fail(regex_constants::error_badrepeat, 0, "The repeat operator \"+\" cannot start a regular expression.");
378          return false;
379       }
380       ++m_position;
381       return parse_repeat(1);
382    case regex_constants::syntax_open_brace:
383       ++m_position;
384       return parse_repeat_range(false);
385    case regex_constants::syntax_close_brace:
      // a stray } is an error only when no_perl_ex is set; otherwise it is taken literally
386       if((this->flags() & regbase::no_perl_ex) == regbase::no_perl_ex)
387       {
388          fail(regex_constants::error_brace, this->m_position - this->m_base, "Found a closing repetition operator } with no corresponding {.");
389          return false;
390       }
391       result = parse_literal();
392       break;
393    case regex_constants::syntax_or:
394       return parse_alt();
395    case regex_constants::syntax_open_set:
396       return parse_set();
397    case regex_constants::syntax_newline:
398       if(this->flags() & regbase::newline_alt)
399          return parse_alt();
400       else
401          return parse_literal();
402    case regex_constants::syntax_hash:
403       //
404       // If we have a mod_x flag set, then skip until
405       // we get to a newline character:
406       //
407       if((this->flags()
408          & (regbase::no_perl_ex|regbase::mod_x))
409          == regbase::mod_x)
410       {
         // the post-increment also consumes the separator that ends the skipped text
411          while((m_position != m_end) && !is_separator(*m_position++)){}
412          return true;
413       }
414       BOOST_FALLTHROUGH;
415    default:
416       result = parse_literal();
417       break;
418    }
419    return result;
420 }
421 #ifdef BOOST_MSVC
422 #  pragma warning(pop)
423 #endif
424 #ifdef BOOST_MSVC
425 #pragma warning(pop)
426 #endif
427 
428 template <class charT, class traits>
parse_literal()429 bool basic_regex_parser<charT, traits>::parse_literal()
430 {
431    // append this as a literal provided it's not a space character
432    // or the perl option regbase::mod_x is not set:
433    if(
434       ((this->flags()
435          & (regbase::main_option_type|regbase::mod_x|regbase::no_perl_ex))
436             != regbase::mod_x)
437       || !this->m_traits.isctype(*m_position, this->m_mask_space))
438          this->append_literal(*m_position);
439    ++m_position;
440    return true;
441 }
442 
//
// Parses a parenthesised group.  The opening parenthesis is skipped first;
// (?...) and (*...) forms are handed to parse_perl_extension() /
// parse_perl_verb() when the flags allow them.  Otherwise a start-mark state
// is appended, the group body is parsed by a nested parse_all() call, and an
// end-mark state is appended once the matching ')' has been consumed.
// Flags, case-change tracking, branch-reset state and the alternative
// insertion point are saved before the nested parse and restored afterwards.
// Returns false if an error was reported or no matching ')' was found.
//
443 template <class charT, class traits>
parse_open_paren()444 bool basic_regex_parser<charT, traits>::parse_open_paren()
445 {
446    //
447    // skip the '(' and error check:
448    //
449    if(++m_position == m_end)
450    {
451       fail(regex_constants::error_paren, m_position - m_base);
452       return false;
453    }
454    //
455    // begin by checking for a perl-style (?...) extension:
456    //
457    if(
458          ((this->flags() & (regbase::main_option_type | regbase::no_perl_ex)) == 0)
459          || ((this->flags() & (regbase::main_option_type | regbase::emacs_ex)) == (regbase::basic_syntax_group|regbase::emacs_ex))
460      )
461    {
462       if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_question)
463          return parse_perl_extension();
464       if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_star)
465          return parse_perl_verb();
466    }
467    //
468    // update our mark count, and append the required state:
469    //
   // markid stays 0 when the nosubs flag is set
470    unsigned markid = 0;
471    if(0 == (this->flags() & regbase::nosubs))
472    {
473       markid = ++m_mark_count;
      // record where this group starts: m_position is already one past the
      // opening parenthesis, hence the -1; the end offset is filled in later
474 #ifndef BOOST_NO_STD_DISTANCE
475       if(this->flags() & regbase::save_subexpression_location)
476          this->m_pdata->m_subs.push_back(std::pair<std::size_t, std::size_t>(std::distance(m_base, m_position) - 1, 0));
477 #else
478       if(this->flags() & regbase::save_subexpression_location)
479          this->m_pdata->m_subs.push_back(std::pair<std::size_t, std::size_t>((m_position - m_base) - 1, 0));
480 #endif
481    }
482    re_brace* pb = static_cast<re_brace*>(this->append_state(syntax_element_startmark, sizeof(re_brace)));
483    pb->index = markid;
484    pb->icase = this->flags() & regbase::icase;
   // keep the offset of the start-mark rather than the pointer; pb is re-assigned further down
485    std::ptrdiff_t last_paren_start = this->getoffset(pb);
486    // back up insertion point for alternations, and set new point:
487    std::ptrdiff_t last_alt_point = m_alt_insert_point;
488    this->m_pdata->m_data.align();
489    m_alt_insert_point = this->m_pdata->m_data.size();
490    //
491    // back up the current flags in case we have a nested (?imsx) group:
492    //
493    regex_constants::syntax_option_type opts = this->flags();
494    bool old_case_change = m_has_case_change;
495    m_has_case_change = false; // no changes to this scope as yet...
496    //
497    // Back up branch reset data in case we have a nested (?|...)
498    //
499    int mark_reset = m_mark_reset;
500    m_mark_reset = -1;
501    //
502    // now recursively add more states, this will terminate when we get to a
503    // matching ')' :
504    //
   // the return value is deliberately unused: the checks on m_position below
   // decide whether a closing parenthesis was actually reached
505    parse_all();
506    //
507    // Unwind pushed alternatives:
508    //
509    if(0 == unwind_alts(last_paren_start))
510       return false;
511    //
512    // restore flags:
513    //
514    if(m_has_case_change)
515    {
516       // the case has changed in one or more of the alternatives
517       // within the scoped (...) block: we have to add a state
518       // to reset the case sensitivity:
519       static_cast<re_case*>(
520          this->append_state(syntax_element_toggle_case, sizeof(re_case))
521          )->icase = opts & regbase::icase;
522    }
523    this->flags(opts);
524    m_has_case_change = old_case_change;
525    //
526    // restore branch reset:
527    //
528    m_mark_reset = mark_reset;
529    //
530    // we either have a ')' or we have run out of characters prematurely:
531    //
532    if(m_position == m_end)
533    {
534       this->fail(regex_constants::error_paren, ::boost::BOOST_REGEX_DETAIL_NS::distance(m_base, m_end));
535       return false;
536    }
537    if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark)
538       return false;
   // complete the location record started above with the offset of the ')'
539 #ifndef BOOST_NO_STD_DISTANCE
540    if(markid && (this->flags() & regbase::save_subexpression_location))
541       this->m_pdata->m_subs.at(markid - 1).second = std::distance(m_base, m_position);
542 #else
543    if(markid && (this->flags() & regbase::save_subexpression_location))
544       this->m_pdata->m_subs.at(markid - 1).second = (m_position - m_base);
545 #endif
546    ++m_position;
547    //
548    // append closing parenthesis state:
549    //
550    pb = static_cast<re_brace*>(this->append_state(syntax_element_endmark, sizeof(re_brace)));
551    pb->index = markid;
552    pb->icase = this->flags() & regbase::icase;
553    this->m_paren_start = last_paren_start;
554    //
555    // restore the alternate insertion point:
556    //
557    this->m_alt_insert_point = last_alt_point;
558    //
559    // allow backrefs to this mark:
560    //
561    if(markid > 0)
562       this->m_backrefs.set(markid);
563 
564    return true;
565 }
566 
567 template <class charT, class traits>
parse_basic_escape()568 bool basic_regex_parser<charT, traits>::parse_basic_escape()
569 {
570    if(++m_position == m_end)
571    {
572       fail(regex_constants::error_paren, m_position - m_base);
573       return false;
574    }
575    bool result = true;
576    switch(this->m_traits.escape_syntax_type(*m_position))
577    {
578    case regex_constants::syntax_open_mark:
579       return parse_open_paren();
580    case regex_constants::syntax_close_mark:
581       return false;
582    case regex_constants::syntax_plus:
583       if(this->flags() & regex_constants::bk_plus_qm)
584       {
585          ++m_position;
586          return parse_repeat(1);
587       }
588       else
589          return parse_literal();
590    case regex_constants::syntax_question:
591       if(this->flags() & regex_constants::bk_plus_qm)
592       {
593          ++m_position;
594          return parse_repeat(0, 1);
595       }
596       else
597          return parse_literal();
598    case regex_constants::syntax_open_brace:
599       if(this->flags() & regbase::no_intervals)
600          return parse_literal();
601       ++m_position;
602       return parse_repeat_range(true);
603    case regex_constants::syntax_close_brace:
604       if(this->flags() & regbase::no_intervals)
605          return parse_literal();
606       fail(regex_constants::error_brace, this->m_position - this->m_base, "Found a closing repetition operator } with no corresponding {.");
607       return false;
608    case regex_constants::syntax_or:
609       if(this->flags() & regbase::bk_vbar)
610          return parse_alt();
611       else
612          result = parse_literal();
613       break;
614    case regex_constants::syntax_digit:
615       return parse_backref();
616    case regex_constants::escape_type_start_buffer:
617       if(this->flags() & regbase::emacs_ex)
618       {
619          ++m_position;
620          this->append_state(syntax_element_buffer_start);
621       }
622       else
623          result = parse_literal();
624       break;
625    case regex_constants::escape_type_end_buffer:
626       if(this->flags() & regbase::emacs_ex)
627       {
628          ++m_position;
629          this->append_state(syntax_element_buffer_end);
630       }
631       else
632          result = parse_literal();
633       break;
634    case regex_constants::escape_type_word_assert:
635       if(this->flags() & regbase::emacs_ex)
636       {
637          ++m_position;
638          this->append_state(syntax_element_word_boundary);
639       }
640       else
641          result = parse_literal();
642       break;
643    case regex_constants::escape_type_not_word_assert:
644       if(this->flags() & regbase::emacs_ex)
645       {
646          ++m_position;
647          this->append_state(syntax_element_within_word);
648       }
649       else
650          result = parse_literal();
651       break;
652    case regex_constants::escape_type_left_word:
653       if(this->flags() & regbase::emacs_ex)
654       {
655          ++m_position;
656          this->append_state(syntax_element_word_start);
657       }
658       else
659          result = parse_literal();
660       break;
661    case regex_constants::escape_type_right_word:
662       if(this->flags() & regbase::emacs_ex)
663       {
664          ++m_position;
665          this->append_state(syntax_element_word_end);
666       }
667       else
668          result = parse_literal();
669       break;
670    default:
671       if(this->flags() & regbase::emacs_ex)
672       {
673          bool negate = true;
674          switch(*m_position)
675          {
676          case 'w':
677             negate = false;
678             BOOST_FALLTHROUGH;
679          case 'W':
680             {
681             basic_char_set<charT, traits> char_set;
682             if(negate)
683                char_set.negate();
684             char_set.add_class(this->m_word_mask);
685             if(0 == this->append_set(char_set))
686             {
687                fail(regex_constants::error_ctype, m_position - m_base);
688                return false;
689             }
690             ++m_position;
691             return true;
692             }
693          case 's':
694             negate = false;
695             BOOST_FALLTHROUGH;
696          case 'S':
697             return add_emacs_code(negate);
698          case 'c':
699          case 'C':
700             // not supported yet:
701             fail(regex_constants::error_escape, m_position - m_base, "The \\c and \\C escape sequences are not supported by POSIX basic regular expressions: try the Perl syntax instead.");
702             return false;
703          default:
704             break;
705          }
706       }
707       result = parse_literal();
708       break;
709    }
710    return result;
711 }
712 
713 template <class charT, class traits>
parse_extended_escape()714 bool basic_regex_parser<charT, traits>::parse_extended_escape()
715 {
716    ++m_position;
717    if(m_position == m_end)
718    {
719       fail(regex_constants::error_escape, m_position - m_base, "Incomplete escape sequence found.");
720       return false;
721    }
722    bool negate = false; // in case this is a character class escape: \w \d etc
723    switch(this->m_traits.escape_syntax_type(*m_position))
724    {
725    case regex_constants::escape_type_not_class:
726       negate = true;
727       BOOST_FALLTHROUGH;
728    case regex_constants::escape_type_class:
729       {
730 escape_type_class_jump:
731          typedef typename traits::char_class_type m_type;
732          m_type m = this->m_traits.lookup_classname(m_position, m_position+1);
733          if(m != 0)
734          {
735             basic_char_set<charT, traits> char_set;
736             if(negate)
737                char_set.negate();
738             char_set.add_class(m);
739             if(0 == this->append_set(char_set))
740             {
741                fail(regex_constants::error_ctype, m_position - m_base);
742                return false;
743             }
744             ++m_position;
745             return true;
746          }
747          //
748          // not a class, just a regular unknown escape:
749          //
750          this->append_literal(unescape_character());
751          break;
752       }
753    case regex_constants::syntax_digit:
754       return parse_backref();
755    case regex_constants::escape_type_left_word:
756       ++m_position;
757       this->append_state(syntax_element_word_start);
758       break;
759    case regex_constants::escape_type_right_word:
760       ++m_position;
761       this->append_state(syntax_element_word_end);
762       break;
763    case regex_constants::escape_type_start_buffer:
764       ++m_position;
765       this->append_state(syntax_element_buffer_start);
766       break;
767    case regex_constants::escape_type_end_buffer:
768       ++m_position;
769       this->append_state(syntax_element_buffer_end);
770       break;
771    case regex_constants::escape_type_word_assert:
772       ++m_position;
773       this->append_state(syntax_element_word_boundary);
774       break;
775    case regex_constants::escape_type_not_word_assert:
776       ++m_position;
777       this->append_state(syntax_element_within_word);
778       break;
779    case regex_constants::escape_type_Z:
780       ++m_position;
781       this->append_state(syntax_element_soft_buffer_end);
782       break;
783    case regex_constants::escape_type_Q:
784       return parse_QE();
785    case regex_constants::escape_type_C:
786       return parse_match_any();
787    case regex_constants::escape_type_X:
788       ++m_position;
789       this->append_state(syntax_element_combining);
790       break;
791    case regex_constants::escape_type_G:
792       ++m_position;
793       this->append_state(syntax_element_restart_continue);
794       break;
795    case regex_constants::escape_type_not_property:
796       negate = true;
797       BOOST_FALLTHROUGH;
798    case regex_constants::escape_type_property:
799       {
800          ++m_position;
801          char_class_type m;
802          if(m_position == m_end)
803          {
804             fail(regex_constants::error_escape, m_position - m_base, "Incomplete property escape found.");
805             return false;
806          }
807          // maybe have \p{ddd}
808          if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_open_brace)
809          {
810             const charT* base = m_position;
811             // skip forward until we find enclosing brace:
812             while((m_position != m_end) && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_brace))
813                ++m_position;
814             if(m_position == m_end)
815             {
816                fail(regex_constants::error_escape, m_position - m_base, "Closing } missing from property escape sequence.");
817                return false;
818             }
819             m = this->m_traits.lookup_classname(++base, m_position++);
820          }
821          else
822          {
823             m = this->m_traits.lookup_classname(m_position, m_position+1);
824             ++m_position;
825          }
826          if(m != 0)
827          {
828             basic_char_set<charT, traits> char_set;
829             if(negate)
830                char_set.negate();
831             char_set.add_class(m);
832             if(0 == this->append_set(char_set))
833             {
834                fail(regex_constants::error_ctype, m_position - m_base);
835                return false;
836             }
837             return true;
838          }
839          fail(regex_constants::error_ctype, m_position - m_base, "Escape sequence was neither a valid property nor a valid character class name.");
840          return false;
841       }
842    case regex_constants::escape_type_reset_start_mark:
843       if(0 == (this->flags() & (regbase::main_option_type | regbase::no_perl_ex)))
844       {
845          re_brace* pb = static_cast<re_brace*>(this->append_state(syntax_element_startmark, sizeof(re_brace)));
846          pb->index = -5;
847          pb->icase = this->flags() & regbase::icase;
848          this->m_pdata->m_data.align();
849          ++m_position;
850          return true;
851       }
852       goto escape_type_class_jump;
853    case regex_constants::escape_type_line_ending:
854       if(0 == (this->flags() & (regbase::main_option_type | regbase::no_perl_ex)))
855       {
856          const charT* e = get_escape_R_string<charT>();
857          const charT* old_position = m_position;
858          const charT* old_end = m_end;
859          const charT* old_base = m_base;
860          m_position = e;
861          m_base = e;
862          m_end = e + traits::length(e);
863          bool r = parse_all();
864          m_position = ++old_position;
865          m_end = old_end;
866          m_base = old_base;
867          return r;
868       }
869       goto escape_type_class_jump;
870    case regex_constants::escape_type_extended_backref:
871       if(0 == (this->flags() & (regbase::main_option_type | regbase::no_perl_ex)))
872       {
873          bool have_brace = false;
874          bool negative = false;
875          static const char incomplete_message[] = "Incomplete \\g escape found.";
876          if(++m_position == m_end)
877          {
878             fail(regex_constants::error_escape, m_position - m_base, incomplete_message);
879             return false;
880          }
881          // maybe have \g{ddd}
882          regex_constants::syntax_type syn = this->m_traits.syntax_type(*m_position);
883          regex_constants::syntax_type syn_end = 0;
884          if((syn == regex_constants::syntax_open_brace)
885             || (syn == regex_constants::escape_type_left_word)
886             || (syn == regex_constants::escape_type_end_buffer))
887          {
888             if(++m_position == m_end)
889             {
890                fail(regex_constants::error_escape, m_position - m_base, incomplete_message);
891                return false;
892             }
893             have_brace = true;
894             switch(syn)
895             {
896             case regex_constants::syntax_open_brace:
897                syn_end = regex_constants::syntax_close_brace;
898                break;
899             case regex_constants::escape_type_left_word:
900                syn_end = regex_constants::escape_type_right_word;
901                break;
902             default:
903                syn_end = regex_constants::escape_type_end_buffer;
904                break;
905             }
906          }
907          negative = (*m_position == static_cast<charT>('-'));
908          if((negative) && (++m_position == m_end))
909          {
910             fail(regex_constants::error_escape, m_position - m_base, incomplete_message);
911             return false;
912          }
913          const charT* pc = m_position;
914          boost::intmax_t i = this->m_traits.toi(pc, m_end, 10);
915          if((i < 0) && syn_end)
916          {
917             // Check for a named capture, get the leftmost one if there is more than one:
918             const charT* base = m_position;
919             while((m_position != m_end) && (this->m_traits.syntax_type(*m_position) != syn_end))
920             {
921                ++m_position;
922             }
923             i = hash_value_from_capture_name(base, m_position);
924             pc = m_position;
925          }
926          if(negative)
927             i = 1 + (static_cast<boost::intmax_t>(m_mark_count) - i);
928          if(((i < hash_value_mask) && (i > 0) && (this->m_backrefs.test(i))) || ((i >= hash_value_mask) && (this->m_pdata->get_id(i) > 0) && (this->m_backrefs.test(this->m_pdata->get_id(i)))))
929          {
930             m_position = pc;
931             re_brace* pb = static_cast<re_brace*>(this->append_state(syntax_element_backref, sizeof(re_brace)));
932             pb->index = i;
933             pb->icase = this->flags() & regbase::icase;
934          }
935          else
936          {
937             fail(regex_constants::error_backref, m_position - m_base);
938             return false;
939          }
940          m_position = pc;
941          if(have_brace)
942          {
943             if((m_position == m_end) || (this->m_traits.syntax_type(*m_position) != syn_end))
944             {
945                fail(regex_constants::error_escape, m_position - m_base, incomplete_message);
946                return false;
947             }
948             ++m_position;
949          }
950          return true;
951       }
952       goto escape_type_class_jump;
953    case regex_constants::escape_type_control_v:
954       if(0 == (this->flags() & (regbase::main_option_type | regbase::no_perl_ex)))
955          goto escape_type_class_jump;
956       BOOST_FALLTHROUGH;
957    default:
958       this->append_literal(unescape_character());
959       break;
960    }
961    return true;
962 }
963 
964 template <class charT, class traits>
parse_match_any()965 bool basic_regex_parser<charT, traits>::parse_match_any()
966 {
967    //
968    // we have a '.' that can match any character:
969    //
970    ++m_position;
971    static_cast<re_dot*>(
972       this->append_state(syntax_element_wild, sizeof(re_dot))
973       )->mask = static_cast<unsigned char>(this->flags() & regbase::no_mod_s
974       ? BOOST_REGEX_DETAIL_NS::force_not_newline
975          : this->flags() & regbase::mod_s ?
976             BOOST_REGEX_DETAIL_NS::force_newline : BOOST_REGEX_DETAIL_NS::dont_care);
977    return true;
978 }
979 
980 template <class charT, class traits>
parse_repeat(std::size_t low,std::size_t high)981 bool basic_regex_parser<charT, traits>::parse_repeat(std::size_t low, std::size_t high)
982 {
983    bool greedy = true;
984    bool pocessive = false;
985    std::size_t insert_point;
986    //
987    // when we get to here we may have a non-greedy ? mark still to come:
988    //
989    if((m_position != m_end)
990       && (
991             (0 == (this->flags() & (regbase::main_option_type | regbase::no_perl_ex)))
992             || ((regbase::basic_syntax_group|regbase::emacs_ex) == (this->flags() & (regbase::main_option_type | regbase::emacs_ex)))
993          )
994       )
995    {
996       // OK we have a perl or emacs regex, check for a '?':
997       if ((this->flags() & (regbase::main_option_type | regbase::mod_x | regbase::no_perl_ex)) == regbase::mod_x)
998       {
999          // whitespace skip:
1000          while ((m_position != m_end) && this->m_traits.isctype(*m_position, this->m_mask_space))
1001             ++m_position;
1002       }
1003       if((m_position != m_end) && (this->m_traits.syntax_type(*m_position) == regex_constants::syntax_question))
1004       {
1005          greedy = false;
1006          ++m_position;
1007       }
1008       // for perl regexes only check for pocessive ++ repeats.
1009       if((m_position != m_end)
1010          && (0 == (this->flags() & regbase::main_option_type))
1011          && (this->m_traits.syntax_type(*m_position) == regex_constants::syntax_plus))
1012       {
1013          pocessive = true;
1014          ++m_position;
1015       }
1016    }
1017    if(0 == this->m_last_state)
1018    {
1019       fail(regex_constants::error_badrepeat, ::boost::BOOST_REGEX_DETAIL_NS::distance(m_base, m_position), "Nothing to repeat.");
1020       return false;
1021    }
1022    if(this->m_last_state->type == syntax_element_endmark)
1023    {
1024       // insert a repeat before the '(' matching the last ')':
1025       insert_point = this->m_paren_start;
1026    }
1027    else if((this->m_last_state->type == syntax_element_literal) && (static_cast<re_literal*>(this->m_last_state)->length > 1))
1028    {
1029       // the last state was a literal with more than one character, split it in two:
1030       re_literal* lit = static_cast<re_literal*>(this->m_last_state);
1031       charT c = (static_cast<charT*>(static_cast<void*>(lit+1)))[lit->length - 1];
1032       lit->length -= 1;
1033       // now append new state:
1034       lit = static_cast<re_literal*>(this->append_state(syntax_element_literal, sizeof(re_literal) + sizeof(charT)));
1035       lit->length = 1;
1036       (static_cast<charT*>(static_cast<void*>(lit+1)))[0] = c;
1037       insert_point = this->getoffset(this->m_last_state);
1038    }
1039    else
1040    {
1041       // repeat the last state whatever it was, need to add some error checking here:
1042       switch(this->m_last_state->type)
1043       {
1044       case syntax_element_start_line:
1045       case syntax_element_end_line:
1046       case syntax_element_word_boundary:
1047       case syntax_element_within_word:
1048       case syntax_element_word_start:
1049       case syntax_element_word_end:
1050       case syntax_element_buffer_start:
1051       case syntax_element_buffer_end:
1052       case syntax_element_alt:
1053       case syntax_element_soft_buffer_end:
1054       case syntax_element_restart_continue:
1055       case syntax_element_jump:
1056       case syntax_element_startmark:
1057       case syntax_element_backstep:
1058          // can't legally repeat any of the above:
1059          fail(regex_constants::error_badrepeat, m_position - m_base);
1060          return false;
1061       default:
1062          // do nothing...
1063          break;
1064       }
1065       insert_point = this->getoffset(this->m_last_state);
1066    }
1067    //
1068    // OK we now know what to repeat, so insert the repeat around it:
1069    //
1070    re_repeat* rep = static_cast<re_repeat*>(this->insert_state(insert_point, syntax_element_rep, re_repeater_size));
1071    rep->min = low;
1072    rep->max = high;
1073    rep->greedy = greedy;
1074    rep->leading = false;
1075    // store our repeater position for later:
1076    std::ptrdiff_t rep_off = this->getoffset(rep);
1077    // and append a back jump to the repeat:
1078    re_jump* jmp = static_cast<re_jump*>(this->append_state(syntax_element_jump, sizeof(re_jump)));
1079    jmp->alt.i = rep_off - this->getoffset(jmp);
1080    this->m_pdata->m_data.align();
1081    // now fill in the alt jump for the repeat:
1082    rep = static_cast<re_repeat*>(this->getaddress(rep_off));
1083    rep->alt.i = this->m_pdata->m_data.size() - rep_off;
1084    //
1085    // If the repeat is pocessive then bracket the repeat with a (?>...)
1086    // independent sub-expression construct:
1087    //
1088    if(pocessive)
1089    {
1090       if(m_position != m_end)
1091       {
1092          //
1093          // Check for illegal following quantifier, we have to do this here, because
1094          // the extra states we insert below circumvents our usual error checking :-(
1095          //
1096          bool contin = false;
1097          do
1098          {
1099             if ((this->flags() & (regbase::main_option_type | regbase::mod_x | regbase::no_perl_ex)) == regbase::mod_x)
1100             {
1101                // whitespace skip:
1102                while ((m_position != m_end) && this->m_traits.isctype(*m_position, this->m_mask_space))
1103                   ++m_position;
1104             }
1105             if (m_position != m_end)
1106             {
1107                switch (this->m_traits.syntax_type(*m_position))
1108                {
1109                case regex_constants::syntax_star:
1110                case regex_constants::syntax_plus:
1111                case regex_constants::syntax_question:
1112                case regex_constants::syntax_open_brace:
1113                   fail(regex_constants::error_badrepeat, m_position - m_base);
1114                   return false;
1115                case regex_constants::syntax_open_mark:
1116                   // Do we have a comment?  If so we need to skip it here...
1117                   if ((m_position + 2 < m_end) && this->m_traits.syntax_type(*(m_position + 1)) == regex_constants::syntax_question
1118                      && this->m_traits.syntax_type(*(m_position + 2)) == regex_constants::syntax_hash)
1119                   {
1120                      while ((m_position != m_end)
1121                         && (this->m_traits.syntax_type(*m_position++) != regex_constants::syntax_close_mark)) {
1122                      }
1123                      contin = true;
1124                   }
1125                   else
1126                      contin = false;
1127                }
1128             }
1129             else
1130                contin = false;
1131          } while (contin);
1132       }
1133       re_brace* pb = static_cast<re_brace*>(this->insert_state(insert_point, syntax_element_startmark, sizeof(re_brace)));
1134       pb->index = -3;
1135       pb->icase = this->flags() & regbase::icase;
1136       jmp = static_cast<re_jump*>(this->insert_state(insert_point + sizeof(re_brace), syntax_element_jump, sizeof(re_jump)));
1137       this->m_pdata->m_data.align();
1138       jmp->alt.i = this->m_pdata->m_data.size() - this->getoffset(jmp);
1139       pb = static_cast<re_brace*>(this->append_state(syntax_element_endmark, sizeof(re_brace)));
1140       pb->index = -3;
1141       pb->icase = this->flags() & regbase::icase;
1142    }
1143    return true;
1144 }
1145 
1146 template <class charT, class traits>
parse_repeat_range(bool isbasic)1147 bool basic_regex_parser<charT, traits>::parse_repeat_range(bool isbasic)
1148 {
1149    static const char incomplete_message[] = "Missing } in quantified repetition.";
1150    //
1151    // parse a repeat-range:
1152    //
1153    std::size_t min, max;
1154    boost::intmax_t v;
1155    // skip whitespace:
1156    while((m_position != m_end) && this->m_traits.isctype(*m_position, this->m_mask_space))
1157       ++m_position;
1158    if(this->m_position == this->m_end)
1159    {
1160       if(this->flags() & (regbase::main_option_type | regbase::no_perl_ex))
1161       {
1162          fail(regex_constants::error_brace, this->m_position - this->m_base, incomplete_message);
1163          return false;
1164       }
1165       // Treat the opening '{' as a literal character, rewind to start of error:
1166       --m_position;
1167       while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_brace) --m_position;
1168       return parse_literal();
1169    }
1170    // get min:
1171    v = this->m_traits.toi(m_position, m_end, 10);
1172    // skip whitespace:
1173    if((v < 0) || (v > umax()))
1174    {
1175       if(this->flags() & (regbase::main_option_type | regbase::no_perl_ex))
1176       {
1177          fail(regex_constants::error_brace, this->m_position - this->m_base, incomplete_message);
1178          return false;
1179       }
1180       // Treat the opening '{' as a literal character, rewind to start of error:
1181       --m_position;
1182       while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_brace) --m_position;
1183       return parse_literal();
1184    }
1185    while((m_position != m_end) && this->m_traits.isctype(*m_position, this->m_mask_space))
1186       ++m_position;
1187    if(this->m_position == this->m_end)
1188    {
1189       if(this->flags() & (regbase::main_option_type | regbase::no_perl_ex))
1190       {
1191          fail(regex_constants::error_brace, this->m_position - this->m_base, incomplete_message);
1192          return false;
1193       }
1194       // Treat the opening '{' as a literal character, rewind to start of error:
1195       --m_position;
1196       while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_brace) --m_position;
1197       return parse_literal();
1198    }
1199    min = static_cast<std::size_t>(v);
1200    // see if we have a comma:
1201    if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_comma)
1202    {
1203       // move on and error check:
1204       ++m_position;
1205       // skip whitespace:
1206       while((m_position != m_end) && this->m_traits.isctype(*m_position, this->m_mask_space))
1207          ++m_position;
1208       if(this->m_position == this->m_end)
1209       {
1210          if(this->flags() & (regbase::main_option_type | regbase::no_perl_ex))
1211          {
1212             fail(regex_constants::error_brace, this->m_position - this->m_base, incomplete_message);
1213             return false;
1214          }
1215          // Treat the opening '{' as a literal character, rewind to start of error:
1216          --m_position;
1217          while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_brace) --m_position;
1218          return parse_literal();
1219       }
1220       // get the value if any:
1221       v = this->m_traits.toi(m_position, m_end, 10);
1222       max = ((v >= 0) && (v < umax())) ? (std::size_t)v : (std::numeric_limits<std::size_t>::max)();
1223    }
1224    else
1225    {
1226       // no comma, max = min:
1227       max = min;
1228    }
1229    // skip whitespace:
1230    while((m_position != m_end) && this->m_traits.isctype(*m_position, this->m_mask_space))
1231       ++m_position;
1232    // OK now check trailing }:
1233    if(this->m_position == this->m_end)
1234    {
1235       if(this->flags() & (regbase::main_option_type | regbase::no_perl_ex))
1236       {
1237          fail(regex_constants::error_brace, this->m_position - this->m_base, incomplete_message);
1238          return false;
1239       }
1240       // Treat the opening '{' as a literal character, rewind to start of error:
1241       --m_position;
1242       while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_brace) --m_position;
1243       return parse_literal();
1244    }
1245    if(isbasic)
1246    {
1247       if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_escape)
1248       {
1249          ++m_position;
1250          if(this->m_position == this->m_end)
1251          {
1252             fail(regex_constants::error_brace, this->m_position - this->m_base, incomplete_message);
1253             return false;
1254          }
1255       }
1256       else
1257       {
1258          fail(regex_constants::error_brace, this->m_position - this->m_base, incomplete_message);
1259          return false;
1260       }
1261    }
1262    if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_close_brace)
1263       ++m_position;
1264    else
1265    {
1266       // Treat the opening '{' as a literal character, rewind to start of error:
1267       --m_position;
1268       while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_brace) --m_position;
1269       return parse_literal();
1270    }
1271    //
1272    // finally go and add the repeat, unless error:
1273    //
1274    if(min > max)
1275    {
1276       // Backtrack to error location:
1277       m_position -= 2;
1278       while(this->m_traits.isctype(*m_position, this->m_word_mask)) --m_position;
1279          ++m_position;
1280       fail(regex_constants::error_badbrace, m_position - m_base);
1281       return false;
1282    }
1283    return parse_repeat(min, max);
1284 }
1285 
1286 template <class charT, class traits>
parse_alt()1287 bool basic_regex_parser<charT, traits>::parse_alt()
1288 {
1289    //
1290    // error check: if there have been no previous states,
1291    // or if the last state was a '(' then error:
1292    //
1293    if(
1294       ((this->m_last_state == 0) || (this->m_last_state->type == syntax_element_startmark))
1295       &&
1296       !(
1297          ((this->flags() & regbase::main_option_type) == regbase::perl_syntax_group)
1298            &&
1299          ((this->flags() & regbase::no_empty_expressions) == 0)
1300         )
1301       )
1302    {
1303       fail(regex_constants::error_empty, this->m_position - this->m_base, "A regular expression cannot start with the alternation operator |.");
1304       return false;
1305    }
1306    //
1307    // Reset mark count if required:
1308    //
1309    if(m_max_mark < m_mark_count)
1310       m_max_mark = m_mark_count;
1311    if(m_mark_reset >= 0)
1312       m_mark_count = m_mark_reset;
1313 
1314    ++m_position;
1315    //
1316    // we need to append a trailing jump:
1317    //
1318    re_syntax_base* pj = this->append_state(BOOST_REGEX_DETAIL_NS::syntax_element_jump, sizeof(re_jump));
1319    std::ptrdiff_t jump_offset = this->getoffset(pj);
1320    //
1321    // now insert the alternative:
1322    //
1323    re_alt* palt = static_cast<re_alt*>(this->insert_state(this->m_alt_insert_point, syntax_element_alt, re_alt_size));
1324    jump_offset += re_alt_size;
1325    this->m_pdata->m_data.align();
1326    palt->alt.i = this->m_pdata->m_data.size() - this->getoffset(palt);
1327    //
1328    // update m_alt_insert_point so that the next alternate gets
1329    // inserted at the start of the second of the two we've just created:
1330    //
1331    this->m_alt_insert_point = this->m_pdata->m_data.size();
1332    //
1333    // the start of this alternative must have a case changes state
1334    // if the current block has messed around with case changes:
1335    //
1336    if(m_has_case_change)
1337    {
1338       static_cast<re_case*>(
1339          this->append_state(syntax_element_toggle_case, sizeof(re_case))
1340          )->icase = this->m_icase;
1341    }
1342    //
1343    // push the alternative onto our stack, a recursive
1344    // implementation here is easier to understand (and faster
1345    // as it happens), but causes all kinds of stack overflow problems
1346    // on programs with small stacks (COM+).
1347    //
1348    m_alt_jumps.push_back(jump_offset);
1349    return true;
1350 }
1351 
1352 template <class charT, class traits>
parse_set()1353 bool basic_regex_parser<charT, traits>::parse_set()
1354 {
1355    static const char incomplete_message[] = "Character set declaration starting with [ terminated prematurely - either no ] was found or the set had no content.";
1356    ++m_position;
1357    if(m_position == m_end)
1358    {
1359       fail(regex_constants::error_brack, m_position - m_base, incomplete_message);
1360       return false;
1361    }
1362    basic_char_set<charT, traits> char_set;
1363 
1364    const charT* base = m_position;  // where the '[' was
1365    const charT* item_base = m_position;  // where the '[' or '^' was
1366 
1367    while(m_position != m_end)
1368    {
1369       switch(this->m_traits.syntax_type(*m_position))
1370       {
1371       case regex_constants::syntax_caret:
1372          if(m_position == base)
1373          {
1374             char_set.negate();
1375             ++m_position;
1376             item_base = m_position;
1377          }
1378          else
1379             parse_set_literal(char_set);
1380          break;
1381       case regex_constants::syntax_close_set:
1382          if(m_position == item_base)
1383          {
1384             parse_set_literal(char_set);
1385             break;
1386          }
1387          else
1388          {
1389             ++m_position;
1390             if(0 == this->append_set(char_set))
1391             {
1392                fail(regex_constants::error_ctype, m_position - m_base);
1393                return false;
1394             }
1395          }
1396          return true;
1397       case regex_constants::syntax_open_set:
1398          if(parse_inner_set(char_set))
1399             break;
1400          return true;
1401       case regex_constants::syntax_escape:
1402          {
1403             //
1404             // look ahead and see if this is a character class shortcut
1405             // \d \w \s etc...
1406             //
1407             ++m_position;
1408             if(this->m_traits.escape_syntax_type(*m_position)
1409                == regex_constants::escape_type_class)
1410             {
1411                char_class_type m = this->m_traits.lookup_classname(m_position, m_position+1);
1412                if(m != 0)
1413                {
1414                   char_set.add_class(m);
1415                   ++m_position;
1416                   break;
1417                }
1418             }
1419             else if(this->m_traits.escape_syntax_type(*m_position)
1420                == regex_constants::escape_type_not_class)
1421             {
1422                // negated character class:
1423                char_class_type m = this->m_traits.lookup_classname(m_position, m_position+1);
1424                if(m != 0)
1425                {
1426                   char_set.add_negated_class(m);
1427                   ++m_position;
1428                   break;
1429                }
1430             }
1431             // not a character class, just a regular escape:
1432             --m_position;
1433             parse_set_literal(char_set);
1434             break;
1435          }
1436       default:
1437          parse_set_literal(char_set);
1438          break;
1439       }
1440    }
1441    return m_position != m_end;
1442 }
1443 
1444 template <class charT, class traits>
parse_inner_set(basic_char_set<charT,traits> & char_set)1445 bool basic_regex_parser<charT, traits>::parse_inner_set(basic_char_set<charT, traits>& char_set)
1446 {
1447    static const char incomplete_message[] = "Character class declaration starting with [ terminated prematurely - either no ] was found or the set had no content.";
1448    //
1449    // we have either a character class [:name:]
1450    // a collating element [.name.]
1451    // or an equivalence class [=name=]
1452    //
1453    if(m_end == ++m_position)
1454    {
1455       fail(regex_constants::error_brack, m_position - m_base, incomplete_message);
1456       return false;
1457    }
1458    switch(this->m_traits.syntax_type(*m_position))
1459    {
1460    case regex_constants::syntax_dot:
1461       //
1462       // a collating element is treated as a literal:
1463       //
1464       --m_position;
1465       parse_set_literal(char_set);
1466       return true;
1467    case regex_constants::syntax_colon:
1468       {
1469       // check that character classes are actually enabled:
1470       if((this->flags() & (regbase::main_option_type | regbase::no_char_classes))
1471          == (regbase::basic_syntax_group  | regbase::no_char_classes))
1472       {
1473          --m_position;
1474          parse_set_literal(char_set);
1475          return true;
1476       }
1477       // skip the ':'
1478       if(m_end == ++m_position)
1479       {
1480          fail(regex_constants::error_brack, m_position - m_base, incomplete_message);
1481          return false;
1482       }
1483       const charT* name_first = m_position;
1484       // skip at least one character, then find the matching ':]'
1485       if(m_end == ++m_position)
1486       {
1487          fail(regex_constants::error_brack, m_position - m_base, incomplete_message);
1488          return false;
1489       }
1490       while((m_position != m_end)
1491          && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_colon))
1492          ++m_position;
1493       const charT* name_last = m_position;
1494       if(m_end == m_position)
1495       {
1496          fail(regex_constants::error_brack, m_position - m_base, incomplete_message);
1497          return false;
1498       }
1499       if((m_end == ++m_position)
1500          || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_set))
1501       {
1502          fail(regex_constants::error_brack, m_position - m_base, incomplete_message);
1503          return false;
1504       }
1505       //
1506       // check for negated class:
1507       //
1508       bool negated = false;
1509       if(this->m_traits.syntax_type(*name_first) == regex_constants::syntax_caret)
1510       {
1511          ++name_first;
1512          negated = true;
1513       }
1514       typedef typename traits::char_class_type m_type;
1515       m_type m = this->m_traits.lookup_classname(name_first, name_last);
1516       if(m == 0)
1517       {
1518          if(char_set.empty() && (name_last - name_first == 1))
1519          {
1520             // maybe a special case:
1521             ++m_position;
1522             if( (m_position != m_end)
1523                && (this->m_traits.syntax_type(*m_position)
1524                   == regex_constants::syntax_close_set))
1525             {
1526                if(this->m_traits.escape_syntax_type(*name_first)
1527                   == regex_constants::escape_type_left_word)
1528                {
1529                   ++m_position;
1530                   this->append_state(syntax_element_word_start);
1531                   return false;
1532                }
1533                if(this->m_traits.escape_syntax_type(*name_first)
1534                   == regex_constants::escape_type_right_word)
1535                {
1536                   ++m_position;
1537                   this->append_state(syntax_element_word_end);
1538                   return false;
1539                }
1540             }
1541          }
1542          fail(regex_constants::error_ctype, name_first - m_base);
1543          return false;
1544       }
1545       if(negated == false)
1546          char_set.add_class(m);
1547       else
1548          char_set.add_negated_class(m);
1549       ++m_position;
1550       break;
1551    }
1552    case regex_constants::syntax_equal:
1553       {
1554       // skip the '='
1555       if(m_end == ++m_position)
1556       {
1557          fail(regex_constants::error_brack, m_position - m_base, incomplete_message);
1558          return false;
1559       }
1560       const charT* name_first = m_position;
1561       // skip at least one character, then find the matching '=]'
1562       if(m_end == ++m_position)
1563       {
1564          fail(regex_constants::error_brack, m_position - m_base, incomplete_message);
1565          return false;
1566       }
1567       while((m_position != m_end)
1568          && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_equal))
1569          ++m_position;
1570       const charT* name_last = m_position;
1571       if(m_end == m_position)
1572       {
1573          fail(regex_constants::error_brack, m_position - m_base, incomplete_message);
1574          return false;
1575       }
1576       if((m_end == ++m_position)
1577          || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_set))
1578       {
1579          fail(regex_constants::error_brack, m_position - m_base, incomplete_message);
1580          return false;
1581       }
1582       string_type m = this->m_traits.lookup_collatename(name_first, name_last);
1583       if((0 == m.size()) || (m.size() > 2))
1584       {
1585          fail(regex_constants::error_collate, name_first - m_base);
1586          return false;
1587       }
1588       digraph<charT> d;
1589       d.first = m[0];
1590       if(m.size() > 1)
1591          d.second = m[1];
1592       else
1593          d.second = 0;
1594       char_set.add_equivalent(d);
1595       ++m_position;
1596       break;
1597    }
1598    default:
1599       --m_position;
1600       parse_set_literal(char_set);
1601       break;
1602    }
1603    return true;
1604 }
1605 
1606 template <class charT, class traits>
parse_set_literal(basic_char_set<charT,traits> & char_set)1607 void basic_regex_parser<charT, traits>::parse_set_literal(basic_char_set<charT, traits>& char_set)
1608 {
1609    digraph<charT> start_range(get_next_set_literal(char_set));
1610    if(m_end == m_position)
1611    {
1612       fail(regex_constants::error_brack, m_position - m_base);
1613       return;
1614    }
1615    if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_dash)
1616    {
1617       // we have a range:
1618       if(m_end == ++m_position)
1619       {
1620          fail(regex_constants::error_brack, m_position - m_base);
1621          return;
1622       }
1623       if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_set)
1624       {
1625          digraph<charT> end_range = get_next_set_literal(char_set);
1626          char_set.add_range(start_range, end_range);
1627          if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_dash)
1628          {
1629             if(m_end == ++m_position)
1630             {
1631                fail(regex_constants::error_brack, m_position - m_base);
1632                return;
1633             }
1634             if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_close_set)
1635             {
1636                // trailing - :
1637                --m_position;
1638                return;
1639             }
1640             fail(regex_constants::error_range, m_position - m_base);
1641             return;
1642          }
1643          return;
1644       }
1645       --m_position;
1646    }
1647    char_set.add_single(start_range);
1648 }
1649 
1650 template <class charT, class traits>
get_next_set_literal(basic_char_set<charT,traits> & char_set)1651 digraph<charT> basic_regex_parser<charT, traits>::get_next_set_literal(basic_char_set<charT, traits>& char_set)
1652 {
1653    digraph<charT> result;
1654    switch(this->m_traits.syntax_type(*m_position))
1655    {
1656    case regex_constants::syntax_dash:
1657       if(!char_set.empty())
1658       {
1659          // see if we are at the end of the set:
1660          if((++m_position == m_end) || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_set))
1661          {
1662             fail(regex_constants::error_range, m_position - m_base);
1663             return result;
1664          }
1665          --m_position;
1666       }
1667       result.first = *m_position++;
1668       return result;
1669    case regex_constants::syntax_escape:
1670       // check to see if escapes are supported first:
1671       if(this->flags() & regex_constants::no_escape_in_lists)
1672       {
1673          result = *m_position++;
1674          break;
1675       }
1676       ++m_position;
1677       result = unescape_character();
1678       break;
1679    case regex_constants::syntax_open_set:
1680    {
1681       if(m_end == ++m_position)
1682       {
1683          fail(regex_constants::error_collate, m_position - m_base);
1684          return result;
1685       }
1686       if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_dot)
1687       {
1688          --m_position;
1689          result.first = *m_position;
1690          ++m_position;
1691          return result;
1692       }
1693       if(m_end == ++m_position)
1694       {
1695          fail(regex_constants::error_collate, m_position - m_base);
1696          return result;
1697       }
1698       const charT* name_first = m_position;
1699       // skip at least one character, then find the matching ':]'
1700       if(m_end == ++m_position)
1701       {
1702          fail(regex_constants::error_collate, name_first - m_base);
1703          return result;
1704       }
1705       while((m_position != m_end)
1706          && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_dot))
1707          ++m_position;
1708       const charT* name_last = m_position;
1709       if(m_end == m_position)
1710       {
1711          fail(regex_constants::error_collate, name_first - m_base);
1712          return result;
1713       }
1714       if((m_end == ++m_position)
1715          || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_set))
1716       {
1717          fail(regex_constants::error_collate, name_first - m_base);
1718          return result;
1719       }
1720       ++m_position;
1721       string_type s = this->m_traits.lookup_collatename(name_first, name_last);
1722       if(s.empty() || (s.size() > 2))
1723       {
1724          fail(regex_constants::error_collate, name_first - m_base);
1725          return result;
1726       }
1727       result.first = s[0];
1728       if(s.size() > 1)
1729          result.second = s[1];
1730       else
1731          result.second = 0;
1732       return result;
1733    }
1734    default:
1735       result = *m_position++;
1736    }
1737    return result;
1738 }
1739 
1740 //
1741 // does a value fit in the specified charT type?
1742 //
1743 template <class charT>
valid_value(charT,boost::intmax_t v,const mpl::true_ &)1744 bool valid_value(charT, boost::intmax_t v, const mpl::true_&)
1745 {
1746    return (v >> (sizeof(charT) * CHAR_BIT)) == 0;
1747 }
1748 template <class charT>
valid_value(charT,boost::intmax_t,const mpl::false_ &)1749 bool valid_value(charT, boost::intmax_t, const mpl::false_&)
1750 {
1751    return true; // v will alsways fit in a charT
1752 }
1753 template <class charT>
valid_value(charT c,boost::intmax_t v)1754 bool valid_value(charT c, boost::intmax_t v)
1755 {
1756    return valid_value(c, v, mpl::bool_<(sizeof(charT) < sizeof(boost::intmax_t))>());
1757 }
1758 
1759 template <class charT, class traits>
unescape_character()1760 charT basic_regex_parser<charT, traits>::unescape_character()
1761 {
1762 #ifdef BOOST_MSVC
1763 #pragma warning(push)
1764 #pragma warning(disable:4127)
1765 #endif
1766    charT result(0);
1767    if(m_position == m_end)
1768    {
1769       fail(regex_constants::error_escape, m_position - m_base, "Escape sequence terminated prematurely.");
1770       return false;
1771    }
1772    switch(this->m_traits.escape_syntax_type(*m_position))
1773    {
1774    case regex_constants::escape_type_control_a:
1775       result = charT('\a');
1776       break;
1777    case regex_constants::escape_type_e:
1778       result = charT(27);
1779       break;
1780    case regex_constants::escape_type_control_f:
1781       result = charT('\f');
1782       break;
1783    case regex_constants::escape_type_control_n:
1784       result = charT('\n');
1785       break;
1786    case regex_constants::escape_type_control_r:
1787       result = charT('\r');
1788       break;
1789    case regex_constants::escape_type_control_t:
1790       result = charT('\t');
1791       break;
1792    case regex_constants::escape_type_control_v:
1793       result = charT('\v');
1794       break;
1795    case regex_constants::escape_type_word_assert:
1796       result = charT('\b');
1797       break;
1798    case regex_constants::escape_type_ascii_control:
1799       ++m_position;
1800       if(m_position == m_end)
1801       {
1802          // Rewind to start of escape:
1803          --m_position;
1804          while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position;
1805          fail(regex_constants::error_escape, m_position - m_base, "ASCII escape sequence terminated prematurely.");
1806          return result;
1807       }
1808       result = static_cast<charT>(*m_position % 32);
1809       break;
1810    case regex_constants::escape_type_hex:
1811       ++m_position;
1812       if(m_position == m_end)
1813       {
1814          // Rewind to start of escape:
1815          --m_position;
1816          while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position;
1817          fail(regex_constants::error_escape, m_position - m_base, "Hexadecimal escape sequence terminated prematurely.");
1818          return result;
1819       }
1820       // maybe have \x{ddd}
1821       if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_open_brace)
1822       {
1823          ++m_position;
1824          if(m_position == m_end)
1825          {
1826             // Rewind to start of escape:
1827             --m_position;
1828             while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position;
1829             fail(regex_constants::error_escape, m_position - m_base, "Missing } in hexadecimal escape sequence.");
1830             return result;
1831          }
1832          boost::intmax_t i = this->m_traits.toi(m_position, m_end, 16);
1833          if((m_position == m_end)
1834             || (i < 0)
1835             || ((std::numeric_limits<charT>::is_specialized) && (i > (boost::intmax_t)(std::numeric_limits<charT>::max)()))
1836             || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_brace))
1837          {
1838             // Rewind to start of escape:
1839             --m_position;
1840             while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position;
1841             fail(regex_constants::error_badbrace, m_position - m_base, "Hexadecimal escape sequence was invalid.");
1842             return result;
1843          }
1844          ++m_position;
1845          result = charT(i);
1846       }
1847       else
1848       {
1849          std::ptrdiff_t len = (std::min)(static_cast<std::ptrdiff_t>(2), static_cast<std::ptrdiff_t>(m_end - m_position));
1850          boost::intmax_t i = this->m_traits.toi(m_position, m_position + len, 16);
1851          if((i < 0)
1852             || !valid_value(charT(0), i))
1853          {
1854             // Rewind to start of escape:
1855             --m_position;
1856             while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position;
1857             fail(regex_constants::error_escape, m_position - m_base, "Escape sequence did not encode a valid character.");
1858             return result;
1859          }
1860          result = charT(i);
1861       }
1862       return result;
1863    case regex_constants::syntax_digit:
1864       {
1865       // an octal escape sequence, the first character must be a zero
1866       // followed by up to 3 octal digits:
1867       std::ptrdiff_t len = (std::min)(::boost::BOOST_REGEX_DETAIL_NS::distance(m_position, m_end), static_cast<std::ptrdiff_t>(4));
1868       const charT* bp = m_position;
1869       boost::intmax_t val = this->m_traits.toi(bp, bp + 1, 8);
1870       if(val != 0)
1871       {
1872          // Rewind to start of escape:
1873          --m_position;
1874          while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position;
1875          // Oops not an octal escape after all:
1876          fail(regex_constants::error_escape, m_position - m_base, "Invalid octal escape sequence.");
1877          return result;
1878       }
1879       val = this->m_traits.toi(m_position, m_position + len, 8);
1880       if((val < 0) || (val > (boost::intmax_t)(std::numeric_limits<charT>::max)()))
1881       {
1882          // Rewind to start of escape:
1883          --m_position;
1884          while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position;
1885          fail(regex_constants::error_escape, m_position - m_base, "Octal escape sequence is invalid.");
1886          return result;
1887       }
1888       return static_cast<charT>(val);
1889       }
1890    case regex_constants::escape_type_named_char:
1891       {
1892          ++m_position;
1893          if(m_position == m_end)
1894          {
1895             // Rewind to start of escape:
1896             --m_position;
1897             while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position;
1898             fail(regex_constants::error_escape, m_position - m_base);
1899             return false;
1900          }
1901          // maybe have \N{name}
1902          if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_open_brace)
1903          {
1904             const charT* base = m_position;
1905             // skip forward until we find enclosing brace:
1906             while((m_position != m_end) && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_brace))
1907                ++m_position;
1908             if(m_position == m_end)
1909             {
1910                // Rewind to start of escape:
1911                --m_position;
1912                while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position;
1913                fail(regex_constants::error_escape, m_position - m_base);
1914                return false;
1915             }
1916             string_type s = this->m_traits.lookup_collatename(++base, m_position++);
1917             if(s.empty())
1918             {
1919                // Rewind to start of escape:
1920                --m_position;
1921                while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position;
1922                fail(regex_constants::error_collate, m_position - m_base);
1923                return false;
1924             }
1925             if(s.size() == 1)
1926             {
1927                return s[0];
1928             }
1929          }
1930          // fall through is a failure:
1931          // Rewind to start of escape:
1932          --m_position;
1933          while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position;
1934          fail(regex_constants::error_escape, m_position - m_base);
1935          return false;
1936       }
1937    default:
1938       result = *m_position;
1939       break;
1940    }
1941    ++m_position;
1942    return result;
1943 #ifdef BOOST_MSVC
1944 #pragma warning(pop)
1945 #endif
1946 }
1947 
1948 template <class charT, class traits>
parse_backref()1949 bool basic_regex_parser<charT, traits>::parse_backref()
1950 {
1951    BOOST_ASSERT(m_position != m_end);
1952    const charT* pc = m_position;
1953    boost::intmax_t i = this->m_traits.toi(pc, pc + 1, 10);
1954    if((i == 0) || (((this->flags() & regbase::main_option_type) == regbase::perl_syntax_group) && (this->flags() & regbase::no_bk_refs)))
1955    {
1956       // not a backref at all but an octal escape sequence:
1957       charT c = unescape_character();
1958       this->append_literal(c);
1959    }
1960    else if((i > 0) && (this->m_backrefs.test(i)))
1961    {
1962       m_position = pc;
1963       re_brace* pb = static_cast<re_brace*>(this->append_state(syntax_element_backref, sizeof(re_brace)));
1964       pb->index = i;
1965       pb->icase = this->flags() & regbase::icase;
1966    }
1967    else
1968    {
1969       // Rewind to start of escape:
1970       --m_position;
1971       while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position;
1972       fail(regex_constants::error_backref, m_position - m_base);
1973       return false;
1974    }
1975    return true;
1976 }
1977 
1978 template <class charT, class traits>
parse_QE()1979 bool basic_regex_parser<charT, traits>::parse_QE()
1980 {
1981 #ifdef BOOST_MSVC
1982 #pragma warning(push)
1983 #pragma warning(disable:4127)
1984 #endif
1985    //
1986    // parse a \Q...\E sequence:
1987    //
1988    ++m_position; // skip the Q
1989    const charT* start = m_position;
1990    const charT* end;
1991    do
1992    {
1993       while((m_position != m_end)
1994          && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape))
1995          ++m_position;
1996       if(m_position == m_end)
1997       {
1998          //  a \Q...\E sequence may terminate with the end of the expression:
1999          end = m_position;
2000          break;
2001       }
2002       if(++m_position == m_end) // skip the escape
2003       {
2004          fail(regex_constants::error_escape, m_position - m_base, "Unterminated \\Q...\\E sequence.");
2005          return false;
2006       }
2007       // check to see if it's a \E:
2008       if(this->m_traits.escape_syntax_type(*m_position) == regex_constants::escape_type_E)
2009       {
2010          ++m_position;
2011          end = m_position - 2;
2012          break;
2013       }
2014       // otherwise go round again:
2015    }while(true);
2016    //
2017    // now add all the character between the two escapes as literals:
2018    //
2019    while(start != end)
2020    {
2021       this->append_literal(*start);
2022       ++start;
2023    }
2024    return true;
2025 #ifdef BOOST_MSVC
2026 #pragma warning(pop)
2027 #endif
2028 }
2029 
2030 template <class charT, class traits>
parse_perl_extension()2031 bool basic_regex_parser<charT, traits>::parse_perl_extension()
2032 {
2033    if(++m_position == m_end)
2034    {
2035       // Rewind to start of (? sequence:
2036       --m_position;
2037       while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2038       fail(regex_constants::error_perl_extension, m_position - m_base);
2039       return false;
2040    }
2041    //
2042    // treat comments as a special case, as these
2043    // are the only ones that don't start with a leading
2044    // startmark state:
2045    //
2046    if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_hash)
2047    {
2048       while((m_position != m_end)
2049          && (this->m_traits.syntax_type(*m_position++) != regex_constants::syntax_close_mark))
2050       {}
2051       return true;
2052    }
2053    //
2054    // backup some state, and prepare the way:
2055    //
2056    int markid = 0;
2057    std::ptrdiff_t jump_offset = 0;
2058    re_brace* pb = static_cast<re_brace*>(this->append_state(syntax_element_startmark, sizeof(re_brace)));
2059    pb->icase = this->flags() & regbase::icase;
2060    std::ptrdiff_t last_paren_start = this->getoffset(pb);
2061    // back up insertion point for alternations, and set new point:
2062    std::ptrdiff_t last_alt_point = m_alt_insert_point;
2063    this->m_pdata->m_data.align();
2064    m_alt_insert_point = this->m_pdata->m_data.size();
2065    std::ptrdiff_t expected_alt_point = m_alt_insert_point;
2066    bool restore_flags = true;
2067    regex_constants::syntax_option_type old_flags = this->flags();
2068    bool old_case_change = m_has_case_change;
2069    m_has_case_change = false;
2070    charT name_delim;
2071    int mark_reset = m_mark_reset;
2072    int max_mark = m_max_mark;
2073    m_mark_reset = -1;
2074    m_max_mark = m_mark_count;
2075    boost::intmax_t v;
2076    //
2077    // select the actual extension used:
2078    //
2079    switch(this->m_traits.syntax_type(*m_position))
2080    {
2081    case regex_constants::syntax_or:
2082       m_mark_reset = m_mark_count;
2083       BOOST_FALLTHROUGH;
2084    case regex_constants::syntax_colon:
2085       //
2086       // a non-capturing mark:
2087       //
2088       pb->index = markid = 0;
2089       ++m_position;
2090       break;
2091    case regex_constants::syntax_digit:
2092       {
2093       //
2094       // a recursive subexpression:
2095       //
2096       v = this->m_traits.toi(m_position, m_end, 10);
2097       if((v < 0) || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark))
2098       {
2099          // Rewind to start of (? sequence:
2100          --m_position;
2101          while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2102          fail(regex_constants::error_perl_extension, m_position - m_base, "The recursive sub-expression refers to an invalid marking group, or is unterminated.");
2103          return false;
2104       }
2105 insert_recursion:
2106       pb->index = markid = 0;
2107       re_recurse* pr = static_cast<re_recurse*>(this->append_state(syntax_element_recurse, sizeof(re_recurse)));
2108       pr->alt.i = v;
2109       pr->state_id = 0;
2110       static_cast<re_case*>(
2111             this->append_state(syntax_element_toggle_case, sizeof(re_case))
2112             )->icase = this->flags() & regbase::icase;
2113       break;
2114       }
2115    case regex_constants::syntax_plus:
2116       //
2117       // A forward-relative recursive subexpression:
2118       //
2119       ++m_position;
2120       v = this->m_traits.toi(m_position, m_end, 10);
2121       if((v <= 0) || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark))
2122       {
2123          // Rewind to start of (? sequence:
2124          --m_position;
2125          while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2126          fail(regex_constants::error_perl_extension, m_position - m_base, "An invalid or unterminated recursive sub-expression.");
2127          return false;
2128       }
2129       if ((std::numeric_limits<boost::intmax_t>::max)() - m_mark_count < v)
2130       {
2131          fail(regex_constants::error_perl_extension, m_position - m_base, "An invalid or unterminated recursive sub-expression.");
2132          return false;
2133       }
2134       v += m_mark_count;
2135       goto insert_recursion;
2136    case regex_constants::syntax_dash:
2137       //
2138       // Possibly a backward-relative recursive subexpression:
2139       //
2140       ++m_position;
2141       v = this->m_traits.toi(m_position, m_end, 10);
2142       if(v <= 0)
2143       {
2144          --m_position;
2145          // Oops not a relative recursion at all, but a (?-imsx) group:
2146          goto option_group_jump;
2147       }
2148       v = static_cast<boost::intmax_t>(m_mark_count) + 1 - v;
2149       if(v <= 0)
2150       {
2151          // Rewind to start of (? sequence:
2152          --m_position;
2153          while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2154          fail(regex_constants::error_perl_extension, m_position - m_base, "An invalid or unterminated recursive sub-expression.");
2155          return false;
2156       }
2157       goto insert_recursion;
2158    case regex_constants::syntax_equal:
2159       pb->index = markid = -1;
2160       ++m_position;
2161       jump_offset = this->getoffset(this->append_state(syntax_element_jump, sizeof(re_jump)));
2162       this->m_pdata->m_data.align();
2163       m_alt_insert_point = this->m_pdata->m_data.size();
2164       break;
2165    case regex_constants::syntax_not:
2166       pb->index = markid = -2;
2167       ++m_position;
2168       jump_offset = this->getoffset(this->append_state(syntax_element_jump, sizeof(re_jump)));
2169       this->m_pdata->m_data.align();
2170       m_alt_insert_point = this->m_pdata->m_data.size();
2171       break;
2172    case regex_constants::escape_type_left_word:
2173       {
2174          // a lookbehind assertion:
2175          if(++m_position == m_end)
2176          {
2177             // Rewind to start of (? sequence:
2178             --m_position;
2179             while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2180             fail(regex_constants::error_perl_extension, m_position - m_base);
2181             return false;
2182          }
2183          regex_constants::syntax_type t = this->m_traits.syntax_type(*m_position);
2184          if(t == regex_constants::syntax_not)
2185             pb->index = markid = -2;
2186          else if(t == regex_constants::syntax_equal)
2187             pb->index = markid = -1;
2188          else
2189          {
2190             // Probably a named capture which also starts (?< :
2191             name_delim = '>';
2192             --m_position;
2193             goto named_capture_jump;
2194          }
2195          ++m_position;
2196          jump_offset = this->getoffset(this->append_state(syntax_element_jump, sizeof(re_jump)));
2197          this->append_state(syntax_element_backstep, sizeof(re_brace));
2198          this->m_pdata->m_data.align();
2199          m_alt_insert_point = this->m_pdata->m_data.size();
2200          break;
2201       }
2202    case regex_constants::escape_type_right_word:
2203       //
2204       // an independent sub-expression:
2205       //
2206       pb->index = markid = -3;
2207       ++m_position;
2208       jump_offset = this->getoffset(this->append_state(syntax_element_jump, sizeof(re_jump)));
2209       this->m_pdata->m_data.align();
2210       m_alt_insert_point = this->m_pdata->m_data.size();
2211       break;
2212    case regex_constants::syntax_open_mark:
2213       {
2214       // a conditional expression:
2215       pb->index = markid = -4;
2216       if(++m_position == m_end)
2217       {
2218          // Rewind to start of (? sequence:
2219          --m_position;
2220          while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2221          fail(regex_constants::error_perl_extension, m_position - m_base);
2222          return false;
2223       }
2224       v = this->m_traits.toi(m_position, m_end, 10);
2225       if(m_position == m_end)
2226       {
2227          // Rewind to start of (? sequence:
2228          --m_position;
2229          while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2230          fail(regex_constants::error_perl_extension, m_position - m_base);
2231          return false;
2232       }
2233       if(*m_position == charT('R'))
2234       {
2235          if(++m_position == m_end)
2236          {
2237             // Rewind to start of (? sequence:
2238             --m_position;
2239             while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2240             fail(regex_constants::error_perl_extension, m_position - m_base);
2241             return false;
2242          }
2243          if(*m_position == charT('&'))
2244          {
2245             const charT* base = ++m_position;
2246             while((m_position != m_end) && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark))
2247                ++m_position;
2248             if(m_position == m_end)
2249             {
2250                // Rewind to start of (? sequence:
2251                --m_position;
2252                while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2253                fail(regex_constants::error_perl_extension, m_position - m_base);
2254                return false;
2255             }
2256             v = -static_cast<int>(hash_value_from_capture_name(base, m_position));
2257          }
2258          else
2259          {
2260             v = -this->m_traits.toi(m_position, m_end, 10);
2261          }
2262          re_brace* br = static_cast<re_brace*>(this->append_state(syntax_element_assert_backref, sizeof(re_brace)));
2263          br->index = v < 0 ? (v - 1) : 0;
2264          if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark)
2265          {
2266             // Rewind to start of (? sequence:
2267             --m_position;
2268             while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2269             fail(regex_constants::error_perl_extension, m_position - m_base);
2270             return false;
2271          }
2272          if(++m_position == m_end)
2273          {
2274             // Rewind to start of (? sequence:
2275             --m_position;
2276             while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2277             fail(regex_constants::error_perl_extension, m_position - m_base);
2278             return false;
2279          }
2280       }
2281       else if((*m_position == charT('\'')) || (*m_position == charT('<')))
2282       {
2283          const charT* base = ++m_position;
2284          while((m_position != m_end) && (*m_position != charT('>')) && (*m_position != charT('\'')))
2285             ++m_position;
2286          if(m_position == m_end)
2287          {
2288             // Rewind to start of (? sequence:
2289             --m_position;
2290             while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2291             fail(regex_constants::error_perl_extension, m_position - m_base);
2292             return false;
2293          }
2294          v = static_cast<int>(hash_value_from_capture_name(base, m_position));
2295          re_brace* br = static_cast<re_brace*>(this->append_state(syntax_element_assert_backref, sizeof(re_brace)));
2296          br->index = v;
2297          if(((*m_position != charT('>')) && (*m_position != charT('\''))) || (++m_position == m_end))
2298          {
2299             // Rewind to start of (? sequence:
2300             --m_position;
2301             while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2302             fail(regex_constants::error_perl_extension, m_position - m_base, "Unterminated named capture.");
2303             return false;
2304          }
2305          if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark)
2306          {
2307             // Rewind to start of (? sequence:
2308             --m_position;
2309             while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2310             fail(regex_constants::error_perl_extension, m_position - m_base);
2311             return false;
2312          }
2313          if(++m_position == m_end)
2314          {
2315             // Rewind to start of (? sequence:
2316             --m_position;
2317             while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2318             fail(regex_constants::error_perl_extension, m_position - m_base);
2319             return false;
2320          }
2321       }
2322       else if(*m_position == charT('D'))
2323       {
2324          const char* def = "DEFINE";
2325          while(*def && (m_position != m_end) && (*m_position == charT(*def)))
2326             ++m_position, ++def;
2327          if((m_position == m_end) || *def)
2328          {
2329             // Rewind to start of (? sequence:
2330             --m_position;
2331             while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2332             fail(regex_constants::error_perl_extension, m_position - m_base);
2333             return false;
2334          }
2335          re_brace* br = static_cast<re_brace*>(this->append_state(syntax_element_assert_backref, sizeof(re_brace)));
2336          br->index = 9999; // special magic value!
2337          if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark)
2338          {
2339             // Rewind to start of (? sequence:
2340             --m_position;
2341             while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2342             fail(regex_constants::error_perl_extension, m_position - m_base);
2343             return false;
2344          }
2345          if(++m_position == m_end)
2346          {
2347             // Rewind to start of (? sequence:
2348             --m_position;
2349             while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2350             fail(regex_constants::error_perl_extension, m_position - m_base);
2351             return false;
2352          }
2353       }
2354       else if(v > 0)
2355       {
2356          re_brace* br = static_cast<re_brace*>(this->append_state(syntax_element_assert_backref, sizeof(re_brace)));
2357          br->index = v;
2358          if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark)
2359          {
2360             // Rewind to start of (? sequence:
2361             --m_position;
2362             while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2363             fail(regex_constants::error_perl_extension, m_position - m_base);
2364             return false;
2365          }
2366          if(++m_position == m_end)
2367          {
2368             // Rewind to start of (? sequence:
2369             --m_position;
2370             while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2371             fail(regex_constants::error_perl_extension, m_position - m_base);
2372             return false;
2373          }
2374       }
2375       else
2376       {
2377          // verify that we have a lookahead or lookbehind assert:
2378          if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_question)
2379          {
2380             // Rewind to start of (? sequence:
2381             --m_position;
2382             while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2383             fail(regex_constants::error_perl_extension, m_position - m_base);
2384             return false;
2385          }
2386          if(++m_position == m_end)
2387          {
2388             // Rewind to start of (? sequence:
2389             --m_position;
2390             while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2391             fail(regex_constants::error_perl_extension, m_position - m_base);
2392             return false;
2393          }
2394          if(this->m_traits.syntax_type(*m_position) == regex_constants::escape_type_left_word)
2395          {
2396             if(++m_position == m_end)
2397             {
2398                // Rewind to start of (? sequence:
2399                --m_position;
2400                while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2401                fail(regex_constants::error_perl_extension, m_position - m_base);
2402                return false;
2403             }
2404             if((this->m_traits.syntax_type(*m_position) != regex_constants::syntax_equal)
2405                && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_not))
2406             {
2407                // Rewind to start of (? sequence:
2408                --m_position;
2409                while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2410                fail(regex_constants::error_perl_extension, m_position - m_base);
2411                return false;
2412             }
2413             m_position -= 3;
2414          }
2415          else
2416          {
2417             if((this->m_traits.syntax_type(*m_position) != regex_constants::syntax_equal)
2418                && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_not))
2419             {
2420                // Rewind to start of (? sequence:
2421                --m_position;
2422                while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2423                fail(regex_constants::error_perl_extension, m_position - m_base);
2424                return false;
2425             }
2426             m_position -= 2;
2427          }
2428       }
2429       break;
2430       }
2431    case regex_constants::syntax_close_mark:
2432       // Rewind to start of (? sequence:
2433       --m_position;
2434       while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2435       fail(regex_constants::error_perl_extension, m_position - m_base);
2436       return false;
2437    case regex_constants::escape_type_end_buffer:
2438       {
2439       name_delim = *m_position;
2440 named_capture_jump:
2441       markid = 0;
2442       if(0 == (this->flags() & regbase::nosubs))
2443       {
2444          markid = ++m_mark_count;
2445    #ifndef BOOST_NO_STD_DISTANCE
2446          if(this->flags() & regbase::save_subexpression_location)
2447             this->m_pdata->m_subs.push_back(std::pair<std::size_t, std::size_t>(std::distance(m_base, m_position) - 2, 0));
2448    #else
2449          if(this->flags() & regbase::save_subexpression_location)
2450             this->m_pdata->m_subs.push_back(std::pair<std::size_t, std::size_t>((m_position - m_base) - 2, 0));
2451    #endif
2452       }
2453       pb->index = markid;
2454       const charT* base = ++m_position;
2455       if(m_position == m_end)
2456       {
2457          // Rewind to start of (? sequence:
2458          --m_position;
2459          while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2460          fail(regex_constants::error_perl_extension, m_position - m_base);
2461          return false;
2462       }
2463       while((m_position != m_end) && (*m_position != name_delim))
2464          ++m_position;
2465       if(m_position == m_end)
2466       {
2467          // Rewind to start of (? sequence:
2468          --m_position;
2469          while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2470          fail(regex_constants::error_perl_extension, m_position - m_base);
2471          return false;
2472       }
2473       this->m_pdata->set_name(base, m_position, markid);
2474       ++m_position;
2475       break;
2476       }
2477    default:
2478       if(*m_position == charT('R'))
2479       {
2480          ++m_position;
2481          v = 0;
2482          if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark)
2483          {
2484             // Rewind to start of (? sequence:
2485             --m_position;
2486             while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2487             fail(regex_constants::error_perl_extension, m_position - m_base);
2488             return false;
2489          }
2490          goto insert_recursion;
2491       }
2492       if(*m_position == charT('&'))
2493       {
2494          ++m_position;
2495          const charT* base = m_position;
2496          while((m_position != m_end) && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark))
2497             ++m_position;
2498          if(m_position == m_end)
2499          {
2500             // Rewind to start of (? sequence:
2501             --m_position;
2502             while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2503             fail(regex_constants::error_perl_extension, m_position - m_base);
2504             return false;
2505          }
2506          v = static_cast<int>(hash_value_from_capture_name(base, m_position));
2507          goto insert_recursion;
2508       }
2509       if(*m_position == charT('P'))
2510       {
2511          ++m_position;
2512          if(m_position == m_end)
2513          {
2514             // Rewind to start of (? sequence:
2515             --m_position;
2516             while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2517             fail(regex_constants::error_perl_extension, m_position - m_base);
2518             return false;
2519          }
2520          if(*m_position == charT('>'))
2521          {
2522             ++m_position;
2523             const charT* base = m_position;
2524             while((m_position != m_end) && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark))
2525                ++m_position;
2526             if(m_position == m_end)
2527             {
2528                // Rewind to start of (? sequence:
2529                --m_position;
2530                while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2531                fail(regex_constants::error_perl_extension, m_position - m_base);
2532                return false;
2533             }
2534             v = static_cast<int>(hash_value_from_capture_name(base, m_position));
2535             goto insert_recursion;
2536          }
2537       }
2538       //
2539       // lets assume that we have a (?imsx) group and try and parse it:
2540       //
2541 option_group_jump:
2542       regex_constants::syntax_option_type opts = parse_options();
2543       if(m_position == m_end)
2544       {
2545          // Rewind to start of (? sequence:
2546          --m_position;
2547          while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2548          fail(regex_constants::error_perl_extension, m_position - m_base);
2549          return false;
2550       }
2551       // make a note of whether we have a case change:
2552       m_has_case_change = ((opts & regbase::icase) != (this->flags() & regbase::icase));
2553       pb->index = markid = 0;
2554       if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_close_mark)
2555       {
2556          // update flags and carry on as normal:
2557          this->flags(opts);
2558          restore_flags = false;
2559          old_case_change |= m_has_case_change; // defer end of scope by one ')'
2560       }
2561       else if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_colon)
2562       {
2563          // update flags and carry on until the matching ')' is found:
2564          this->flags(opts);
2565          ++m_position;
2566       }
2567       else
2568       {
2569          // Rewind to start of (? sequence:
2570          --m_position;
2571          while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2572          fail(regex_constants::error_perl_extension, m_position - m_base);
2573          return false;
2574       }
2575 
2576       // finally append a case change state if we need it:
2577       if(m_has_case_change)
2578       {
2579          static_cast<re_case*>(
2580             this->append_state(syntax_element_toggle_case, sizeof(re_case))
2581             )->icase = opts & regbase::icase;
2582       }
2583 
2584    }
2585    //
2586    // now recursively add more states, this will terminate when we get to a
2587    // matching ')' :
2588    //
2589    parse_all();
2590    //
2591    // Unwind alternatives:
2592    //
2593    if(0 == unwind_alts(last_paren_start))
2594    {
2595       // Rewind to start of (? sequence:
2596       --m_position;
2597       while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2598       fail(regex_constants::error_perl_extension, m_position - m_base, "Invalid alternation operators within (?...) block.");
2599       return false;
2600    }
2601    //
2602    // we either have a ')' or we have run out of characters prematurely:
2603    //
2604    if(m_position == m_end)
2605    {
2606       // Rewind to start of (? sequence:
2607       --m_position;
2608       while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2609       this->fail(regex_constants::error_paren, ::boost::BOOST_REGEX_DETAIL_NS::distance(m_base, m_end));
2610       return false;
2611    }
2612    BOOST_ASSERT(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_close_mark);
2613    ++m_position;
2614    //
2615    // restore the flags:
2616    //
2617    if(restore_flags)
2618    {
2619       // append a case change state if we need it:
2620       if(m_has_case_change)
2621       {
2622          static_cast<re_case*>(
2623             this->append_state(syntax_element_toggle_case, sizeof(re_case))
2624             )->icase = old_flags & regbase::icase;
2625       }
2626       this->flags(old_flags);
2627    }
2628    //
2629    // set up the jump pointer if we have one:
2630    //
2631    if(jump_offset)
2632    {
2633       this->m_pdata->m_data.align();
2634       re_jump* jmp = static_cast<re_jump*>(this->getaddress(jump_offset));
2635       jmp->alt.i = this->m_pdata->m_data.size() - this->getoffset(jmp);
2636       if((this->m_last_state == jmp) && (markid != -2))
2637       {
2638          // Oops... we didn't have anything inside the assertion.
2639          // Note we don't get here for negated forward lookahead as (?!)
2640          // does have some uses.
2641          // Rewind to start of (? sequence:
2642          --m_position;
2643          while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2644          fail(regex_constants::error_perl_extension, m_position - m_base, "Invalid or empty zero width assertion.");
2645          return false;
2646       }
2647    }
2648    //
2649    // verify that if this is conditional expression, that we do have
2650    // an alternative, if not add one:
2651    //
2652    if(markid == -4)
2653    {
2654       re_syntax_base* b = this->getaddress(expected_alt_point);
2655       // Make sure we have exactly one alternative following this state:
2656       if(b->type != syntax_element_alt)
2657       {
2658          re_alt* alt = static_cast<re_alt*>(this->insert_state(expected_alt_point, syntax_element_alt, sizeof(re_alt)));
2659          alt->alt.i = this->m_pdata->m_data.size() - this->getoffset(alt);
2660       }
2661       else if(((std::ptrdiff_t)this->m_pdata->m_data.size() > (static_cast<re_alt*>(b)->alt.i + this->getoffset(b))) && (static_cast<re_alt*>(b)->alt.i > 0) && this->getaddress(static_cast<re_alt*>(b)->alt.i, b)->type == syntax_element_alt)
2662       {
2663          // Can't have seen more than one alternative:
2664          // Rewind to start of (? sequence:
2665          --m_position;
2666          while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2667          fail(regex_constants::error_bad_pattern, m_position - m_base, "More than one alternation operator | was encountered inside a conditional expression.");
2668          return false;
2669       }
2670       else
2671       {
2672          // We must *not* have seen an alternative inside a (DEFINE) block:
2673          b = this->getaddress(b->next.i, b);
2674          if((b->type == syntax_element_assert_backref) && (static_cast<re_brace*>(b)->index == 9999))
2675          {
2676             // Rewind to start of (? sequence:
2677             --m_position;
2678             while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2679             fail(regex_constants::error_bad_pattern, m_position - m_base, "Alternation operators are not allowed inside a DEFINE block.");
2680             return false;
2681          }
2682       }
2683       // check for invalid repetition of next state:
2684       b = this->getaddress(expected_alt_point);
2685       b = this->getaddress(static_cast<re_alt*>(b)->next.i, b);
2686       if((b->type != syntax_element_assert_backref)
2687          && (b->type != syntax_element_startmark))
2688       {
2689          // Rewind to start of (? sequence:
2690          --m_position;
2691          while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2692          fail(regex_constants::error_badrepeat, m_position - m_base, "A repetition operator cannot be applied to a zero-width assertion.");
2693          return false;
2694       }
2695    }
2696    //
2697    // append closing parenthesis state:
2698    //
2699    pb = static_cast<re_brace*>(this->append_state(syntax_element_endmark, sizeof(re_brace)));
2700    pb->index = markid;
2701    pb->icase = this->flags() & regbase::icase;
2702    this->m_paren_start = last_paren_start;
2703    //
2704    // restore the alternate insertion point:
2705    //
2706    this->m_alt_insert_point = last_alt_point;
2707    //
2708    // and the case change data:
2709    //
2710    m_has_case_change = old_case_change;
2711    //
2712    // And the mark_reset data:
2713    //
2714    if(m_max_mark > m_mark_count)
2715    {
2716       m_mark_count = m_max_mark;
2717    }
2718    m_mark_reset = mark_reset;
2719    m_max_mark = max_mark;
2720 
2721 
2722    if(markid > 0)
2723    {
2724 #ifndef BOOST_NO_STD_DISTANCE
2725       if(this->flags() & regbase::save_subexpression_location)
2726          this->m_pdata->m_subs.at((std::size_t)markid - 1).second = std::distance(m_base, m_position) - 1;
2727 #else
2728       if(this->flags() & regbase::save_subexpression_location)
2729          this->m_pdata->m_subs.at(markid - 1).second = (m_position - m_base) - 1;
2730 #endif
2731       //
2732       // allow backrefs to this mark:
2733       //
2734       this->m_backrefs.set(markid);
2735    }
2736    return true;
2737 }
2738 
2739 template <class charT, class traits>
match_verb(const char * verb)2740 bool basic_regex_parser<charT, traits>::match_verb(const char* verb)
2741 {
2742    while(*verb)
2743    {
2744       if(static_cast<charT>(*verb) != *m_position)
2745       {
2746          while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2747          fail(regex_constants::error_perl_extension, m_position - m_base);
2748          return false;
2749       }
2750       if(++m_position == m_end)
2751       {
2752          --m_position;
2753          while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2754          fail(regex_constants::error_perl_extension, m_position - m_base);
2755          return false;
2756       }
2757       ++verb;
2758    }
2759    return true;
2760 }
2761 
2762 #ifdef BOOST_MSVC
2763 #  pragma warning(push)
2764 #if BOOST_MSVC >= 1800
2765 #pragma warning(disable:26812)
2766 #endif
2767 #endif
2768 template <class charT, class traits>
parse_perl_verb()2769 bool basic_regex_parser<charT, traits>::parse_perl_verb()
2770 {
2771    if(++m_position == m_end)
2772    {
2773       // Rewind to start of (* sequence:
2774       --m_position;
2775       while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2776       fail(regex_constants::error_perl_extension, m_position - m_base);
2777       return false;
2778    }
2779    switch(*m_position)
2780    {
2781    case 'F':
2782       if(++m_position == m_end)
2783       {
2784          // Rewind to start of (* sequence:
2785          --m_position;
2786          while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2787          fail(regex_constants::error_perl_extension, m_position - m_base);
2788          return false;
2789       }
2790       if((this->m_traits.syntax_type(*m_position) == regex_constants::syntax_close_mark) || match_verb("AIL"))
2791       {
2792          if((m_position == m_end) || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark))
2793          {
2794             // Rewind to start of (* sequence:
2795             --m_position;
2796             while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2797             fail(regex_constants::error_perl_extension, m_position - m_base);
2798             return false;
2799          }
2800          ++m_position;
2801          this->append_state(syntax_element_fail);
2802          return true;
2803       }
2804       break;
2805    case 'A':
2806       if(++m_position == m_end)
2807       {
2808          // Rewind to start of (* sequence:
2809          --m_position;
2810          while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2811          fail(regex_constants::error_perl_extension, m_position - m_base);
2812          return false;
2813       }
2814       if(match_verb("CCEPT"))
2815       {
2816          if((m_position == m_end) || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark))
2817          {
2818             // Rewind to start of (* sequence:
2819             --m_position;
2820             while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2821             fail(regex_constants::error_perl_extension, m_position - m_base);
2822             return false;
2823          }
2824          ++m_position;
2825          this->append_state(syntax_element_accept);
2826          return true;
2827       }
2828       break;
2829    case 'C':
2830       if(++m_position == m_end)
2831       {
2832          // Rewind to start of (* sequence:
2833          --m_position;
2834          while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2835          fail(regex_constants::error_perl_extension, m_position - m_base);
2836          return false;
2837       }
2838       if(match_verb("OMMIT"))
2839       {
2840          if((m_position == m_end) || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark))
2841          {
2842             // Rewind to start of (* sequence:
2843             --m_position;
2844             while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2845             fail(regex_constants::error_perl_extension, m_position - m_base);
2846             return false;
2847          }
2848          ++m_position;
2849          static_cast<re_commit*>(this->append_state(syntax_element_commit, sizeof(re_commit)))->action = commit_commit;
2850          this->m_pdata->m_disable_match_any = true;
2851          return true;
2852       }
2853       break;
2854    case 'P':
2855       if(++m_position == m_end)
2856       {
2857          // Rewind to start of (* sequence:
2858          --m_position;
2859          while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2860          fail(regex_constants::error_perl_extension, m_position - m_base);
2861          return false;
2862       }
2863       if(match_verb("RUNE"))
2864       {
2865          if((m_position == m_end) || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark))
2866          {
2867             // Rewind to start of (* sequence:
2868             --m_position;
2869             while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2870             fail(regex_constants::error_perl_extension, m_position - m_base);
2871             return false;
2872          }
2873          ++m_position;
2874          static_cast<re_commit*>(this->append_state(syntax_element_commit, sizeof(re_commit)))->action = commit_prune;
2875          this->m_pdata->m_disable_match_any = true;
2876          return true;
2877       }
2878       break;
2879    case 'S':
2880       if(++m_position == m_end)
2881       {
2882          // Rewind to start of (* sequence:
2883          --m_position;
2884          while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2885          fail(regex_constants::error_perl_extension, m_position - m_base);
2886          return false;
2887       }
2888       if(match_verb("KIP"))
2889       {
2890          if((m_position == m_end) || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark))
2891          {
2892             // Rewind to start of (* sequence:
2893             --m_position;
2894             while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2895             fail(regex_constants::error_perl_extension, m_position - m_base);
2896             return false;
2897          }
2898          ++m_position;
2899          static_cast<re_commit*>(this->append_state(syntax_element_commit, sizeof(re_commit)))->action = commit_skip;
2900          this->m_pdata->m_disable_match_any = true;
2901          return true;
2902       }
2903       break;
2904    case 'T':
2905       if(++m_position == m_end)
2906       {
2907          // Rewind to start of (* sequence:
2908          --m_position;
2909          while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2910          fail(regex_constants::error_perl_extension, m_position - m_base);
2911          return false;
2912       }
2913       if(match_verb("HEN"))
2914       {
2915          if((m_position == m_end) || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark))
2916          {
2917             // Rewind to start of (* sequence:
2918             --m_position;
2919             while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2920             fail(regex_constants::error_perl_extension, m_position - m_base);
2921             return false;
2922          }
2923          ++m_position;
2924          this->append_state(syntax_element_then);
2925          this->m_pdata->m_disable_match_any = true;
2926          return true;
2927       }
2928       break;
2929    }
2930    // Rewind to start of (* sequence:
2931    --m_position;
2932    while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2933    fail(regex_constants::error_perl_extension, m_position - m_base);
2934    return false;
2935 }
2936 #ifdef BOOST_MSVC
2937 #  pragma warning(pop)
2938 #endif
2939 
2940 template <class charT, class traits>
add_emacs_code(bool negate)2941 bool basic_regex_parser<charT, traits>::add_emacs_code(bool negate)
2942 {
2943    //
2944    // parses an emacs style \sx or \Sx construct.
2945    //
2946    if(++m_position == m_end)
2947    {
2948       // Rewind to start of sequence:
2949       --m_position;
2950       while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position;
2951       fail(regex_constants::error_escape, m_position - m_base);
2952       return false;
2953    }
2954    basic_char_set<charT, traits> char_set;
2955    if(negate)
2956       char_set.negate();
2957 
2958    static const charT s_punct[5] = { 'p', 'u', 'n', 'c', 't', };
2959 
2960    switch(*m_position)
2961    {
2962    case 's':
2963    case ' ':
2964       char_set.add_class(this->m_mask_space);
2965       break;
2966    case 'w':
2967       char_set.add_class(this->m_word_mask);
2968       break;
2969    case '_':
2970       char_set.add_single(digraph<charT>(charT('$')));
2971       char_set.add_single(digraph<charT>(charT('&')));
2972       char_set.add_single(digraph<charT>(charT('*')));
2973       char_set.add_single(digraph<charT>(charT('+')));
2974       char_set.add_single(digraph<charT>(charT('-')));
2975       char_set.add_single(digraph<charT>(charT('_')));
2976       char_set.add_single(digraph<charT>(charT('<')));
2977       char_set.add_single(digraph<charT>(charT('>')));
2978       break;
2979    case '.':
2980       char_set.add_class(this->m_traits.lookup_classname(s_punct, s_punct+5));
2981       break;
2982    case '(':
2983       char_set.add_single(digraph<charT>(charT('(')));
2984       char_set.add_single(digraph<charT>(charT('[')));
2985       char_set.add_single(digraph<charT>(charT('{')));
2986       break;
2987    case ')':
2988       char_set.add_single(digraph<charT>(charT(')')));
2989       char_set.add_single(digraph<charT>(charT(']')));
2990       char_set.add_single(digraph<charT>(charT('}')));
2991       break;
2992    case '"':
2993       char_set.add_single(digraph<charT>(charT('"')));
2994       char_set.add_single(digraph<charT>(charT('\'')));
2995       char_set.add_single(digraph<charT>(charT('`')));
2996       break;
2997    case '\'':
2998       char_set.add_single(digraph<charT>(charT('\'')));
2999       char_set.add_single(digraph<charT>(charT(',')));
3000       char_set.add_single(digraph<charT>(charT('#')));
3001       break;
3002    case '<':
3003       char_set.add_single(digraph<charT>(charT(';')));
3004       break;
3005    case '>':
3006       char_set.add_single(digraph<charT>(charT('\n')));
3007       char_set.add_single(digraph<charT>(charT('\f')));
3008       break;
3009    default:
3010       fail(regex_constants::error_ctype, m_position - m_base);
3011       return false;
3012    }
3013    if(0 == this->append_set(char_set))
3014    {
3015       fail(regex_constants::error_ctype, m_position - m_base);
3016       return false;
3017    }
3018    ++m_position;
3019    return true;
3020 }
3021 
3022 template <class charT, class traits>
parse_options()3023 regex_constants::syntax_option_type basic_regex_parser<charT, traits>::parse_options()
3024 {
3025    // we have a (?imsx-imsx) group, convert it into a set of flags:
3026    regex_constants::syntax_option_type f = this->flags();
3027    bool breakout = false;
3028    do
3029    {
3030       switch(*m_position)
3031       {
3032       case 's':
3033          f |= regex_constants::mod_s;
3034          f &= ~regex_constants::no_mod_s;
3035          break;
3036       case 'm':
3037          f &= ~regex_constants::no_mod_m;
3038          break;
3039       case 'i':
3040          f |= regex_constants::icase;
3041          break;
3042       case 'x':
3043          f |= regex_constants::mod_x;
3044          break;
3045       default:
3046          breakout = true;
3047          continue;
3048       }
3049       if(++m_position == m_end)
3050       {
3051          // Rewind to start of (? sequence:
3052          --m_position;
3053          while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
3054          fail(regex_constants::error_paren, m_position - m_base);
3055          return false;
3056       }
3057    }
3058    while(!breakout);
3059 
3060    breakout = false;
3061 
3062    if(*m_position == static_cast<charT>('-'))
3063    {
3064       if(++m_position == m_end)
3065       {
3066          // Rewind to start of (? sequence:
3067          --m_position;
3068          while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
3069          fail(regex_constants::error_paren, m_position - m_base);
3070          return false;
3071       }
3072       do
3073       {
3074          switch(*m_position)
3075          {
3076          case 's':
3077             f &= ~regex_constants::mod_s;
3078             f |= regex_constants::no_mod_s;
3079             break;
3080          case 'm':
3081             f |= regex_constants::no_mod_m;
3082             break;
3083          case 'i':
3084             f &= ~regex_constants::icase;
3085             break;
3086          case 'x':
3087             f &= ~regex_constants::mod_x;
3088             break;
3089          default:
3090             breakout = true;
3091             continue;
3092          }
3093          if(++m_position == m_end)
3094          {
3095             // Rewind to start of (? sequence:
3096             --m_position;
3097             while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
3098             fail(regex_constants::error_paren, m_position - m_base);
3099             return false;
3100          }
3101       }
3102       while(!breakout);
3103    }
3104    return f;
3105 }
3106 
3107 template <class charT, class traits>
unwind_alts(std::ptrdiff_t last_paren_start)3108 bool basic_regex_parser<charT, traits>::unwind_alts(std::ptrdiff_t last_paren_start)
3109 {
3110    //
3111    // If we didn't actually add any states after the last
3112    // alternative then that's an error:
3113    //
3114    if((this->m_alt_insert_point == static_cast<std::ptrdiff_t>(this->m_pdata->m_data.size()))
3115       && m_alt_jumps.size() && (m_alt_jumps.back() > last_paren_start)
3116       &&
3117       !(
3118          ((this->flags() & regbase::main_option_type) == regbase::perl_syntax_group)
3119            &&
3120          ((this->flags() & regbase::no_empty_expressions) == 0)
3121         )
3122       )
3123    {
3124       fail(regex_constants::error_empty, this->m_position - this->m_base, "Can't terminate a sub-expression with an alternation operator |.");
3125       return false;
3126    }
3127    //
3128    // Fix up our alternatives:
3129    //
3130    while(m_alt_jumps.size() && (m_alt_jumps.back() > last_paren_start))
3131    {
3132       //
3133       // fix up the jump to point to the end of the states
3134       // that we've just added:
3135       //
3136       std::ptrdiff_t jump_offset = m_alt_jumps.back();
3137       m_alt_jumps.pop_back();
3138       this->m_pdata->m_data.align();
3139       re_jump* jmp = static_cast<re_jump*>(this->getaddress(jump_offset));
3140       BOOST_ASSERT(jmp->type == syntax_element_jump);
3141       jmp->alt.i = this->m_pdata->m_data.size() - jump_offset;
3142    }
3143    return true;
3144 }
3145 
3146 #ifdef BOOST_MSVC
3147 #pragma warning(pop)
3148 #endif
3149 
3150 } // namespace BOOST_REGEX_DETAIL_NS
3151 } // namespace boost
3152 
3153 #ifdef BOOST_MSVC
3154 #pragma warning(push)
3155 #pragma warning(disable: 4103)
3156 #endif
3157 #ifdef BOOST_HAS_ABI_HEADERS
3158 #  include BOOST_ABI_SUFFIX
3159 #endif
3160 #ifdef BOOST_MSVC
3161 #pragma warning(pop)
3162 #endif
3163 
3164 #endif
3165