1 /*
2 *
3 * Copyright (c) 2004
4 * John Maddock
5 *
6 * Use, modification and distribution are subject to the
7 * Boost Software License, Version 1.0. (See accompanying file
8 * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
9 *
10 */
11
 /*
  *   LOCATION:    see http://www.boost.org for most recent version.
  *   FILE         basic_regex_parser.hpp
  *   VERSION      see <boost/version.hpp>
  *   DESCRIPTION: Declares template class basic_regex_parser.
  */
18
19 #ifndef BOOST_REGEX_V4_BASIC_REGEX_PARSER_HPP
20 #define BOOST_REGEX_V4_BASIC_REGEX_PARSER_HPP
21
22 #ifdef BOOST_MSVC
23 #pragma warning(push)
24 #pragma warning(disable: 4103)
25 #if BOOST_MSVC >= 1800
26 #pragma warning(disable: 26812)
27 #endif
28 #endif
29 #ifdef BOOST_HAS_ABI_HEADERS
30 # include BOOST_ABI_PREFIX
31 #endif
32 #ifdef BOOST_MSVC
33 #pragma warning(pop)
34 #endif
35
36 namespace boost{
37 namespace BOOST_REGEX_DETAIL_NS{
38
39 #ifdef BOOST_MSVC
40 #pragma warning(push)
41 #pragma warning(disable:4244)
42 #if BOOST_MSVC < 1910
43 #pragma warning(disable:4800)
44 #endif
45 #endif
46
umax(mpl::false_ const &)47 inline boost::intmax_t umax(mpl::false_ const&)
48 {
49 // Get out clause here, just in case numeric_limits is unspecialized:
50 return std::numeric_limits<boost::intmax_t>::is_specialized ? (std::numeric_limits<boost::intmax_t>::max)() : INT_MAX;
51 }
umax(mpl::true_ const &)52 inline boost::intmax_t umax(mpl::true_ const&)
53 {
54 return (std::numeric_limits<std::size_t>::max)();
55 }
56
umax()57 inline boost::intmax_t umax()
58 {
59 return umax(mpl::bool_<std::numeric_limits<boost::intmax_t>::digits >= std::numeric_limits<std::size_t>::digits>());
60 }
61
62 template <class charT, class traits>
63 class basic_regex_parser : public basic_regex_creator<charT, traits>
64 {
65 public:
66 basic_regex_parser(regex_data<charT, traits>* data);
67 void parse(const charT* p1, const charT* p2, unsigned flags);
68 void fail(regex_constants::error_type error_code, std::ptrdiff_t position);
69 void fail(regex_constants::error_type error_code, std::ptrdiff_t position, std::string message, std::ptrdiff_t start_pos);
fail(regex_constants::error_type error_code,std::ptrdiff_t position,const std::string & message)70 void fail(regex_constants::error_type error_code, std::ptrdiff_t position, const std::string& message)
71 {
72 fail(error_code, position, message, position);
73 }
74
75 bool parse_all();
76 bool parse_basic();
77 bool parse_extended();
78 bool parse_literal();
79 bool parse_open_paren();
80 bool parse_basic_escape();
81 bool parse_extended_escape();
82 bool parse_match_any();
83 bool parse_repeat(std::size_t low = 0, std::size_t high = (std::numeric_limits<std::size_t>::max)());
84 bool parse_repeat_range(bool isbasic);
85 bool parse_alt();
86 bool parse_set();
87 bool parse_backref();
88 void parse_set_literal(basic_char_set<charT, traits>& char_set);
89 bool parse_inner_set(basic_char_set<charT, traits>& char_set);
90 bool parse_QE();
91 bool parse_perl_extension();
92 bool parse_perl_verb();
93 bool match_verb(const char*);
94 bool add_emacs_code(bool negate);
95 bool unwind_alts(std::ptrdiff_t last_paren_start);
96 digraph<charT> get_next_set_literal(basic_char_set<charT, traits>& char_set);
97 charT unescape_character();
98 regex_constants::syntax_option_type parse_options();
99
100 private:
101 typedef bool (basic_regex_parser::*parser_proc_type)();
102 typedef typename traits::string_type string_type;
103 typedef typename traits::char_class_type char_class_type;
104 parser_proc_type m_parser_proc; // the main parser to use
105 const charT* m_base; // the start of the string being parsed
106 const charT* m_end; // the end of the string being parsed
107 const charT* m_position; // our current parser position
108 unsigned m_mark_count; // how many sub-expressions we have
109 int m_mark_reset; // used to indicate that we're inside a (?|...) block.
110 unsigned m_max_mark; // largest mark count seen inside a (?|...) block.
111 std::ptrdiff_t m_paren_start; // where the last seen ')' began (where repeats are inserted).
112 std::ptrdiff_t m_alt_insert_point; // where to insert the next alternative
113 bool m_has_case_change; // true if somewhere in the current block the case has changed
114 unsigned m_recursion_count; // How many times we've called parse_all.
115 #if defined(BOOST_MSVC) && defined(_M_IX86)
116 // This is an ugly warning suppression workaround (for warnings *inside* std::vector
117 // that can not otherwise be suppressed)...
118 BOOST_STATIC_ASSERT(sizeof(long) >= sizeof(void*));
119 std::vector<long> m_alt_jumps; // list of alternative in the current scope.
120 #else
121 std::vector<std::ptrdiff_t> m_alt_jumps; // list of alternative in the current scope.
122 #endif
123
124 basic_regex_parser& operator=(const basic_regex_parser&);
125 basic_regex_parser(const basic_regex_parser&);
126 };
127
128 template <class charT, class traits>
basic_regex_parser(regex_data<charT,traits> * data)129 basic_regex_parser<charT, traits>::basic_regex_parser(regex_data<charT, traits>* data)
130 : basic_regex_creator<charT, traits>(data), m_parser_proc(), m_base(0), m_end(0), m_position(0),
131 m_mark_count(0), m_mark_reset(-1), m_max_mark(0), m_paren_start(0), m_alt_insert_point(0), m_has_case_change(false), m_recursion_count(0)
132 {
133 }
134
135 template <class charT, class traits>
parse(const charT * p1,const charT * p2,unsigned l_flags)136 void basic_regex_parser<charT, traits>::parse(const charT* p1, const charT* p2, unsigned l_flags)
137 {
138 // pass l_flags on to base class:
139 this->init(l_flags);
140 // set up pointers:
141 m_position = m_base = p1;
142 m_end = p2;
143 // empty strings are errors:
144 if((p1 == p2) &&
145 (
146 ((l_flags & regbase::main_option_type) != regbase::perl_syntax_group)
147 || (l_flags & regbase::no_empty_expressions)
148 )
149 )
150 {
151 fail(regex_constants::error_empty, 0);
152 return;
153 }
154 // select which parser to use:
155 switch(l_flags & regbase::main_option_type)
156 {
157 case regbase::perl_syntax_group:
158 {
159 m_parser_proc = &basic_regex_parser<charT, traits>::parse_extended;
160 //
161 // Add a leading paren with index zero to give recursions a target:
162 //
163 re_brace* br = static_cast<re_brace*>(this->append_state(syntax_element_startmark, sizeof(re_brace)));
164 br->index = 0;
165 br->icase = this->flags() & regbase::icase;
166 break;
167 }
168 case regbase::basic_syntax_group:
169 m_parser_proc = &basic_regex_parser<charT, traits>::parse_basic;
170 break;
171 case regbase::literal:
172 m_parser_proc = &basic_regex_parser<charT, traits>::parse_literal;
173 break;
174 default:
175 // Ooops, someone has managed to set more than one of the main option flags,
176 // so this must be an error:
177 fail(regex_constants::error_unknown, 0, "An invalid combination of regular expression syntax flags was used.");
178 return;
179 }
180
181 // parse all our characters:
182 bool result = parse_all();
183 //
184 // Unwind our alternatives:
185 //
186 unwind_alts(-1);
187 // reset l_flags as a global scope (?imsx) may have altered them:
188 this->flags(l_flags);
189 // if we haven't gobbled up all the characters then we must
190 // have had an unexpected ')' :
191 if(!result)
192 {
193 fail(regex_constants::error_paren, ::boost::BOOST_REGEX_DETAIL_NS::distance(m_base, m_position), "Found a closing ) with no corresponding opening parenthesis.");
194 return;
195 }
196 // if an error has been set then give up now:
197 if(this->m_pdata->m_status)
198 return;
199 // fill in our sub-expression count:
200 this->m_pdata->m_mark_count = 1u + (std::size_t)m_mark_count;
201 this->finalize(p1, p2);
202 }
203
204 template <class charT, class traits>
fail(regex_constants::error_type error_code,std::ptrdiff_t position)205 void basic_regex_parser<charT, traits>::fail(regex_constants::error_type error_code, std::ptrdiff_t position)
206 {
207 // get the error message:
208 std::string message = this->m_pdata->m_ptraits->error_string(error_code);
209 fail(error_code, position, message);
210 }
211
212 template <class charT, class traits>
fail(regex_constants::error_type error_code,std::ptrdiff_t position,std::string message,std::ptrdiff_t start_pos)213 void basic_regex_parser<charT, traits>::fail(regex_constants::error_type error_code, std::ptrdiff_t position, std::string message, std::ptrdiff_t start_pos)
214 {
215 if(0 == this->m_pdata->m_status) // update the error code if not already set
216 this->m_pdata->m_status = error_code;
217 m_position = m_end; // don't bother parsing anything else
218
219 #ifndef BOOST_NO_TEMPLATED_ITERATOR_CONSTRUCTORS
220 //
221 // Augment error message with the regular expression text:
222 //
223 if(start_pos == position)
224 start_pos = (std::max)(static_cast<std::ptrdiff_t>(0), position - static_cast<std::ptrdiff_t>(10));
225 std::ptrdiff_t end_pos = (std::min)(position + static_cast<std::ptrdiff_t>(10), static_cast<std::ptrdiff_t>(m_end - m_base));
226 if(error_code != regex_constants::error_empty)
227 {
228 if((start_pos != 0) || (end_pos != (m_end - m_base)))
229 message += " The error occurred while parsing the regular expression fragment: '";
230 else
231 message += " The error occurred while parsing the regular expression: '";
232 if(start_pos != end_pos)
233 {
234 message += std::string(m_base + start_pos, m_base + position);
235 message += ">>>HERE>>>";
236 message += std::string(m_base + position, m_base + end_pos);
237 }
238 message += "'.";
239 }
240 #endif
241
242 #ifndef BOOST_NO_EXCEPTIONS
243 if(0 == (this->flags() & regex_constants::no_except))
244 {
245 boost::regex_error e(message, error_code, position);
246 e.raise();
247 }
248 #else
249 (void)position; // suppress warnings.
250 #endif
251 }
252
253 template <class charT, class traits>
parse_all()254 bool basic_regex_parser<charT, traits>::parse_all()
255 {
256 if (++m_recursion_count > 400)
257 {
258 // exceeded internal limits
259 fail(boost::regex_constants::error_complexity, m_position - m_base, "Exceeded nested brace limit.");
260 }
261 bool result = true;
262 while(result && (m_position != m_end))
263 {
264 result = (this->*m_parser_proc)();
265 }
266 --m_recursion_count;
267 return result;
268 }
269
270 #ifdef BOOST_MSVC
271 #pragma warning(push)
272 #pragma warning(disable:4702)
273 #endif
274 template <class charT, class traits>
parse_basic()275 bool basic_regex_parser<charT, traits>::parse_basic()
276 {
277 switch(this->m_traits.syntax_type(*m_position))
278 {
279 case regex_constants::syntax_escape:
280 return parse_basic_escape();
281 case regex_constants::syntax_dot:
282 return parse_match_any();
283 case regex_constants::syntax_caret:
284 ++m_position;
285 this->append_state(syntax_element_start_line);
286 break;
287 case regex_constants::syntax_dollar:
288 ++m_position;
289 this->append_state(syntax_element_end_line);
290 break;
291 case regex_constants::syntax_star:
292 if(!(this->m_last_state) || (this->m_last_state->type == syntax_element_start_line))
293 return parse_literal();
294 else
295 {
296 ++m_position;
297 return parse_repeat();
298 }
299 case regex_constants::syntax_plus:
300 if(!(this->m_last_state) || (this->m_last_state->type == syntax_element_start_line) || !(this->flags() & regbase::emacs_ex))
301 return parse_literal();
302 else
303 {
304 ++m_position;
305 return parse_repeat(1);
306 }
307 case regex_constants::syntax_question:
308 if(!(this->m_last_state) || (this->m_last_state->type == syntax_element_start_line) || !(this->flags() & regbase::emacs_ex))
309 return parse_literal();
310 else
311 {
312 ++m_position;
313 return parse_repeat(0, 1);
314 }
315 case regex_constants::syntax_open_set:
316 return parse_set();
317 case regex_constants::syntax_newline:
318 if(this->flags() & regbase::newline_alt)
319 return parse_alt();
320 else
321 return parse_literal();
322 default:
323 return parse_literal();
324 }
325 return true;
326 }
327
328 #ifdef BOOST_MSVC
329 # pragma warning(push)
330 #if BOOST_MSVC >= 1800
331 #pragma warning(disable:26812)
332 #endif
333 #endif
334 template <class charT, class traits>
parse_extended()335 bool basic_regex_parser<charT, traits>::parse_extended()
336 {
337 bool result = true;
338 switch(this->m_traits.syntax_type(*m_position))
339 {
340 case regex_constants::syntax_open_mark:
341 return parse_open_paren();
342 case regex_constants::syntax_close_mark:
343 return false;
344 case regex_constants::syntax_escape:
345 return parse_extended_escape();
346 case regex_constants::syntax_dot:
347 return parse_match_any();
348 case regex_constants::syntax_caret:
349 ++m_position;
350 this->append_state(
351 (this->flags() & regex_constants::no_mod_m ? syntax_element_buffer_start : syntax_element_start_line));
352 break;
353 case regex_constants::syntax_dollar:
354 ++m_position;
355 this->append_state(
356 (this->flags() & regex_constants::no_mod_m ? syntax_element_buffer_end : syntax_element_end_line));
357 break;
358 case regex_constants::syntax_star:
359 if(m_position == this->m_base)
360 {
361 fail(regex_constants::error_badrepeat, 0, "The repeat operator \"*\" cannot start a regular expression.");
362 return false;
363 }
364 ++m_position;
365 return parse_repeat();
366 case regex_constants::syntax_question:
367 if(m_position == this->m_base)
368 {
369 fail(regex_constants::error_badrepeat, 0, "The repeat operator \"?\" cannot start a regular expression.");
370 return false;
371 }
372 ++m_position;
373 return parse_repeat(0,1);
374 case regex_constants::syntax_plus:
375 if(m_position == this->m_base)
376 {
377 fail(regex_constants::error_badrepeat, 0, "The repeat operator \"+\" cannot start a regular expression.");
378 return false;
379 }
380 ++m_position;
381 return parse_repeat(1);
382 case regex_constants::syntax_open_brace:
383 ++m_position;
384 return parse_repeat_range(false);
385 case regex_constants::syntax_close_brace:
386 if((this->flags() & regbase::no_perl_ex) == regbase::no_perl_ex)
387 {
388 fail(regex_constants::error_brace, this->m_position - this->m_base, "Found a closing repetition operator } with no corresponding {.");
389 return false;
390 }
391 result = parse_literal();
392 break;
393 case regex_constants::syntax_or:
394 return parse_alt();
395 case regex_constants::syntax_open_set:
396 return parse_set();
397 case regex_constants::syntax_newline:
398 if(this->flags() & regbase::newline_alt)
399 return parse_alt();
400 else
401 return parse_literal();
402 case regex_constants::syntax_hash:
403 //
404 // If we have a mod_x flag set, then skip until
405 // we get to a newline character:
406 //
407 if((this->flags()
408 & (regbase::no_perl_ex|regbase::mod_x))
409 == regbase::mod_x)
410 {
411 while((m_position != m_end) && !is_separator(*m_position++)){}
412 return true;
413 }
414 BOOST_FALLTHROUGH;
415 default:
416 result = parse_literal();
417 break;
418 }
419 return result;
420 }
421 #ifdef BOOST_MSVC
422 # pragma warning(pop)
423 #endif
424 #ifdef BOOST_MSVC
425 #pragma warning(pop)
426 #endif
427
428 template <class charT, class traits>
parse_literal()429 bool basic_regex_parser<charT, traits>::parse_literal()
430 {
431 // append this as a literal provided it's not a space character
432 // or the perl option regbase::mod_x is not set:
433 if(
434 ((this->flags()
435 & (regbase::main_option_type|regbase::mod_x|regbase::no_perl_ex))
436 != regbase::mod_x)
437 || !this->m_traits.isctype(*m_position, this->m_mask_space))
438 this->append_literal(*m_position);
439 ++m_position;
440 return true;
441 }
442
443 template <class charT, class traits>
parse_open_paren()444 bool basic_regex_parser<charT, traits>::parse_open_paren()
445 {
446 //
447 // skip the '(' and error check:
448 //
449 if(++m_position == m_end)
450 {
451 fail(regex_constants::error_paren, m_position - m_base);
452 return false;
453 }
454 //
455 // begin by checking for a perl-style (?...) extension:
456 //
457 if(
458 ((this->flags() & (regbase::main_option_type | regbase::no_perl_ex)) == 0)
459 || ((this->flags() & (regbase::main_option_type | regbase::emacs_ex)) == (regbase::basic_syntax_group|regbase::emacs_ex))
460 )
461 {
462 if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_question)
463 return parse_perl_extension();
464 if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_star)
465 return parse_perl_verb();
466 }
467 //
468 // update our mark count, and append the required state:
469 //
470 unsigned markid = 0;
471 if(0 == (this->flags() & regbase::nosubs))
472 {
473 markid = ++m_mark_count;
474 #ifndef BOOST_NO_STD_DISTANCE
475 if(this->flags() & regbase::save_subexpression_location)
476 this->m_pdata->m_subs.push_back(std::pair<std::size_t, std::size_t>(std::distance(m_base, m_position) - 1, 0));
477 #else
478 if(this->flags() & regbase::save_subexpression_location)
479 this->m_pdata->m_subs.push_back(std::pair<std::size_t, std::size_t>((m_position - m_base) - 1, 0));
480 #endif
481 }
482 re_brace* pb = static_cast<re_brace*>(this->append_state(syntax_element_startmark, sizeof(re_brace)));
483 pb->index = markid;
484 pb->icase = this->flags() & regbase::icase;
485 std::ptrdiff_t last_paren_start = this->getoffset(pb);
486 // back up insertion point for alternations, and set new point:
487 std::ptrdiff_t last_alt_point = m_alt_insert_point;
488 this->m_pdata->m_data.align();
489 m_alt_insert_point = this->m_pdata->m_data.size();
490 //
491 // back up the current flags in case we have a nested (?imsx) group:
492 //
493 regex_constants::syntax_option_type opts = this->flags();
494 bool old_case_change = m_has_case_change;
495 m_has_case_change = false; // no changes to this scope as yet...
496 //
497 // Back up branch reset data in case we have a nested (?|...)
498 //
499 int mark_reset = m_mark_reset;
500 m_mark_reset = -1;
501 //
502 // now recursively add more states, this will terminate when we get to a
503 // matching ')' :
504 //
505 parse_all();
506 //
507 // Unwind pushed alternatives:
508 //
509 if(0 == unwind_alts(last_paren_start))
510 return false;
511 //
512 // restore flags:
513 //
514 if(m_has_case_change)
515 {
516 // the case has changed in one or more of the alternatives
517 // within the scoped (...) block: we have to add a state
518 // to reset the case sensitivity:
519 static_cast<re_case*>(
520 this->append_state(syntax_element_toggle_case, sizeof(re_case))
521 )->icase = opts & regbase::icase;
522 }
523 this->flags(opts);
524 m_has_case_change = old_case_change;
525 //
526 // restore branch reset:
527 //
528 m_mark_reset = mark_reset;
529 //
530 // we either have a ')' or we have run out of characters prematurely:
531 //
532 if(m_position == m_end)
533 {
534 this->fail(regex_constants::error_paren, ::boost::BOOST_REGEX_DETAIL_NS::distance(m_base, m_end));
535 return false;
536 }
537 if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark)
538 return false;
539 #ifndef BOOST_NO_STD_DISTANCE
540 if(markid && (this->flags() & regbase::save_subexpression_location))
541 this->m_pdata->m_subs.at(markid - 1).second = std::distance(m_base, m_position);
542 #else
543 if(markid && (this->flags() & regbase::save_subexpression_location))
544 this->m_pdata->m_subs.at(markid - 1).second = (m_position - m_base);
545 #endif
546 ++m_position;
547 //
548 // append closing parenthesis state:
549 //
550 pb = static_cast<re_brace*>(this->append_state(syntax_element_endmark, sizeof(re_brace)));
551 pb->index = markid;
552 pb->icase = this->flags() & regbase::icase;
553 this->m_paren_start = last_paren_start;
554 //
555 // restore the alternate insertion point:
556 //
557 this->m_alt_insert_point = last_alt_point;
558 //
559 // allow backrefs to this mark:
560 //
561 if(markid > 0)
562 this->m_backrefs.set(markid);
563
564 return true;
565 }
566
567 template <class charT, class traits>
parse_basic_escape()568 bool basic_regex_parser<charT, traits>::parse_basic_escape()
569 {
570 if(++m_position == m_end)
571 {
572 fail(regex_constants::error_paren, m_position - m_base);
573 return false;
574 }
575 bool result = true;
576 switch(this->m_traits.escape_syntax_type(*m_position))
577 {
578 case regex_constants::syntax_open_mark:
579 return parse_open_paren();
580 case regex_constants::syntax_close_mark:
581 return false;
582 case regex_constants::syntax_plus:
583 if(this->flags() & regex_constants::bk_plus_qm)
584 {
585 ++m_position;
586 return parse_repeat(1);
587 }
588 else
589 return parse_literal();
590 case regex_constants::syntax_question:
591 if(this->flags() & regex_constants::bk_plus_qm)
592 {
593 ++m_position;
594 return parse_repeat(0, 1);
595 }
596 else
597 return parse_literal();
598 case regex_constants::syntax_open_brace:
599 if(this->flags() & regbase::no_intervals)
600 return parse_literal();
601 ++m_position;
602 return parse_repeat_range(true);
603 case regex_constants::syntax_close_brace:
604 if(this->flags() & regbase::no_intervals)
605 return parse_literal();
606 fail(regex_constants::error_brace, this->m_position - this->m_base, "Found a closing repetition operator } with no corresponding {.");
607 return false;
608 case regex_constants::syntax_or:
609 if(this->flags() & regbase::bk_vbar)
610 return parse_alt();
611 else
612 result = parse_literal();
613 break;
614 case regex_constants::syntax_digit:
615 return parse_backref();
616 case regex_constants::escape_type_start_buffer:
617 if(this->flags() & regbase::emacs_ex)
618 {
619 ++m_position;
620 this->append_state(syntax_element_buffer_start);
621 }
622 else
623 result = parse_literal();
624 break;
625 case regex_constants::escape_type_end_buffer:
626 if(this->flags() & regbase::emacs_ex)
627 {
628 ++m_position;
629 this->append_state(syntax_element_buffer_end);
630 }
631 else
632 result = parse_literal();
633 break;
634 case regex_constants::escape_type_word_assert:
635 if(this->flags() & regbase::emacs_ex)
636 {
637 ++m_position;
638 this->append_state(syntax_element_word_boundary);
639 }
640 else
641 result = parse_literal();
642 break;
643 case regex_constants::escape_type_not_word_assert:
644 if(this->flags() & regbase::emacs_ex)
645 {
646 ++m_position;
647 this->append_state(syntax_element_within_word);
648 }
649 else
650 result = parse_literal();
651 break;
652 case regex_constants::escape_type_left_word:
653 if(this->flags() & regbase::emacs_ex)
654 {
655 ++m_position;
656 this->append_state(syntax_element_word_start);
657 }
658 else
659 result = parse_literal();
660 break;
661 case regex_constants::escape_type_right_word:
662 if(this->flags() & regbase::emacs_ex)
663 {
664 ++m_position;
665 this->append_state(syntax_element_word_end);
666 }
667 else
668 result = parse_literal();
669 break;
670 default:
671 if(this->flags() & regbase::emacs_ex)
672 {
673 bool negate = true;
674 switch(*m_position)
675 {
676 case 'w':
677 negate = false;
678 BOOST_FALLTHROUGH;
679 case 'W':
680 {
681 basic_char_set<charT, traits> char_set;
682 if(negate)
683 char_set.negate();
684 char_set.add_class(this->m_word_mask);
685 if(0 == this->append_set(char_set))
686 {
687 fail(regex_constants::error_ctype, m_position - m_base);
688 return false;
689 }
690 ++m_position;
691 return true;
692 }
693 case 's':
694 negate = false;
695 BOOST_FALLTHROUGH;
696 case 'S':
697 return add_emacs_code(negate);
698 case 'c':
699 case 'C':
700 // not supported yet:
701 fail(regex_constants::error_escape, m_position - m_base, "The \\c and \\C escape sequences are not supported by POSIX basic regular expressions: try the Perl syntax instead.");
702 return false;
703 default:
704 break;
705 }
706 }
707 result = parse_literal();
708 break;
709 }
710 return result;
711 }
712
713 template <class charT, class traits>
parse_extended_escape()714 bool basic_regex_parser<charT, traits>::parse_extended_escape()
715 {
716 ++m_position;
717 if(m_position == m_end)
718 {
719 fail(regex_constants::error_escape, m_position - m_base, "Incomplete escape sequence found.");
720 return false;
721 }
722 bool negate = false; // in case this is a character class escape: \w \d etc
723 switch(this->m_traits.escape_syntax_type(*m_position))
724 {
725 case regex_constants::escape_type_not_class:
726 negate = true;
727 BOOST_FALLTHROUGH;
728 case regex_constants::escape_type_class:
729 {
730 escape_type_class_jump:
731 typedef typename traits::char_class_type m_type;
732 m_type m = this->m_traits.lookup_classname(m_position, m_position+1);
733 if(m != 0)
734 {
735 basic_char_set<charT, traits> char_set;
736 if(negate)
737 char_set.negate();
738 char_set.add_class(m);
739 if(0 == this->append_set(char_set))
740 {
741 fail(regex_constants::error_ctype, m_position - m_base);
742 return false;
743 }
744 ++m_position;
745 return true;
746 }
747 //
748 // not a class, just a regular unknown escape:
749 //
750 this->append_literal(unescape_character());
751 break;
752 }
753 case regex_constants::syntax_digit:
754 return parse_backref();
755 case regex_constants::escape_type_left_word:
756 ++m_position;
757 this->append_state(syntax_element_word_start);
758 break;
759 case regex_constants::escape_type_right_word:
760 ++m_position;
761 this->append_state(syntax_element_word_end);
762 break;
763 case regex_constants::escape_type_start_buffer:
764 ++m_position;
765 this->append_state(syntax_element_buffer_start);
766 break;
767 case regex_constants::escape_type_end_buffer:
768 ++m_position;
769 this->append_state(syntax_element_buffer_end);
770 break;
771 case regex_constants::escape_type_word_assert:
772 ++m_position;
773 this->append_state(syntax_element_word_boundary);
774 break;
775 case regex_constants::escape_type_not_word_assert:
776 ++m_position;
777 this->append_state(syntax_element_within_word);
778 break;
779 case regex_constants::escape_type_Z:
780 ++m_position;
781 this->append_state(syntax_element_soft_buffer_end);
782 break;
783 case regex_constants::escape_type_Q:
784 return parse_QE();
785 case regex_constants::escape_type_C:
786 return parse_match_any();
787 case regex_constants::escape_type_X:
788 ++m_position;
789 this->append_state(syntax_element_combining);
790 break;
791 case regex_constants::escape_type_G:
792 ++m_position;
793 this->append_state(syntax_element_restart_continue);
794 break;
795 case regex_constants::escape_type_not_property:
796 negate = true;
797 BOOST_FALLTHROUGH;
798 case regex_constants::escape_type_property:
799 {
800 ++m_position;
801 char_class_type m;
802 if(m_position == m_end)
803 {
804 fail(regex_constants::error_escape, m_position - m_base, "Incomplete property escape found.");
805 return false;
806 }
807 // maybe have \p{ddd}
808 if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_open_brace)
809 {
810 const charT* base = m_position;
811 // skip forward until we find enclosing brace:
812 while((m_position != m_end) && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_brace))
813 ++m_position;
814 if(m_position == m_end)
815 {
816 fail(regex_constants::error_escape, m_position - m_base, "Closing } missing from property escape sequence.");
817 return false;
818 }
819 m = this->m_traits.lookup_classname(++base, m_position++);
820 }
821 else
822 {
823 m = this->m_traits.lookup_classname(m_position, m_position+1);
824 ++m_position;
825 }
826 if(m != 0)
827 {
828 basic_char_set<charT, traits> char_set;
829 if(negate)
830 char_set.negate();
831 char_set.add_class(m);
832 if(0 == this->append_set(char_set))
833 {
834 fail(regex_constants::error_ctype, m_position - m_base);
835 return false;
836 }
837 return true;
838 }
839 fail(regex_constants::error_ctype, m_position - m_base, "Escape sequence was neither a valid property nor a valid character class name.");
840 return false;
841 }
842 case regex_constants::escape_type_reset_start_mark:
843 if(0 == (this->flags() & (regbase::main_option_type | regbase::no_perl_ex)))
844 {
845 re_brace* pb = static_cast<re_brace*>(this->append_state(syntax_element_startmark, sizeof(re_brace)));
846 pb->index = -5;
847 pb->icase = this->flags() & regbase::icase;
848 this->m_pdata->m_data.align();
849 ++m_position;
850 return true;
851 }
852 goto escape_type_class_jump;
853 case regex_constants::escape_type_line_ending:
854 if(0 == (this->flags() & (regbase::main_option_type | regbase::no_perl_ex)))
855 {
856 const charT* e = get_escape_R_string<charT>();
857 const charT* old_position = m_position;
858 const charT* old_end = m_end;
859 const charT* old_base = m_base;
860 m_position = e;
861 m_base = e;
862 m_end = e + traits::length(e);
863 bool r = parse_all();
864 m_position = ++old_position;
865 m_end = old_end;
866 m_base = old_base;
867 return r;
868 }
869 goto escape_type_class_jump;
870 case regex_constants::escape_type_extended_backref:
871 if(0 == (this->flags() & (regbase::main_option_type | regbase::no_perl_ex)))
872 {
873 bool have_brace = false;
874 bool negative = false;
875 static const char incomplete_message[] = "Incomplete \\g escape found.";
876 if(++m_position == m_end)
877 {
878 fail(regex_constants::error_escape, m_position - m_base, incomplete_message);
879 return false;
880 }
881 // maybe have \g{ddd}
882 regex_constants::syntax_type syn = this->m_traits.syntax_type(*m_position);
883 regex_constants::syntax_type syn_end = 0;
884 if((syn == regex_constants::syntax_open_brace)
885 || (syn == regex_constants::escape_type_left_word)
886 || (syn == regex_constants::escape_type_end_buffer))
887 {
888 if(++m_position == m_end)
889 {
890 fail(regex_constants::error_escape, m_position - m_base, incomplete_message);
891 return false;
892 }
893 have_brace = true;
894 switch(syn)
895 {
896 case regex_constants::syntax_open_brace:
897 syn_end = regex_constants::syntax_close_brace;
898 break;
899 case regex_constants::escape_type_left_word:
900 syn_end = regex_constants::escape_type_right_word;
901 break;
902 default:
903 syn_end = regex_constants::escape_type_end_buffer;
904 break;
905 }
906 }
907 negative = (*m_position == static_cast<charT>('-'));
908 if((negative) && (++m_position == m_end))
909 {
910 fail(regex_constants::error_escape, m_position - m_base, incomplete_message);
911 return false;
912 }
913 const charT* pc = m_position;
914 boost::intmax_t i = this->m_traits.toi(pc, m_end, 10);
915 if((i < 0) && syn_end)
916 {
917 // Check for a named capture, get the leftmost one if there is more than one:
918 const charT* base = m_position;
919 while((m_position != m_end) && (this->m_traits.syntax_type(*m_position) != syn_end))
920 {
921 ++m_position;
922 }
923 i = hash_value_from_capture_name(base, m_position);
924 pc = m_position;
925 }
926 if(negative)
927 i = 1 + (static_cast<boost::intmax_t>(m_mark_count) - i);
928 if(((i < hash_value_mask) && (i > 0) && (this->m_backrefs.test(i))) || ((i >= hash_value_mask) && (this->m_pdata->get_id(i) > 0) && (this->m_backrefs.test(this->m_pdata->get_id(i)))))
929 {
930 m_position = pc;
931 re_brace* pb = static_cast<re_brace*>(this->append_state(syntax_element_backref, sizeof(re_brace)));
932 pb->index = i;
933 pb->icase = this->flags() & regbase::icase;
934 }
935 else
936 {
937 fail(regex_constants::error_backref, m_position - m_base);
938 return false;
939 }
940 m_position = pc;
941 if(have_brace)
942 {
943 if((m_position == m_end) || (this->m_traits.syntax_type(*m_position) != syn_end))
944 {
945 fail(regex_constants::error_escape, m_position - m_base, incomplete_message);
946 return false;
947 }
948 ++m_position;
949 }
950 return true;
951 }
952 goto escape_type_class_jump;
953 case regex_constants::escape_type_control_v:
954 if(0 == (this->flags() & (regbase::main_option_type | regbase::no_perl_ex)))
955 goto escape_type_class_jump;
956 BOOST_FALLTHROUGH;
957 default:
958 this->append_literal(unescape_character());
959 break;
960 }
961 return true;
962 }
963
964 template <class charT, class traits>
parse_match_any()965 bool basic_regex_parser<charT, traits>::parse_match_any()
966 {
967 //
968 // we have a '.' that can match any character:
969 //
970 ++m_position;
971 static_cast<re_dot*>(
972 this->append_state(syntax_element_wild, sizeof(re_dot))
973 )->mask = static_cast<unsigned char>(this->flags() & regbase::no_mod_s
974 ? BOOST_REGEX_DETAIL_NS::force_not_newline
975 : this->flags() & regbase::mod_s ?
976 BOOST_REGEX_DETAIL_NS::force_newline : BOOST_REGEX_DETAIL_NS::dont_care);
977 return true;
978 }
979
980 template <class charT, class traits>
parse_repeat(std::size_t low,std::size_t high)981 bool basic_regex_parser<charT, traits>::parse_repeat(std::size_t low, std::size_t high)
982 {
983 bool greedy = true;
984 bool pocessive = false;
985 std::size_t insert_point;
986 //
987 // when we get to here we may have a non-greedy ? mark still to come:
988 //
989 if((m_position != m_end)
990 && (
991 (0 == (this->flags() & (regbase::main_option_type | regbase::no_perl_ex)))
992 || ((regbase::basic_syntax_group|regbase::emacs_ex) == (this->flags() & (regbase::main_option_type | regbase::emacs_ex)))
993 )
994 )
995 {
996 // OK we have a perl or emacs regex, check for a '?':
997 if ((this->flags() & (regbase::main_option_type | regbase::mod_x | regbase::no_perl_ex)) == regbase::mod_x)
998 {
999 // whitespace skip:
1000 while ((m_position != m_end) && this->m_traits.isctype(*m_position, this->m_mask_space))
1001 ++m_position;
1002 }
1003 if((m_position != m_end) && (this->m_traits.syntax_type(*m_position) == regex_constants::syntax_question))
1004 {
1005 greedy = false;
1006 ++m_position;
1007 }
1008 // for perl regexes only check for pocessive ++ repeats.
1009 if((m_position != m_end)
1010 && (0 == (this->flags() & regbase::main_option_type))
1011 && (this->m_traits.syntax_type(*m_position) == regex_constants::syntax_plus))
1012 {
1013 pocessive = true;
1014 ++m_position;
1015 }
1016 }
1017 if(0 == this->m_last_state)
1018 {
1019 fail(regex_constants::error_badrepeat, ::boost::BOOST_REGEX_DETAIL_NS::distance(m_base, m_position), "Nothing to repeat.");
1020 return false;
1021 }
1022 if(this->m_last_state->type == syntax_element_endmark)
1023 {
1024 // insert a repeat before the '(' matching the last ')':
1025 insert_point = this->m_paren_start;
1026 }
1027 else if((this->m_last_state->type == syntax_element_literal) && (static_cast<re_literal*>(this->m_last_state)->length > 1))
1028 {
1029 // the last state was a literal with more than one character, split it in two:
1030 re_literal* lit = static_cast<re_literal*>(this->m_last_state);
1031 charT c = (static_cast<charT*>(static_cast<void*>(lit+1)))[lit->length - 1];
1032 lit->length -= 1;
1033 // now append new state:
1034 lit = static_cast<re_literal*>(this->append_state(syntax_element_literal, sizeof(re_literal) + sizeof(charT)));
1035 lit->length = 1;
1036 (static_cast<charT*>(static_cast<void*>(lit+1)))[0] = c;
1037 insert_point = this->getoffset(this->m_last_state);
1038 }
1039 else
1040 {
1041 // repeat the last state whatever it was, need to add some error checking here:
1042 switch(this->m_last_state->type)
1043 {
1044 case syntax_element_start_line:
1045 case syntax_element_end_line:
1046 case syntax_element_word_boundary:
1047 case syntax_element_within_word:
1048 case syntax_element_word_start:
1049 case syntax_element_word_end:
1050 case syntax_element_buffer_start:
1051 case syntax_element_buffer_end:
1052 case syntax_element_alt:
1053 case syntax_element_soft_buffer_end:
1054 case syntax_element_restart_continue:
1055 case syntax_element_jump:
1056 case syntax_element_startmark:
1057 case syntax_element_backstep:
1058 // can't legally repeat any of the above:
1059 fail(regex_constants::error_badrepeat, m_position - m_base);
1060 return false;
1061 default:
1062 // do nothing...
1063 break;
1064 }
1065 insert_point = this->getoffset(this->m_last_state);
1066 }
1067 //
1068 // OK we now know what to repeat, so insert the repeat around it:
1069 //
1070 re_repeat* rep = static_cast<re_repeat*>(this->insert_state(insert_point, syntax_element_rep, re_repeater_size));
1071 rep->min = low;
1072 rep->max = high;
1073 rep->greedy = greedy;
1074 rep->leading = false;
1075 // store our repeater position for later:
1076 std::ptrdiff_t rep_off = this->getoffset(rep);
1077 // and append a back jump to the repeat:
1078 re_jump* jmp = static_cast<re_jump*>(this->append_state(syntax_element_jump, sizeof(re_jump)));
1079 jmp->alt.i = rep_off - this->getoffset(jmp);
1080 this->m_pdata->m_data.align();
1081 // now fill in the alt jump for the repeat:
1082 rep = static_cast<re_repeat*>(this->getaddress(rep_off));
1083 rep->alt.i = this->m_pdata->m_data.size() - rep_off;
1084 //
1085 // If the repeat is pocessive then bracket the repeat with a (?>...)
1086 // independent sub-expression construct:
1087 //
1088 if(pocessive)
1089 {
1090 if(m_position != m_end)
1091 {
1092 //
1093 // Check for illegal following quantifier, we have to do this here, because
1094 // the extra states we insert below circumvents our usual error checking :-(
1095 //
1096 bool contin = false;
1097 do
1098 {
1099 if ((this->flags() & (regbase::main_option_type | regbase::mod_x | regbase::no_perl_ex)) == regbase::mod_x)
1100 {
1101 // whitespace skip:
1102 while ((m_position != m_end) && this->m_traits.isctype(*m_position, this->m_mask_space))
1103 ++m_position;
1104 }
1105 if (m_position != m_end)
1106 {
1107 switch (this->m_traits.syntax_type(*m_position))
1108 {
1109 case regex_constants::syntax_star:
1110 case regex_constants::syntax_plus:
1111 case regex_constants::syntax_question:
1112 case regex_constants::syntax_open_brace:
1113 fail(regex_constants::error_badrepeat, m_position - m_base);
1114 return false;
1115 case regex_constants::syntax_open_mark:
1116 // Do we have a comment? If so we need to skip it here...
1117 if ((m_position + 2 < m_end) && this->m_traits.syntax_type(*(m_position + 1)) == regex_constants::syntax_question
1118 && this->m_traits.syntax_type(*(m_position + 2)) == regex_constants::syntax_hash)
1119 {
1120 while ((m_position != m_end)
1121 && (this->m_traits.syntax_type(*m_position++) != regex_constants::syntax_close_mark)) {
1122 }
1123 contin = true;
1124 }
1125 else
1126 contin = false;
1127 }
1128 }
1129 else
1130 contin = false;
1131 } while (contin);
1132 }
1133 re_brace* pb = static_cast<re_brace*>(this->insert_state(insert_point, syntax_element_startmark, sizeof(re_brace)));
1134 pb->index = -3;
1135 pb->icase = this->flags() & regbase::icase;
1136 jmp = static_cast<re_jump*>(this->insert_state(insert_point + sizeof(re_brace), syntax_element_jump, sizeof(re_jump)));
1137 this->m_pdata->m_data.align();
1138 jmp->alt.i = this->m_pdata->m_data.size() - this->getoffset(jmp);
1139 pb = static_cast<re_brace*>(this->append_state(syntax_element_endmark, sizeof(re_brace)));
1140 pb->index = -3;
1141 pb->icase = this->flags() & regbase::icase;
1142 }
1143 return true;
1144 }
1145
1146 template <class charT, class traits>
parse_repeat_range(bool isbasic)1147 bool basic_regex_parser<charT, traits>::parse_repeat_range(bool isbasic)
1148 {
1149 static const char incomplete_message[] = "Missing } in quantified repetition.";
1150 //
1151 // parse a repeat-range:
1152 //
1153 std::size_t min, max;
1154 boost::intmax_t v;
1155 // skip whitespace:
1156 while((m_position != m_end) && this->m_traits.isctype(*m_position, this->m_mask_space))
1157 ++m_position;
1158 if(this->m_position == this->m_end)
1159 {
1160 if(this->flags() & (regbase::main_option_type | regbase::no_perl_ex))
1161 {
1162 fail(regex_constants::error_brace, this->m_position - this->m_base, incomplete_message);
1163 return false;
1164 }
1165 // Treat the opening '{' as a literal character, rewind to start of error:
1166 --m_position;
1167 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_brace) --m_position;
1168 return parse_literal();
1169 }
1170 // get min:
1171 v = this->m_traits.toi(m_position, m_end, 10);
1172 // skip whitespace:
1173 if((v < 0) || (v > umax()))
1174 {
1175 if(this->flags() & (regbase::main_option_type | regbase::no_perl_ex))
1176 {
1177 fail(regex_constants::error_brace, this->m_position - this->m_base, incomplete_message);
1178 return false;
1179 }
1180 // Treat the opening '{' as a literal character, rewind to start of error:
1181 --m_position;
1182 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_brace) --m_position;
1183 return parse_literal();
1184 }
1185 while((m_position != m_end) && this->m_traits.isctype(*m_position, this->m_mask_space))
1186 ++m_position;
1187 if(this->m_position == this->m_end)
1188 {
1189 if(this->flags() & (regbase::main_option_type | regbase::no_perl_ex))
1190 {
1191 fail(regex_constants::error_brace, this->m_position - this->m_base, incomplete_message);
1192 return false;
1193 }
1194 // Treat the opening '{' as a literal character, rewind to start of error:
1195 --m_position;
1196 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_brace) --m_position;
1197 return parse_literal();
1198 }
1199 min = static_cast<std::size_t>(v);
1200 // see if we have a comma:
1201 if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_comma)
1202 {
1203 // move on and error check:
1204 ++m_position;
1205 // skip whitespace:
1206 while((m_position != m_end) && this->m_traits.isctype(*m_position, this->m_mask_space))
1207 ++m_position;
1208 if(this->m_position == this->m_end)
1209 {
1210 if(this->flags() & (regbase::main_option_type | regbase::no_perl_ex))
1211 {
1212 fail(regex_constants::error_brace, this->m_position - this->m_base, incomplete_message);
1213 return false;
1214 }
1215 // Treat the opening '{' as a literal character, rewind to start of error:
1216 --m_position;
1217 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_brace) --m_position;
1218 return parse_literal();
1219 }
1220 // get the value if any:
1221 v = this->m_traits.toi(m_position, m_end, 10);
1222 max = ((v >= 0) && (v < umax())) ? (std::size_t)v : (std::numeric_limits<std::size_t>::max)();
1223 }
1224 else
1225 {
1226 // no comma, max = min:
1227 max = min;
1228 }
1229 // skip whitespace:
1230 while((m_position != m_end) && this->m_traits.isctype(*m_position, this->m_mask_space))
1231 ++m_position;
1232 // OK now check trailing }:
1233 if(this->m_position == this->m_end)
1234 {
1235 if(this->flags() & (regbase::main_option_type | regbase::no_perl_ex))
1236 {
1237 fail(regex_constants::error_brace, this->m_position - this->m_base, incomplete_message);
1238 return false;
1239 }
1240 // Treat the opening '{' as a literal character, rewind to start of error:
1241 --m_position;
1242 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_brace) --m_position;
1243 return parse_literal();
1244 }
1245 if(isbasic)
1246 {
1247 if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_escape)
1248 {
1249 ++m_position;
1250 if(this->m_position == this->m_end)
1251 {
1252 fail(regex_constants::error_brace, this->m_position - this->m_base, incomplete_message);
1253 return false;
1254 }
1255 }
1256 else
1257 {
1258 fail(regex_constants::error_brace, this->m_position - this->m_base, incomplete_message);
1259 return false;
1260 }
1261 }
1262 if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_close_brace)
1263 ++m_position;
1264 else
1265 {
1266 // Treat the opening '{' as a literal character, rewind to start of error:
1267 --m_position;
1268 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_brace) --m_position;
1269 return parse_literal();
1270 }
1271 //
1272 // finally go and add the repeat, unless error:
1273 //
1274 if(min > max)
1275 {
1276 // Backtrack to error location:
1277 m_position -= 2;
1278 while(this->m_traits.isctype(*m_position, this->m_word_mask)) --m_position;
1279 ++m_position;
1280 fail(regex_constants::error_badbrace, m_position - m_base);
1281 return false;
1282 }
1283 return parse_repeat(min, max);
1284 }
1285
1286 template <class charT, class traits>
parse_alt()1287 bool basic_regex_parser<charT, traits>::parse_alt()
1288 {
1289 //
1290 // error check: if there have been no previous states,
1291 // or if the last state was a '(' then error:
1292 //
1293 if(
1294 ((this->m_last_state == 0) || (this->m_last_state->type == syntax_element_startmark))
1295 &&
1296 !(
1297 ((this->flags() & regbase::main_option_type) == regbase::perl_syntax_group)
1298 &&
1299 ((this->flags() & regbase::no_empty_expressions) == 0)
1300 )
1301 )
1302 {
1303 fail(regex_constants::error_empty, this->m_position - this->m_base, "A regular expression cannot start with the alternation operator |.");
1304 return false;
1305 }
1306 //
1307 // Reset mark count if required:
1308 //
1309 if(m_max_mark < m_mark_count)
1310 m_max_mark = m_mark_count;
1311 if(m_mark_reset >= 0)
1312 m_mark_count = m_mark_reset;
1313
1314 ++m_position;
1315 //
1316 // we need to append a trailing jump:
1317 //
1318 re_syntax_base* pj = this->append_state(BOOST_REGEX_DETAIL_NS::syntax_element_jump, sizeof(re_jump));
1319 std::ptrdiff_t jump_offset = this->getoffset(pj);
1320 //
1321 // now insert the alternative:
1322 //
1323 re_alt* palt = static_cast<re_alt*>(this->insert_state(this->m_alt_insert_point, syntax_element_alt, re_alt_size));
1324 jump_offset += re_alt_size;
1325 this->m_pdata->m_data.align();
1326 palt->alt.i = this->m_pdata->m_data.size() - this->getoffset(palt);
1327 //
1328 // update m_alt_insert_point so that the next alternate gets
1329 // inserted at the start of the second of the two we've just created:
1330 //
1331 this->m_alt_insert_point = this->m_pdata->m_data.size();
1332 //
1333 // the start of this alternative must have a case changes state
1334 // if the current block has messed around with case changes:
1335 //
1336 if(m_has_case_change)
1337 {
1338 static_cast<re_case*>(
1339 this->append_state(syntax_element_toggle_case, sizeof(re_case))
1340 )->icase = this->m_icase;
1341 }
1342 //
1343 // push the alternative onto our stack, a recursive
1344 // implementation here is easier to understand (and faster
1345 // as it happens), but causes all kinds of stack overflow problems
1346 // on programs with small stacks (COM+).
1347 //
1348 m_alt_jumps.push_back(jump_offset);
1349 return true;
1350 }
1351
1352 template <class charT, class traits>
parse_set()1353 bool basic_regex_parser<charT, traits>::parse_set()
1354 {
1355 static const char incomplete_message[] = "Character set declaration starting with [ terminated prematurely - either no ] was found or the set had no content.";
1356 ++m_position;
1357 if(m_position == m_end)
1358 {
1359 fail(regex_constants::error_brack, m_position - m_base, incomplete_message);
1360 return false;
1361 }
1362 basic_char_set<charT, traits> char_set;
1363
1364 const charT* base = m_position; // where the '[' was
1365 const charT* item_base = m_position; // where the '[' or '^' was
1366
1367 while(m_position != m_end)
1368 {
1369 switch(this->m_traits.syntax_type(*m_position))
1370 {
1371 case regex_constants::syntax_caret:
1372 if(m_position == base)
1373 {
1374 char_set.negate();
1375 ++m_position;
1376 item_base = m_position;
1377 }
1378 else
1379 parse_set_literal(char_set);
1380 break;
1381 case regex_constants::syntax_close_set:
1382 if(m_position == item_base)
1383 {
1384 parse_set_literal(char_set);
1385 break;
1386 }
1387 else
1388 {
1389 ++m_position;
1390 if(0 == this->append_set(char_set))
1391 {
1392 fail(regex_constants::error_ctype, m_position - m_base);
1393 return false;
1394 }
1395 }
1396 return true;
1397 case regex_constants::syntax_open_set:
1398 if(parse_inner_set(char_set))
1399 break;
1400 return true;
1401 case regex_constants::syntax_escape:
1402 {
1403 //
1404 // look ahead and see if this is a character class shortcut
1405 // \d \w \s etc...
1406 //
1407 ++m_position;
1408 if(this->m_traits.escape_syntax_type(*m_position)
1409 == regex_constants::escape_type_class)
1410 {
1411 char_class_type m = this->m_traits.lookup_classname(m_position, m_position+1);
1412 if(m != 0)
1413 {
1414 char_set.add_class(m);
1415 ++m_position;
1416 break;
1417 }
1418 }
1419 else if(this->m_traits.escape_syntax_type(*m_position)
1420 == regex_constants::escape_type_not_class)
1421 {
1422 // negated character class:
1423 char_class_type m = this->m_traits.lookup_classname(m_position, m_position+1);
1424 if(m != 0)
1425 {
1426 char_set.add_negated_class(m);
1427 ++m_position;
1428 break;
1429 }
1430 }
1431 // not a character class, just a regular escape:
1432 --m_position;
1433 parse_set_literal(char_set);
1434 break;
1435 }
1436 default:
1437 parse_set_literal(char_set);
1438 break;
1439 }
1440 }
1441 return m_position != m_end;
1442 }
1443
1444 template <class charT, class traits>
parse_inner_set(basic_char_set<charT,traits> & char_set)1445 bool basic_regex_parser<charT, traits>::parse_inner_set(basic_char_set<charT, traits>& char_set)
1446 {
1447 static const char incomplete_message[] = "Character class declaration starting with [ terminated prematurely - either no ] was found or the set had no content.";
1448 //
1449 // we have either a character class [:name:]
1450 // a collating element [.name.]
1451 // or an equivalence class [=name=]
1452 //
1453 if(m_end == ++m_position)
1454 {
1455 fail(regex_constants::error_brack, m_position - m_base, incomplete_message);
1456 return false;
1457 }
1458 switch(this->m_traits.syntax_type(*m_position))
1459 {
1460 case regex_constants::syntax_dot:
1461 //
1462 // a collating element is treated as a literal:
1463 //
1464 --m_position;
1465 parse_set_literal(char_set);
1466 return true;
1467 case regex_constants::syntax_colon:
1468 {
1469 // check that character classes are actually enabled:
1470 if((this->flags() & (regbase::main_option_type | regbase::no_char_classes))
1471 == (regbase::basic_syntax_group | regbase::no_char_classes))
1472 {
1473 --m_position;
1474 parse_set_literal(char_set);
1475 return true;
1476 }
1477 // skip the ':'
1478 if(m_end == ++m_position)
1479 {
1480 fail(regex_constants::error_brack, m_position - m_base, incomplete_message);
1481 return false;
1482 }
1483 const charT* name_first = m_position;
1484 // skip at least one character, then find the matching ':]'
1485 if(m_end == ++m_position)
1486 {
1487 fail(regex_constants::error_brack, m_position - m_base, incomplete_message);
1488 return false;
1489 }
1490 while((m_position != m_end)
1491 && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_colon))
1492 ++m_position;
1493 const charT* name_last = m_position;
1494 if(m_end == m_position)
1495 {
1496 fail(regex_constants::error_brack, m_position - m_base, incomplete_message);
1497 return false;
1498 }
1499 if((m_end == ++m_position)
1500 || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_set))
1501 {
1502 fail(regex_constants::error_brack, m_position - m_base, incomplete_message);
1503 return false;
1504 }
1505 //
1506 // check for negated class:
1507 //
1508 bool negated = false;
1509 if(this->m_traits.syntax_type(*name_first) == regex_constants::syntax_caret)
1510 {
1511 ++name_first;
1512 negated = true;
1513 }
1514 typedef typename traits::char_class_type m_type;
1515 m_type m = this->m_traits.lookup_classname(name_first, name_last);
1516 if(m == 0)
1517 {
1518 if(char_set.empty() && (name_last - name_first == 1))
1519 {
1520 // maybe a special case:
1521 ++m_position;
1522 if( (m_position != m_end)
1523 && (this->m_traits.syntax_type(*m_position)
1524 == regex_constants::syntax_close_set))
1525 {
1526 if(this->m_traits.escape_syntax_type(*name_first)
1527 == regex_constants::escape_type_left_word)
1528 {
1529 ++m_position;
1530 this->append_state(syntax_element_word_start);
1531 return false;
1532 }
1533 if(this->m_traits.escape_syntax_type(*name_first)
1534 == regex_constants::escape_type_right_word)
1535 {
1536 ++m_position;
1537 this->append_state(syntax_element_word_end);
1538 return false;
1539 }
1540 }
1541 }
1542 fail(regex_constants::error_ctype, name_first - m_base);
1543 return false;
1544 }
1545 if(negated == false)
1546 char_set.add_class(m);
1547 else
1548 char_set.add_negated_class(m);
1549 ++m_position;
1550 break;
1551 }
1552 case regex_constants::syntax_equal:
1553 {
1554 // skip the '='
1555 if(m_end == ++m_position)
1556 {
1557 fail(regex_constants::error_brack, m_position - m_base, incomplete_message);
1558 return false;
1559 }
1560 const charT* name_first = m_position;
1561 // skip at least one character, then find the matching '=]'
1562 if(m_end == ++m_position)
1563 {
1564 fail(regex_constants::error_brack, m_position - m_base, incomplete_message);
1565 return false;
1566 }
1567 while((m_position != m_end)
1568 && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_equal))
1569 ++m_position;
1570 const charT* name_last = m_position;
1571 if(m_end == m_position)
1572 {
1573 fail(regex_constants::error_brack, m_position - m_base, incomplete_message);
1574 return false;
1575 }
1576 if((m_end == ++m_position)
1577 || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_set))
1578 {
1579 fail(regex_constants::error_brack, m_position - m_base, incomplete_message);
1580 return false;
1581 }
1582 string_type m = this->m_traits.lookup_collatename(name_first, name_last);
1583 if((0 == m.size()) || (m.size() > 2))
1584 {
1585 fail(regex_constants::error_collate, name_first - m_base);
1586 return false;
1587 }
1588 digraph<charT> d;
1589 d.first = m[0];
1590 if(m.size() > 1)
1591 d.second = m[1];
1592 else
1593 d.second = 0;
1594 char_set.add_equivalent(d);
1595 ++m_position;
1596 break;
1597 }
1598 default:
1599 --m_position;
1600 parse_set_literal(char_set);
1601 break;
1602 }
1603 return true;
1604 }
1605
1606 template <class charT, class traits>
parse_set_literal(basic_char_set<charT,traits> & char_set)1607 void basic_regex_parser<charT, traits>::parse_set_literal(basic_char_set<charT, traits>& char_set)
1608 {
1609 digraph<charT> start_range(get_next_set_literal(char_set));
1610 if(m_end == m_position)
1611 {
1612 fail(regex_constants::error_brack, m_position - m_base);
1613 return;
1614 }
1615 if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_dash)
1616 {
1617 // we have a range:
1618 if(m_end == ++m_position)
1619 {
1620 fail(regex_constants::error_brack, m_position - m_base);
1621 return;
1622 }
1623 if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_set)
1624 {
1625 digraph<charT> end_range = get_next_set_literal(char_set);
1626 char_set.add_range(start_range, end_range);
1627 if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_dash)
1628 {
1629 if(m_end == ++m_position)
1630 {
1631 fail(regex_constants::error_brack, m_position - m_base);
1632 return;
1633 }
1634 if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_close_set)
1635 {
1636 // trailing - :
1637 --m_position;
1638 return;
1639 }
1640 fail(regex_constants::error_range, m_position - m_base);
1641 return;
1642 }
1643 return;
1644 }
1645 --m_position;
1646 }
1647 char_set.add_single(start_range);
1648 }
1649
1650 template <class charT, class traits>
get_next_set_literal(basic_char_set<charT,traits> & char_set)1651 digraph<charT> basic_regex_parser<charT, traits>::get_next_set_literal(basic_char_set<charT, traits>& char_set)
1652 {
1653 digraph<charT> result;
1654 switch(this->m_traits.syntax_type(*m_position))
1655 {
1656 case regex_constants::syntax_dash:
1657 if(!char_set.empty())
1658 {
1659 // see if we are at the end of the set:
1660 if((++m_position == m_end) || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_set))
1661 {
1662 fail(regex_constants::error_range, m_position - m_base);
1663 return result;
1664 }
1665 --m_position;
1666 }
1667 result.first = *m_position++;
1668 return result;
1669 case regex_constants::syntax_escape:
1670 // check to see if escapes are supported first:
1671 if(this->flags() & regex_constants::no_escape_in_lists)
1672 {
1673 result = *m_position++;
1674 break;
1675 }
1676 ++m_position;
1677 result = unescape_character();
1678 break;
1679 case regex_constants::syntax_open_set:
1680 {
1681 if(m_end == ++m_position)
1682 {
1683 fail(regex_constants::error_collate, m_position - m_base);
1684 return result;
1685 }
1686 if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_dot)
1687 {
1688 --m_position;
1689 result.first = *m_position;
1690 ++m_position;
1691 return result;
1692 }
1693 if(m_end == ++m_position)
1694 {
1695 fail(regex_constants::error_collate, m_position - m_base);
1696 return result;
1697 }
1698 const charT* name_first = m_position;
1699 // skip at least one character, then find the matching ':]'
1700 if(m_end == ++m_position)
1701 {
1702 fail(regex_constants::error_collate, name_first - m_base);
1703 return result;
1704 }
1705 while((m_position != m_end)
1706 && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_dot))
1707 ++m_position;
1708 const charT* name_last = m_position;
1709 if(m_end == m_position)
1710 {
1711 fail(regex_constants::error_collate, name_first - m_base);
1712 return result;
1713 }
1714 if((m_end == ++m_position)
1715 || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_set))
1716 {
1717 fail(regex_constants::error_collate, name_first - m_base);
1718 return result;
1719 }
1720 ++m_position;
1721 string_type s = this->m_traits.lookup_collatename(name_first, name_last);
1722 if(s.empty() || (s.size() > 2))
1723 {
1724 fail(regex_constants::error_collate, name_first - m_base);
1725 return result;
1726 }
1727 result.first = s[0];
1728 if(s.size() > 1)
1729 result.second = s[1];
1730 else
1731 result.second = 0;
1732 return result;
1733 }
1734 default:
1735 result = *m_position++;
1736 }
1737 return result;
1738 }
1739
1740 //
1741 // does a value fit in the specified charT type?
1742 //
1743 template <class charT>
valid_value(charT,boost::intmax_t v,const mpl::true_ &)1744 bool valid_value(charT, boost::intmax_t v, const mpl::true_&)
1745 {
1746 return (v >> (sizeof(charT) * CHAR_BIT)) == 0;
1747 }
1748 template <class charT>
valid_value(charT,boost::intmax_t,const mpl::false_ &)1749 bool valid_value(charT, boost::intmax_t, const mpl::false_&)
1750 {
1751 return true; // v will alsways fit in a charT
1752 }
1753 template <class charT>
valid_value(charT c,boost::intmax_t v)1754 bool valid_value(charT c, boost::intmax_t v)
1755 {
1756 return valid_value(c, v, mpl::bool_<(sizeof(charT) < sizeof(boost::intmax_t))>());
1757 }
1758
1759 template <class charT, class traits>
unescape_character()1760 charT basic_regex_parser<charT, traits>::unescape_character()
1761 {
1762 #ifdef BOOST_MSVC
1763 #pragma warning(push)
1764 #pragma warning(disable:4127)
1765 #endif
1766 charT result(0);
1767 if(m_position == m_end)
1768 {
1769 fail(regex_constants::error_escape, m_position - m_base, "Escape sequence terminated prematurely.");
1770 return false;
1771 }
1772 switch(this->m_traits.escape_syntax_type(*m_position))
1773 {
1774 case regex_constants::escape_type_control_a:
1775 result = charT('\a');
1776 break;
1777 case regex_constants::escape_type_e:
1778 result = charT(27);
1779 break;
1780 case regex_constants::escape_type_control_f:
1781 result = charT('\f');
1782 break;
1783 case regex_constants::escape_type_control_n:
1784 result = charT('\n');
1785 break;
1786 case regex_constants::escape_type_control_r:
1787 result = charT('\r');
1788 break;
1789 case regex_constants::escape_type_control_t:
1790 result = charT('\t');
1791 break;
1792 case regex_constants::escape_type_control_v:
1793 result = charT('\v');
1794 break;
1795 case regex_constants::escape_type_word_assert:
1796 result = charT('\b');
1797 break;
1798 case regex_constants::escape_type_ascii_control:
1799 ++m_position;
1800 if(m_position == m_end)
1801 {
1802 // Rewind to start of escape:
1803 --m_position;
1804 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position;
1805 fail(regex_constants::error_escape, m_position - m_base, "ASCII escape sequence terminated prematurely.");
1806 return result;
1807 }
1808 result = static_cast<charT>(*m_position % 32);
1809 break;
1810 case regex_constants::escape_type_hex:
1811 ++m_position;
1812 if(m_position == m_end)
1813 {
1814 // Rewind to start of escape:
1815 --m_position;
1816 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position;
1817 fail(regex_constants::error_escape, m_position - m_base, "Hexadecimal escape sequence terminated prematurely.");
1818 return result;
1819 }
1820 // maybe have \x{ddd}
1821 if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_open_brace)
1822 {
1823 ++m_position;
1824 if(m_position == m_end)
1825 {
1826 // Rewind to start of escape:
1827 --m_position;
1828 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position;
1829 fail(regex_constants::error_escape, m_position - m_base, "Missing } in hexadecimal escape sequence.");
1830 return result;
1831 }
1832 boost::intmax_t i = this->m_traits.toi(m_position, m_end, 16);
1833 if((m_position == m_end)
1834 || (i < 0)
1835 || ((std::numeric_limits<charT>::is_specialized) && (i > (boost::intmax_t)(std::numeric_limits<charT>::max)()))
1836 || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_brace))
1837 {
1838 // Rewind to start of escape:
1839 --m_position;
1840 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position;
1841 fail(regex_constants::error_badbrace, m_position - m_base, "Hexadecimal escape sequence was invalid.");
1842 return result;
1843 }
1844 ++m_position;
1845 result = charT(i);
1846 }
1847 else
1848 {
1849 std::ptrdiff_t len = (std::min)(static_cast<std::ptrdiff_t>(2), static_cast<std::ptrdiff_t>(m_end - m_position));
1850 boost::intmax_t i = this->m_traits.toi(m_position, m_position + len, 16);
1851 if((i < 0)
1852 || !valid_value(charT(0), i))
1853 {
1854 // Rewind to start of escape:
1855 --m_position;
1856 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position;
1857 fail(regex_constants::error_escape, m_position - m_base, "Escape sequence did not encode a valid character.");
1858 return result;
1859 }
1860 result = charT(i);
1861 }
1862 return result;
1863 case regex_constants::syntax_digit:
1864 {
1865 // an octal escape sequence, the first character must be a zero
1866 // followed by up to 3 octal digits:
1867 std::ptrdiff_t len = (std::min)(::boost::BOOST_REGEX_DETAIL_NS::distance(m_position, m_end), static_cast<std::ptrdiff_t>(4));
1868 const charT* bp = m_position;
1869 boost::intmax_t val = this->m_traits.toi(bp, bp + 1, 8);
1870 if(val != 0)
1871 {
1872 // Rewind to start of escape:
1873 --m_position;
1874 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position;
1875 // Oops not an octal escape after all:
1876 fail(regex_constants::error_escape, m_position - m_base, "Invalid octal escape sequence.");
1877 return result;
1878 }
1879 val = this->m_traits.toi(m_position, m_position + len, 8);
1880 if((val < 0) || (val > (boost::intmax_t)(std::numeric_limits<charT>::max)()))
1881 {
1882 // Rewind to start of escape:
1883 --m_position;
1884 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position;
1885 fail(regex_constants::error_escape, m_position - m_base, "Octal escape sequence is invalid.");
1886 return result;
1887 }
1888 return static_cast<charT>(val);
1889 }
1890 case regex_constants::escape_type_named_char:
1891 {
1892 ++m_position;
1893 if(m_position == m_end)
1894 {
1895 // Rewind to start of escape:
1896 --m_position;
1897 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position;
1898 fail(regex_constants::error_escape, m_position - m_base);
1899 return false;
1900 }
1901 // maybe have \N{name}
1902 if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_open_brace)
1903 {
1904 const charT* base = m_position;
1905 // skip forward until we find enclosing brace:
1906 while((m_position != m_end) && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_brace))
1907 ++m_position;
1908 if(m_position == m_end)
1909 {
1910 // Rewind to start of escape:
1911 --m_position;
1912 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position;
1913 fail(regex_constants::error_escape, m_position - m_base);
1914 return false;
1915 }
1916 string_type s = this->m_traits.lookup_collatename(++base, m_position++);
1917 if(s.empty())
1918 {
1919 // Rewind to start of escape:
1920 --m_position;
1921 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position;
1922 fail(regex_constants::error_collate, m_position - m_base);
1923 return false;
1924 }
1925 if(s.size() == 1)
1926 {
1927 return s[0];
1928 }
1929 }
1930 // fall through is a failure:
1931 // Rewind to start of escape:
1932 --m_position;
1933 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position;
1934 fail(regex_constants::error_escape, m_position - m_base);
1935 return false;
1936 }
1937 default:
1938 result = *m_position;
1939 break;
1940 }
1941 ++m_position;
1942 return result;
1943 #ifdef BOOST_MSVC
1944 #pragma warning(pop)
1945 #endif
1946 }
1947
1948 template <class charT, class traits>
parse_backref()1949 bool basic_regex_parser<charT, traits>::parse_backref()
1950 {
1951 BOOST_ASSERT(m_position != m_end);
1952 const charT* pc = m_position;
1953 boost::intmax_t i = this->m_traits.toi(pc, pc + 1, 10);
1954 if((i == 0) || (((this->flags() & regbase::main_option_type) == regbase::perl_syntax_group) && (this->flags() & regbase::no_bk_refs)))
1955 {
1956 // not a backref at all but an octal escape sequence:
1957 charT c = unescape_character();
1958 this->append_literal(c);
1959 }
1960 else if((i > 0) && (this->m_backrefs.test(i)))
1961 {
1962 m_position = pc;
1963 re_brace* pb = static_cast<re_brace*>(this->append_state(syntax_element_backref, sizeof(re_brace)));
1964 pb->index = i;
1965 pb->icase = this->flags() & regbase::icase;
1966 }
1967 else
1968 {
1969 // Rewind to start of escape:
1970 --m_position;
1971 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position;
1972 fail(regex_constants::error_backref, m_position - m_base);
1973 return false;
1974 }
1975 return true;
1976 }
1977
1978 template <class charT, class traits>
parse_QE()1979 bool basic_regex_parser<charT, traits>::parse_QE()
1980 {
1981 #ifdef BOOST_MSVC
1982 #pragma warning(push)
1983 #pragma warning(disable:4127)
1984 #endif
1985 //
1986 // parse a \Q...\E sequence:
1987 //
1988 ++m_position; // skip the Q
1989 const charT* start = m_position;
1990 const charT* end;
1991 do
1992 {
1993 while((m_position != m_end)
1994 && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape))
1995 ++m_position;
1996 if(m_position == m_end)
1997 {
1998 // a \Q...\E sequence may terminate with the end of the expression:
1999 end = m_position;
2000 break;
2001 }
2002 if(++m_position == m_end) // skip the escape
2003 {
2004 fail(regex_constants::error_escape, m_position - m_base, "Unterminated \\Q...\\E sequence.");
2005 return false;
2006 }
2007 // check to see if it's a \E:
2008 if(this->m_traits.escape_syntax_type(*m_position) == regex_constants::escape_type_E)
2009 {
2010 ++m_position;
2011 end = m_position - 2;
2012 break;
2013 }
2014 // otherwise go round again:
2015 }while(true);
2016 //
2017 // now add all the character between the two escapes as literals:
2018 //
2019 while(start != end)
2020 {
2021 this->append_literal(*start);
2022 ++start;
2023 }
2024 return true;
2025 #ifdef BOOST_MSVC
2026 #pragma warning(pop)
2027 #endif
2028 }
2029
2030 template <class charT, class traits>
parse_perl_extension()2031 bool basic_regex_parser<charT, traits>::parse_perl_extension()
2032 {
2033 if(++m_position == m_end)
2034 {
2035 // Rewind to start of (? sequence:
2036 --m_position;
2037 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2038 fail(regex_constants::error_perl_extension, m_position - m_base);
2039 return false;
2040 }
2041 //
2042 // treat comments as a special case, as these
2043 // are the only ones that don't start with a leading
2044 // startmark state:
2045 //
2046 if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_hash)
2047 {
2048 while((m_position != m_end)
2049 && (this->m_traits.syntax_type(*m_position++) != regex_constants::syntax_close_mark))
2050 {}
2051 return true;
2052 }
2053 //
2054 // backup some state, and prepare the way:
2055 //
2056 int markid = 0;
2057 std::ptrdiff_t jump_offset = 0;
2058 re_brace* pb = static_cast<re_brace*>(this->append_state(syntax_element_startmark, sizeof(re_brace)));
2059 pb->icase = this->flags() & regbase::icase;
2060 std::ptrdiff_t last_paren_start = this->getoffset(pb);
2061 // back up insertion point for alternations, and set new point:
2062 std::ptrdiff_t last_alt_point = m_alt_insert_point;
2063 this->m_pdata->m_data.align();
2064 m_alt_insert_point = this->m_pdata->m_data.size();
2065 std::ptrdiff_t expected_alt_point = m_alt_insert_point;
2066 bool restore_flags = true;
2067 regex_constants::syntax_option_type old_flags = this->flags();
2068 bool old_case_change = m_has_case_change;
2069 m_has_case_change = false;
2070 charT name_delim;
2071 int mark_reset = m_mark_reset;
2072 int max_mark = m_max_mark;
2073 m_mark_reset = -1;
2074 m_max_mark = m_mark_count;
2075 boost::intmax_t v;
2076 //
2077 // select the actual extension used:
2078 //
2079 switch(this->m_traits.syntax_type(*m_position))
2080 {
2081 case regex_constants::syntax_or:
2082 m_mark_reset = m_mark_count;
2083 BOOST_FALLTHROUGH;
2084 case regex_constants::syntax_colon:
2085 //
2086 // a non-capturing mark:
2087 //
2088 pb->index = markid = 0;
2089 ++m_position;
2090 break;
2091 case regex_constants::syntax_digit:
2092 {
2093 //
2094 // a recursive subexpression:
2095 //
2096 v = this->m_traits.toi(m_position, m_end, 10);
2097 if((v < 0) || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark))
2098 {
2099 // Rewind to start of (? sequence:
2100 --m_position;
2101 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2102 fail(regex_constants::error_perl_extension, m_position - m_base, "The recursive sub-expression refers to an invalid marking group, or is unterminated.");
2103 return false;
2104 }
2105 insert_recursion:
2106 pb->index = markid = 0;
2107 re_recurse* pr = static_cast<re_recurse*>(this->append_state(syntax_element_recurse, sizeof(re_recurse)));
2108 pr->alt.i = v;
2109 pr->state_id = 0;
2110 static_cast<re_case*>(
2111 this->append_state(syntax_element_toggle_case, sizeof(re_case))
2112 )->icase = this->flags() & regbase::icase;
2113 break;
2114 }
2115 case regex_constants::syntax_plus:
2116 //
2117 // A forward-relative recursive subexpression:
2118 //
2119 ++m_position;
2120 v = this->m_traits.toi(m_position, m_end, 10);
2121 if((v <= 0) || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark))
2122 {
2123 // Rewind to start of (? sequence:
2124 --m_position;
2125 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2126 fail(regex_constants::error_perl_extension, m_position - m_base, "An invalid or unterminated recursive sub-expression.");
2127 return false;
2128 }
2129 if ((std::numeric_limits<boost::intmax_t>::max)() - m_mark_count < v)
2130 {
2131 fail(regex_constants::error_perl_extension, m_position - m_base, "An invalid or unterminated recursive sub-expression.");
2132 return false;
2133 }
2134 v += m_mark_count;
2135 goto insert_recursion;
2136 case regex_constants::syntax_dash:
2137 //
2138 // Possibly a backward-relative recursive subexpression:
2139 //
2140 ++m_position;
2141 v = this->m_traits.toi(m_position, m_end, 10);
2142 if(v <= 0)
2143 {
2144 --m_position;
2145 // Oops not a relative recursion at all, but a (?-imsx) group:
2146 goto option_group_jump;
2147 }
2148 v = static_cast<boost::intmax_t>(m_mark_count) + 1 - v;
2149 if(v <= 0)
2150 {
2151 // Rewind to start of (? sequence:
2152 --m_position;
2153 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2154 fail(regex_constants::error_perl_extension, m_position - m_base, "An invalid or unterminated recursive sub-expression.");
2155 return false;
2156 }
2157 goto insert_recursion;
2158 case regex_constants::syntax_equal:
2159 pb->index = markid = -1;
2160 ++m_position;
2161 jump_offset = this->getoffset(this->append_state(syntax_element_jump, sizeof(re_jump)));
2162 this->m_pdata->m_data.align();
2163 m_alt_insert_point = this->m_pdata->m_data.size();
2164 break;
2165 case regex_constants::syntax_not:
2166 pb->index = markid = -2;
2167 ++m_position;
2168 jump_offset = this->getoffset(this->append_state(syntax_element_jump, sizeof(re_jump)));
2169 this->m_pdata->m_data.align();
2170 m_alt_insert_point = this->m_pdata->m_data.size();
2171 break;
2172 case regex_constants::escape_type_left_word:
2173 {
2174 // a lookbehind assertion:
2175 if(++m_position == m_end)
2176 {
2177 // Rewind to start of (? sequence:
2178 --m_position;
2179 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2180 fail(regex_constants::error_perl_extension, m_position - m_base);
2181 return false;
2182 }
2183 regex_constants::syntax_type t = this->m_traits.syntax_type(*m_position);
2184 if(t == regex_constants::syntax_not)
2185 pb->index = markid = -2;
2186 else if(t == regex_constants::syntax_equal)
2187 pb->index = markid = -1;
2188 else
2189 {
2190 // Probably a named capture which also starts (?< :
2191 name_delim = '>';
2192 --m_position;
2193 goto named_capture_jump;
2194 }
2195 ++m_position;
2196 jump_offset = this->getoffset(this->append_state(syntax_element_jump, sizeof(re_jump)));
2197 this->append_state(syntax_element_backstep, sizeof(re_brace));
2198 this->m_pdata->m_data.align();
2199 m_alt_insert_point = this->m_pdata->m_data.size();
2200 break;
2201 }
2202 case regex_constants::escape_type_right_word:
2203 //
2204 // an independent sub-expression:
2205 //
2206 pb->index = markid = -3;
2207 ++m_position;
2208 jump_offset = this->getoffset(this->append_state(syntax_element_jump, sizeof(re_jump)));
2209 this->m_pdata->m_data.align();
2210 m_alt_insert_point = this->m_pdata->m_data.size();
2211 break;
2212 case regex_constants::syntax_open_mark:
2213 {
2214 // a conditional expression:
2215 pb->index = markid = -4;
2216 if(++m_position == m_end)
2217 {
2218 // Rewind to start of (? sequence:
2219 --m_position;
2220 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2221 fail(regex_constants::error_perl_extension, m_position - m_base);
2222 return false;
2223 }
2224 v = this->m_traits.toi(m_position, m_end, 10);
2225 if(m_position == m_end)
2226 {
2227 // Rewind to start of (? sequence:
2228 --m_position;
2229 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2230 fail(regex_constants::error_perl_extension, m_position - m_base);
2231 return false;
2232 }
2233 if(*m_position == charT('R'))
2234 {
2235 if(++m_position == m_end)
2236 {
2237 // Rewind to start of (? sequence:
2238 --m_position;
2239 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2240 fail(regex_constants::error_perl_extension, m_position - m_base);
2241 return false;
2242 }
2243 if(*m_position == charT('&'))
2244 {
2245 const charT* base = ++m_position;
2246 while((m_position != m_end) && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark))
2247 ++m_position;
2248 if(m_position == m_end)
2249 {
2250 // Rewind to start of (? sequence:
2251 --m_position;
2252 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2253 fail(regex_constants::error_perl_extension, m_position - m_base);
2254 return false;
2255 }
2256 v = -static_cast<int>(hash_value_from_capture_name(base, m_position));
2257 }
2258 else
2259 {
2260 v = -this->m_traits.toi(m_position, m_end, 10);
2261 }
2262 re_brace* br = static_cast<re_brace*>(this->append_state(syntax_element_assert_backref, sizeof(re_brace)));
2263 br->index = v < 0 ? (v - 1) : 0;
2264 if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark)
2265 {
2266 // Rewind to start of (? sequence:
2267 --m_position;
2268 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2269 fail(regex_constants::error_perl_extension, m_position - m_base);
2270 return false;
2271 }
2272 if(++m_position == m_end)
2273 {
2274 // Rewind to start of (? sequence:
2275 --m_position;
2276 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2277 fail(regex_constants::error_perl_extension, m_position - m_base);
2278 return false;
2279 }
2280 }
2281 else if((*m_position == charT('\'')) || (*m_position == charT('<')))
2282 {
2283 const charT* base = ++m_position;
2284 while((m_position != m_end) && (*m_position != charT('>')) && (*m_position != charT('\'')))
2285 ++m_position;
2286 if(m_position == m_end)
2287 {
2288 // Rewind to start of (? sequence:
2289 --m_position;
2290 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2291 fail(regex_constants::error_perl_extension, m_position - m_base);
2292 return false;
2293 }
2294 v = static_cast<int>(hash_value_from_capture_name(base, m_position));
2295 re_brace* br = static_cast<re_brace*>(this->append_state(syntax_element_assert_backref, sizeof(re_brace)));
2296 br->index = v;
2297 if(((*m_position != charT('>')) && (*m_position != charT('\''))) || (++m_position == m_end))
2298 {
2299 // Rewind to start of (? sequence:
2300 --m_position;
2301 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2302 fail(regex_constants::error_perl_extension, m_position - m_base, "Unterminated named capture.");
2303 return false;
2304 }
2305 if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark)
2306 {
2307 // Rewind to start of (? sequence:
2308 --m_position;
2309 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2310 fail(regex_constants::error_perl_extension, m_position - m_base);
2311 return false;
2312 }
2313 if(++m_position == m_end)
2314 {
2315 // Rewind to start of (? sequence:
2316 --m_position;
2317 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2318 fail(regex_constants::error_perl_extension, m_position - m_base);
2319 return false;
2320 }
2321 }
2322 else if(*m_position == charT('D'))
2323 {
2324 const char* def = "DEFINE";
2325 while(*def && (m_position != m_end) && (*m_position == charT(*def)))
2326 ++m_position, ++def;
2327 if((m_position == m_end) || *def)
2328 {
2329 // Rewind to start of (? sequence:
2330 --m_position;
2331 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2332 fail(regex_constants::error_perl_extension, m_position - m_base);
2333 return false;
2334 }
2335 re_brace* br = static_cast<re_brace*>(this->append_state(syntax_element_assert_backref, sizeof(re_brace)));
2336 br->index = 9999; // special magic value!
2337 if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark)
2338 {
2339 // Rewind to start of (? sequence:
2340 --m_position;
2341 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2342 fail(regex_constants::error_perl_extension, m_position - m_base);
2343 return false;
2344 }
2345 if(++m_position == m_end)
2346 {
2347 // Rewind to start of (? sequence:
2348 --m_position;
2349 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2350 fail(regex_constants::error_perl_extension, m_position - m_base);
2351 return false;
2352 }
2353 }
2354 else if(v > 0)
2355 {
2356 re_brace* br = static_cast<re_brace*>(this->append_state(syntax_element_assert_backref, sizeof(re_brace)));
2357 br->index = v;
2358 if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark)
2359 {
2360 // Rewind to start of (? sequence:
2361 --m_position;
2362 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2363 fail(regex_constants::error_perl_extension, m_position - m_base);
2364 return false;
2365 }
2366 if(++m_position == m_end)
2367 {
2368 // Rewind to start of (? sequence:
2369 --m_position;
2370 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2371 fail(regex_constants::error_perl_extension, m_position - m_base);
2372 return false;
2373 }
2374 }
2375 else
2376 {
2377 // verify that we have a lookahead or lookbehind assert:
2378 if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_question)
2379 {
2380 // Rewind to start of (? sequence:
2381 --m_position;
2382 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2383 fail(regex_constants::error_perl_extension, m_position - m_base);
2384 return false;
2385 }
2386 if(++m_position == m_end)
2387 {
2388 // Rewind to start of (? sequence:
2389 --m_position;
2390 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2391 fail(regex_constants::error_perl_extension, m_position - m_base);
2392 return false;
2393 }
2394 if(this->m_traits.syntax_type(*m_position) == regex_constants::escape_type_left_word)
2395 {
2396 if(++m_position == m_end)
2397 {
2398 // Rewind to start of (? sequence:
2399 --m_position;
2400 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2401 fail(regex_constants::error_perl_extension, m_position - m_base);
2402 return false;
2403 }
2404 if((this->m_traits.syntax_type(*m_position) != regex_constants::syntax_equal)
2405 && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_not))
2406 {
2407 // Rewind to start of (? sequence:
2408 --m_position;
2409 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2410 fail(regex_constants::error_perl_extension, m_position - m_base);
2411 return false;
2412 }
2413 m_position -= 3;
2414 }
2415 else
2416 {
2417 if((this->m_traits.syntax_type(*m_position) != regex_constants::syntax_equal)
2418 && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_not))
2419 {
2420 // Rewind to start of (? sequence:
2421 --m_position;
2422 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2423 fail(regex_constants::error_perl_extension, m_position - m_base);
2424 return false;
2425 }
2426 m_position -= 2;
2427 }
2428 }
2429 break;
2430 }
2431 case regex_constants::syntax_close_mark:
2432 // Rewind to start of (? sequence:
2433 --m_position;
2434 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2435 fail(regex_constants::error_perl_extension, m_position - m_base);
2436 return false;
2437 case regex_constants::escape_type_end_buffer:
2438 {
2439 name_delim = *m_position;
2440 named_capture_jump:
2441 markid = 0;
2442 if(0 == (this->flags() & regbase::nosubs))
2443 {
2444 markid = ++m_mark_count;
2445 #ifndef BOOST_NO_STD_DISTANCE
2446 if(this->flags() & regbase::save_subexpression_location)
2447 this->m_pdata->m_subs.push_back(std::pair<std::size_t, std::size_t>(std::distance(m_base, m_position) - 2, 0));
2448 #else
2449 if(this->flags() & regbase::save_subexpression_location)
2450 this->m_pdata->m_subs.push_back(std::pair<std::size_t, std::size_t>((m_position - m_base) - 2, 0));
2451 #endif
2452 }
2453 pb->index = markid;
2454 const charT* base = ++m_position;
2455 if(m_position == m_end)
2456 {
2457 // Rewind to start of (? sequence:
2458 --m_position;
2459 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2460 fail(regex_constants::error_perl_extension, m_position - m_base);
2461 return false;
2462 }
2463 while((m_position != m_end) && (*m_position != name_delim))
2464 ++m_position;
2465 if(m_position == m_end)
2466 {
2467 // Rewind to start of (? sequence:
2468 --m_position;
2469 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2470 fail(regex_constants::error_perl_extension, m_position - m_base);
2471 return false;
2472 }
2473 this->m_pdata->set_name(base, m_position, markid);
2474 ++m_position;
2475 break;
2476 }
2477 default:
2478 if(*m_position == charT('R'))
2479 {
2480 ++m_position;
2481 v = 0;
2482 if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark)
2483 {
2484 // Rewind to start of (? sequence:
2485 --m_position;
2486 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2487 fail(regex_constants::error_perl_extension, m_position - m_base);
2488 return false;
2489 }
2490 goto insert_recursion;
2491 }
2492 if(*m_position == charT('&'))
2493 {
2494 ++m_position;
2495 const charT* base = m_position;
2496 while((m_position != m_end) && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark))
2497 ++m_position;
2498 if(m_position == m_end)
2499 {
2500 // Rewind to start of (? sequence:
2501 --m_position;
2502 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2503 fail(regex_constants::error_perl_extension, m_position - m_base);
2504 return false;
2505 }
2506 v = static_cast<int>(hash_value_from_capture_name(base, m_position));
2507 goto insert_recursion;
2508 }
2509 if(*m_position == charT('P'))
2510 {
2511 ++m_position;
2512 if(m_position == m_end)
2513 {
2514 // Rewind to start of (? sequence:
2515 --m_position;
2516 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2517 fail(regex_constants::error_perl_extension, m_position - m_base);
2518 return false;
2519 }
2520 if(*m_position == charT('>'))
2521 {
2522 ++m_position;
2523 const charT* base = m_position;
2524 while((m_position != m_end) && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark))
2525 ++m_position;
2526 if(m_position == m_end)
2527 {
2528 // Rewind to start of (? sequence:
2529 --m_position;
2530 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2531 fail(regex_constants::error_perl_extension, m_position - m_base);
2532 return false;
2533 }
2534 v = static_cast<int>(hash_value_from_capture_name(base, m_position));
2535 goto insert_recursion;
2536 }
2537 }
2538 //
2539 // lets assume that we have a (?imsx) group and try and parse it:
2540 //
2541 option_group_jump:
2542 regex_constants::syntax_option_type opts = parse_options();
2543 if(m_position == m_end)
2544 {
2545 // Rewind to start of (? sequence:
2546 --m_position;
2547 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2548 fail(regex_constants::error_perl_extension, m_position - m_base);
2549 return false;
2550 }
2551 // make a note of whether we have a case change:
2552 m_has_case_change = ((opts & regbase::icase) != (this->flags() & regbase::icase));
2553 pb->index = markid = 0;
2554 if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_close_mark)
2555 {
2556 // update flags and carry on as normal:
2557 this->flags(opts);
2558 restore_flags = false;
2559 old_case_change |= m_has_case_change; // defer end of scope by one ')'
2560 }
2561 else if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_colon)
2562 {
2563 // update flags and carry on until the matching ')' is found:
2564 this->flags(opts);
2565 ++m_position;
2566 }
2567 else
2568 {
2569 // Rewind to start of (? sequence:
2570 --m_position;
2571 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2572 fail(regex_constants::error_perl_extension, m_position - m_base);
2573 return false;
2574 }
2575
2576 // finally append a case change state if we need it:
2577 if(m_has_case_change)
2578 {
2579 static_cast<re_case*>(
2580 this->append_state(syntax_element_toggle_case, sizeof(re_case))
2581 )->icase = opts & regbase::icase;
2582 }
2583
2584 }
2585 //
2586 // now recursively add more states, this will terminate when we get to a
2587 // matching ')' :
2588 //
2589 parse_all();
2590 //
2591 // Unwind alternatives:
2592 //
2593 if(0 == unwind_alts(last_paren_start))
2594 {
2595 // Rewind to start of (? sequence:
2596 --m_position;
2597 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2598 fail(regex_constants::error_perl_extension, m_position - m_base, "Invalid alternation operators within (?...) block.");
2599 return false;
2600 }
2601 //
2602 // we either have a ')' or we have run out of characters prematurely:
2603 //
2604 if(m_position == m_end)
2605 {
2606 // Rewind to start of (? sequence:
2607 --m_position;
2608 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2609 this->fail(regex_constants::error_paren, ::boost::BOOST_REGEX_DETAIL_NS::distance(m_base, m_end));
2610 return false;
2611 }
2612 BOOST_ASSERT(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_close_mark);
2613 ++m_position;
2614 //
2615 // restore the flags:
2616 //
2617 if(restore_flags)
2618 {
2619 // append a case change state if we need it:
2620 if(m_has_case_change)
2621 {
2622 static_cast<re_case*>(
2623 this->append_state(syntax_element_toggle_case, sizeof(re_case))
2624 )->icase = old_flags & regbase::icase;
2625 }
2626 this->flags(old_flags);
2627 }
2628 //
2629 // set up the jump pointer if we have one:
2630 //
2631 if(jump_offset)
2632 {
2633 this->m_pdata->m_data.align();
2634 re_jump* jmp = static_cast<re_jump*>(this->getaddress(jump_offset));
2635 jmp->alt.i = this->m_pdata->m_data.size() - this->getoffset(jmp);
2636 if((this->m_last_state == jmp) && (markid != -2))
2637 {
2638 // Oops... we didn't have anything inside the assertion.
2639 // Note we don't get here for negated forward lookahead as (?!)
2640 // does have some uses.
2641 // Rewind to start of (? sequence:
2642 --m_position;
2643 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2644 fail(regex_constants::error_perl_extension, m_position - m_base, "Invalid or empty zero width assertion.");
2645 return false;
2646 }
2647 }
2648 //
2649 // verify that if this is conditional expression, that we do have
2650 // an alternative, if not add one:
2651 //
2652 if(markid == -4)
2653 {
2654 re_syntax_base* b = this->getaddress(expected_alt_point);
2655 // Make sure we have exactly one alternative following this state:
2656 if(b->type != syntax_element_alt)
2657 {
2658 re_alt* alt = static_cast<re_alt*>(this->insert_state(expected_alt_point, syntax_element_alt, sizeof(re_alt)));
2659 alt->alt.i = this->m_pdata->m_data.size() - this->getoffset(alt);
2660 }
2661 else if(((std::ptrdiff_t)this->m_pdata->m_data.size() > (static_cast<re_alt*>(b)->alt.i + this->getoffset(b))) && (static_cast<re_alt*>(b)->alt.i > 0) && this->getaddress(static_cast<re_alt*>(b)->alt.i, b)->type == syntax_element_alt)
2662 {
2663 // Can't have seen more than one alternative:
2664 // Rewind to start of (? sequence:
2665 --m_position;
2666 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2667 fail(regex_constants::error_bad_pattern, m_position - m_base, "More than one alternation operator | was encountered inside a conditional expression.");
2668 return false;
2669 }
2670 else
2671 {
2672 // We must *not* have seen an alternative inside a (DEFINE) block:
2673 b = this->getaddress(b->next.i, b);
2674 if((b->type == syntax_element_assert_backref) && (static_cast<re_brace*>(b)->index == 9999))
2675 {
2676 // Rewind to start of (? sequence:
2677 --m_position;
2678 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2679 fail(regex_constants::error_bad_pattern, m_position - m_base, "Alternation operators are not allowed inside a DEFINE block.");
2680 return false;
2681 }
2682 }
2683 // check for invalid repetition of next state:
2684 b = this->getaddress(expected_alt_point);
2685 b = this->getaddress(static_cast<re_alt*>(b)->next.i, b);
2686 if((b->type != syntax_element_assert_backref)
2687 && (b->type != syntax_element_startmark))
2688 {
2689 // Rewind to start of (? sequence:
2690 --m_position;
2691 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2692 fail(regex_constants::error_badrepeat, m_position - m_base, "A repetition operator cannot be applied to a zero-width assertion.");
2693 return false;
2694 }
2695 }
2696 //
2697 // append closing parenthesis state:
2698 //
2699 pb = static_cast<re_brace*>(this->append_state(syntax_element_endmark, sizeof(re_brace)));
2700 pb->index = markid;
2701 pb->icase = this->flags() & regbase::icase;
2702 this->m_paren_start = last_paren_start;
2703 //
2704 // restore the alternate insertion point:
2705 //
2706 this->m_alt_insert_point = last_alt_point;
2707 //
2708 // and the case change data:
2709 //
2710 m_has_case_change = old_case_change;
2711 //
2712 // And the mark_reset data:
2713 //
2714 if(m_max_mark > m_mark_count)
2715 {
2716 m_mark_count = m_max_mark;
2717 }
2718 m_mark_reset = mark_reset;
2719 m_max_mark = max_mark;
2720
2721
2722 if(markid > 0)
2723 {
2724 #ifndef BOOST_NO_STD_DISTANCE
2725 if(this->flags() & regbase::save_subexpression_location)
2726 this->m_pdata->m_subs.at((std::size_t)markid - 1).second = std::distance(m_base, m_position) - 1;
2727 #else
2728 if(this->flags() & regbase::save_subexpression_location)
2729 this->m_pdata->m_subs.at(markid - 1).second = (m_position - m_base) - 1;
2730 #endif
2731 //
2732 // allow backrefs to this mark:
2733 //
2734 this->m_backrefs.set(markid);
2735 }
2736 return true;
2737 }
2738
2739 template <class charT, class traits>
match_verb(const char * verb)2740 bool basic_regex_parser<charT, traits>::match_verb(const char* verb)
2741 {
2742 while(*verb)
2743 {
2744 if(static_cast<charT>(*verb) != *m_position)
2745 {
2746 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2747 fail(regex_constants::error_perl_extension, m_position - m_base);
2748 return false;
2749 }
2750 if(++m_position == m_end)
2751 {
2752 --m_position;
2753 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2754 fail(regex_constants::error_perl_extension, m_position - m_base);
2755 return false;
2756 }
2757 ++verb;
2758 }
2759 return true;
2760 }
2761
2762 #ifdef BOOST_MSVC
2763 # pragma warning(push)
2764 #if BOOST_MSVC >= 1800
2765 #pragma warning(disable:26812)
2766 #endif
2767 #endif
2768 template <class charT, class traits>
parse_perl_verb()2769 bool basic_regex_parser<charT, traits>::parse_perl_verb()
2770 {
2771 if(++m_position == m_end)
2772 {
2773 // Rewind to start of (* sequence:
2774 --m_position;
2775 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2776 fail(regex_constants::error_perl_extension, m_position - m_base);
2777 return false;
2778 }
2779 switch(*m_position)
2780 {
2781 case 'F':
2782 if(++m_position == m_end)
2783 {
2784 // Rewind to start of (* sequence:
2785 --m_position;
2786 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2787 fail(regex_constants::error_perl_extension, m_position - m_base);
2788 return false;
2789 }
2790 if((this->m_traits.syntax_type(*m_position) == regex_constants::syntax_close_mark) || match_verb("AIL"))
2791 {
2792 if((m_position == m_end) || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark))
2793 {
2794 // Rewind to start of (* sequence:
2795 --m_position;
2796 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2797 fail(regex_constants::error_perl_extension, m_position - m_base);
2798 return false;
2799 }
2800 ++m_position;
2801 this->append_state(syntax_element_fail);
2802 return true;
2803 }
2804 break;
2805 case 'A':
2806 if(++m_position == m_end)
2807 {
2808 // Rewind to start of (* sequence:
2809 --m_position;
2810 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2811 fail(regex_constants::error_perl_extension, m_position - m_base);
2812 return false;
2813 }
2814 if(match_verb("CCEPT"))
2815 {
2816 if((m_position == m_end) || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark))
2817 {
2818 // Rewind to start of (* sequence:
2819 --m_position;
2820 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2821 fail(regex_constants::error_perl_extension, m_position - m_base);
2822 return false;
2823 }
2824 ++m_position;
2825 this->append_state(syntax_element_accept);
2826 return true;
2827 }
2828 break;
2829 case 'C':
2830 if(++m_position == m_end)
2831 {
2832 // Rewind to start of (* sequence:
2833 --m_position;
2834 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2835 fail(regex_constants::error_perl_extension, m_position - m_base);
2836 return false;
2837 }
2838 if(match_verb("OMMIT"))
2839 {
2840 if((m_position == m_end) || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark))
2841 {
2842 // Rewind to start of (* sequence:
2843 --m_position;
2844 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2845 fail(regex_constants::error_perl_extension, m_position - m_base);
2846 return false;
2847 }
2848 ++m_position;
2849 static_cast<re_commit*>(this->append_state(syntax_element_commit, sizeof(re_commit)))->action = commit_commit;
2850 this->m_pdata->m_disable_match_any = true;
2851 return true;
2852 }
2853 break;
2854 case 'P':
2855 if(++m_position == m_end)
2856 {
2857 // Rewind to start of (* sequence:
2858 --m_position;
2859 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2860 fail(regex_constants::error_perl_extension, m_position - m_base);
2861 return false;
2862 }
2863 if(match_verb("RUNE"))
2864 {
2865 if((m_position == m_end) || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark))
2866 {
2867 // Rewind to start of (* sequence:
2868 --m_position;
2869 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2870 fail(regex_constants::error_perl_extension, m_position - m_base);
2871 return false;
2872 }
2873 ++m_position;
2874 static_cast<re_commit*>(this->append_state(syntax_element_commit, sizeof(re_commit)))->action = commit_prune;
2875 this->m_pdata->m_disable_match_any = true;
2876 return true;
2877 }
2878 break;
2879 case 'S':
2880 if(++m_position == m_end)
2881 {
2882 // Rewind to start of (* sequence:
2883 --m_position;
2884 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2885 fail(regex_constants::error_perl_extension, m_position - m_base);
2886 return false;
2887 }
2888 if(match_verb("KIP"))
2889 {
2890 if((m_position == m_end) || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark))
2891 {
2892 // Rewind to start of (* sequence:
2893 --m_position;
2894 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2895 fail(regex_constants::error_perl_extension, m_position - m_base);
2896 return false;
2897 }
2898 ++m_position;
2899 static_cast<re_commit*>(this->append_state(syntax_element_commit, sizeof(re_commit)))->action = commit_skip;
2900 this->m_pdata->m_disable_match_any = true;
2901 return true;
2902 }
2903 break;
2904 case 'T':
2905 if(++m_position == m_end)
2906 {
2907 // Rewind to start of (* sequence:
2908 --m_position;
2909 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2910 fail(regex_constants::error_perl_extension, m_position - m_base);
2911 return false;
2912 }
2913 if(match_verb("HEN"))
2914 {
2915 if((m_position == m_end) || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark))
2916 {
2917 // Rewind to start of (* sequence:
2918 --m_position;
2919 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2920 fail(regex_constants::error_perl_extension, m_position - m_base);
2921 return false;
2922 }
2923 ++m_position;
2924 this->append_state(syntax_element_then);
2925 this->m_pdata->m_disable_match_any = true;
2926 return true;
2927 }
2928 break;
2929 }
2930 // Rewind to start of (* sequence:
2931 --m_position;
2932 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2933 fail(regex_constants::error_perl_extension, m_position - m_base);
2934 return false;
2935 }
2936 #ifdef BOOST_MSVC
2937 # pragma warning(pop)
2938 #endif
2939
2940 template <class charT, class traits>
add_emacs_code(bool negate)2941 bool basic_regex_parser<charT, traits>::add_emacs_code(bool negate)
2942 {
2943 //
2944 // parses an emacs style \sx or \Sx construct.
2945 //
2946 if(++m_position == m_end)
2947 {
2948 // Rewind to start of sequence:
2949 --m_position;
2950 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position;
2951 fail(regex_constants::error_escape, m_position - m_base);
2952 return false;
2953 }
2954 basic_char_set<charT, traits> char_set;
2955 if(negate)
2956 char_set.negate();
2957
2958 static const charT s_punct[5] = { 'p', 'u', 'n', 'c', 't', };
2959
2960 switch(*m_position)
2961 {
2962 case 's':
2963 case ' ':
2964 char_set.add_class(this->m_mask_space);
2965 break;
2966 case 'w':
2967 char_set.add_class(this->m_word_mask);
2968 break;
2969 case '_':
2970 char_set.add_single(digraph<charT>(charT('$')));
2971 char_set.add_single(digraph<charT>(charT('&')));
2972 char_set.add_single(digraph<charT>(charT('*')));
2973 char_set.add_single(digraph<charT>(charT('+')));
2974 char_set.add_single(digraph<charT>(charT('-')));
2975 char_set.add_single(digraph<charT>(charT('_')));
2976 char_set.add_single(digraph<charT>(charT('<')));
2977 char_set.add_single(digraph<charT>(charT('>')));
2978 break;
2979 case '.':
2980 char_set.add_class(this->m_traits.lookup_classname(s_punct, s_punct+5));
2981 break;
2982 case '(':
2983 char_set.add_single(digraph<charT>(charT('(')));
2984 char_set.add_single(digraph<charT>(charT('[')));
2985 char_set.add_single(digraph<charT>(charT('{')));
2986 break;
2987 case ')':
2988 char_set.add_single(digraph<charT>(charT(')')));
2989 char_set.add_single(digraph<charT>(charT(']')));
2990 char_set.add_single(digraph<charT>(charT('}')));
2991 break;
2992 case '"':
2993 char_set.add_single(digraph<charT>(charT('"')));
2994 char_set.add_single(digraph<charT>(charT('\'')));
2995 char_set.add_single(digraph<charT>(charT('`')));
2996 break;
2997 case '\'':
2998 char_set.add_single(digraph<charT>(charT('\'')));
2999 char_set.add_single(digraph<charT>(charT(',')));
3000 char_set.add_single(digraph<charT>(charT('#')));
3001 break;
3002 case '<':
3003 char_set.add_single(digraph<charT>(charT(';')));
3004 break;
3005 case '>':
3006 char_set.add_single(digraph<charT>(charT('\n')));
3007 char_set.add_single(digraph<charT>(charT('\f')));
3008 break;
3009 default:
3010 fail(regex_constants::error_ctype, m_position - m_base);
3011 return false;
3012 }
3013 if(0 == this->append_set(char_set))
3014 {
3015 fail(regex_constants::error_ctype, m_position - m_base);
3016 return false;
3017 }
3018 ++m_position;
3019 return true;
3020 }
3021
3022 template <class charT, class traits>
parse_options()3023 regex_constants::syntax_option_type basic_regex_parser<charT, traits>::parse_options()
3024 {
3025 // we have a (?imsx-imsx) group, convert it into a set of flags:
3026 regex_constants::syntax_option_type f = this->flags();
3027 bool breakout = false;
3028 do
3029 {
3030 switch(*m_position)
3031 {
3032 case 's':
3033 f |= regex_constants::mod_s;
3034 f &= ~regex_constants::no_mod_s;
3035 break;
3036 case 'm':
3037 f &= ~regex_constants::no_mod_m;
3038 break;
3039 case 'i':
3040 f |= regex_constants::icase;
3041 break;
3042 case 'x':
3043 f |= regex_constants::mod_x;
3044 break;
3045 default:
3046 breakout = true;
3047 continue;
3048 }
3049 if(++m_position == m_end)
3050 {
3051 // Rewind to start of (? sequence:
3052 --m_position;
3053 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
3054 fail(regex_constants::error_paren, m_position - m_base);
3055 return false;
3056 }
3057 }
3058 while(!breakout);
3059
3060 breakout = false;
3061
3062 if(*m_position == static_cast<charT>('-'))
3063 {
3064 if(++m_position == m_end)
3065 {
3066 // Rewind to start of (? sequence:
3067 --m_position;
3068 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
3069 fail(regex_constants::error_paren, m_position - m_base);
3070 return false;
3071 }
3072 do
3073 {
3074 switch(*m_position)
3075 {
3076 case 's':
3077 f &= ~regex_constants::mod_s;
3078 f |= regex_constants::no_mod_s;
3079 break;
3080 case 'm':
3081 f |= regex_constants::no_mod_m;
3082 break;
3083 case 'i':
3084 f &= ~regex_constants::icase;
3085 break;
3086 case 'x':
3087 f &= ~regex_constants::mod_x;
3088 break;
3089 default:
3090 breakout = true;
3091 continue;
3092 }
3093 if(++m_position == m_end)
3094 {
3095 // Rewind to start of (? sequence:
3096 --m_position;
3097 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
3098 fail(regex_constants::error_paren, m_position - m_base);
3099 return false;
3100 }
3101 }
3102 while(!breakout);
3103 }
3104 return f;
3105 }
3106
3107 template <class charT, class traits>
unwind_alts(std::ptrdiff_t last_paren_start)3108 bool basic_regex_parser<charT, traits>::unwind_alts(std::ptrdiff_t last_paren_start)
3109 {
3110 //
3111 // If we didn't actually add any states after the last
3112 // alternative then that's an error:
3113 //
3114 if((this->m_alt_insert_point == static_cast<std::ptrdiff_t>(this->m_pdata->m_data.size()))
3115 && m_alt_jumps.size() && (m_alt_jumps.back() > last_paren_start)
3116 &&
3117 !(
3118 ((this->flags() & regbase::main_option_type) == regbase::perl_syntax_group)
3119 &&
3120 ((this->flags() & regbase::no_empty_expressions) == 0)
3121 )
3122 )
3123 {
3124 fail(regex_constants::error_empty, this->m_position - this->m_base, "Can't terminate a sub-expression with an alternation operator |.");
3125 return false;
3126 }
3127 //
3128 // Fix up our alternatives:
3129 //
3130 while(m_alt_jumps.size() && (m_alt_jumps.back() > last_paren_start))
3131 {
3132 //
3133 // fix up the jump to point to the end of the states
3134 // that we've just added:
3135 //
3136 std::ptrdiff_t jump_offset = m_alt_jumps.back();
3137 m_alt_jumps.pop_back();
3138 this->m_pdata->m_data.align();
3139 re_jump* jmp = static_cast<re_jump*>(this->getaddress(jump_offset));
3140 BOOST_ASSERT(jmp->type == syntax_element_jump);
3141 jmp->alt.i = this->m_pdata->m_data.size() - jump_offset;
3142 }
3143 return true;
3144 }
3145
3146 #ifdef BOOST_MSVC
3147 #pragma warning(pop)
3148 #endif
3149
3150 } // namespace BOOST_REGEX_DETAIL_NS
3151 } // namespace boost
3152
3153 #ifdef BOOST_MSVC
3154 #pragma warning(push)
3155 #pragma warning(disable: 4103)
3156 #endif
3157 #ifdef BOOST_HAS_ABI_HEADERS
3158 # include BOOST_ABI_SUFFIX
3159 #endif
3160 #ifdef BOOST_MSVC
3161 #pragma warning(pop)
3162 #endif
3163
3164 #endif
3165