1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2012 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40 /* This module contains pcre_exec(), the externally visible function that does
41 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
42 possible. There are also some static supporting functions. */
43
44 #include "config.h"
45
46 #define NLBLOCK md /* Block containing newline information */
47 #define PSSTART start_subject /* Field containing processed string start */
48 #define PSEND end_subject /* Field containing processed string end */
49
50 #include "pcre_internal.h"
51
52 /* Undefine some potentially clashing cpp symbols */
53
54 #undef min
55 #undef max
56
/* Special call types for match(). The caller stores one of these in
md->match_function_type before the call instead of passing an extra argument,
because stack usage is to be discouraged. */

#define MATCH_CBEGROUP       2  /* Could-be-empty unlimited repeat group */
#define MATCH_CONDASSERT     1  /* Called to check a condition assertion */

/* Ordinary (non-error) results of match(). Error results are the externally
defined PCRE_ERROR_xxx codes, all of which are negative. */

#define MATCH_NOMATCH        0
#define MATCH_MATCH          1

/* Internal-only results of match(). They are made sufficiently negative that
they stay clear of the external error codes. */

#define MATCH_THEN         (-992)
#define MATCH_SKIP_ARG     (-993)
#define MATCH_SKIP         (-994)
#define MATCH_PRUNE        (-995)
#define MATCH_ONCE         (-996)
#define MATCH_KETRPOS      (-997)
#define MATCH_COMMIT       (-998)
#define MATCH_ACCEPT       (-999)

/* Largest number of offset-vector ints that a recursive call saves on the
stack; a bigger offset vector is saved with malloc instead. Keep this a
multiple of 3, because the offset vector is always a multiple of 3 long. */

#define REC_STACK_SAVE_MAX  30
87
/* Lower and upper bounds for the common repeats, held as two parallel
six-entry tables. In rep_max a value of 0 means "no upper limit".
NOTE(review): the code that indexes these tables is outside this view; the
values are consistent with the order *, *?, +, +?, ?, ?? - confirm. */

static const char rep_min[] = {
  0, 0,
  1, 1,
  0, 0
};

static const char rep_max[] = {
  0, 0,
  0, 0,
  1, 1
};
92
93
94
95 #ifdef PCRE_DEBUG
96 /*************************************************
97 * Debugging function to print chars *
98 *************************************************/
99
100 /* Print a sequence of chars in printable format, stopping at the end of the
101 subject if the requested.
102
103 Arguments:
104 p points to characters
105 length number to print
106 is_subject TRUE if printing from within md->start_subject
107 md pointer to matching data block, if is_subject is TRUE
108
109 Returns: nothing
110 */
111
112 static void
pchars(const pcre_uchar * p,int length,BOOL is_subject,match_data * md)113 pchars(const pcre_uchar *p, int length, BOOL is_subject, match_data *md)
114 {
115 unsigned int c;
116 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
117 while (length-- > 0)
118 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
119 }
120 #endif
121
122
123
124 /*************************************************
125 * Match a back-reference *
126 *************************************************/
127
128 /* Normally, if a back reference hasn't been set, the length that is passed is
129 negative, so the match always fails. However, in JavaScript compatibility mode,
130 the length passed is zero. Note that in caseless UTF-8 mode, the number of
131 subject bytes matched may be different to the number of reference bytes.
132
133 Arguments:
134 offset index into the offset vector
135 eptr pointer into the subject
136 length length of reference to be matched (number of bytes)
137 md points to match data block
138 caseless TRUE if caseless
139
140 Returns: >= 0 the number of subject bytes matched
141 -1 no match
142 -2 partial match; always given if at end subject
143 */
144
145 static int
match_ref(int offset,PCRE_PUCHAR eptr,int length,match_data * md,BOOL caseless)146 match_ref(int offset, PCRE_PUCHAR eptr, int length, match_data *md,
147 BOOL caseless)
148 {
149 PCRE_PUCHAR eptr_start = eptr;
150 PCRE_PUCHAR p = md->start_subject + md->offset_vector[offset];
151
152 #ifdef PCRE_DEBUG
153 if (eptr >= md->end_subject)
154 printf("matching subject <null>");
155 else
156 {
157 printf("matching subject ");
158 pchars(eptr, length, TRUE, md);
159 }
160 printf(" against backref ");
161 pchars(p, length, FALSE, md);
162 printf("\n");
163 #endif
164
165 /* Always fail if reference not set (and not JavaScript compatible - in that
166 case the length is passed as zero). */
167
168 if (length < 0) return -1;
169
170 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
171 properly if Unicode properties are supported. Otherwise, we can check only
172 ASCII characters. */
173
174 if (caseless)
175 {
176 #ifdef SUPPORT_UTF
177 #ifdef SUPPORT_UCP
178 if (md->utf)
179 {
180 /* Match characters up to the end of the reference. NOTE: the number of
181 bytes matched may differ, because there are some characters whose upper and
182 lower case versions code as different numbers of bytes. For example, U+023A
183 (2 bytes in UTF-8) is the upper case version of U+2C65 (3 bytes in UTF-8);
184 a sequence of 3 of the former uses 6 bytes, as does a sequence of two of
185 the latter. It is important, therefore, to check the length along the
186 reference, not along the subject (earlier code did this wrong). */
187
188 PCRE_PUCHAR endptr = p + length;
189 while (p < endptr)
190 {
191 int c, d;
192 if (eptr >= md->end_subject) return -2; /* Partial match */
193 GETCHARINC(c, eptr);
194 GETCHARINC(d, p);
195 if (c != d && c != UCD_OTHERCASE(d)) return -1;
196 }
197 }
198 else
199 #endif
200 #endif
201
202 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
203 is no UCP support. */
204 {
205 while (length-- > 0)
206 {
207 if (eptr >= md->end_subject) return -2; /* Partial match */
208 if (TABLE_GET(*p, md->lcc, *p) != TABLE_GET(*eptr, md->lcc, *eptr)) return -1;
209 p++;
210 eptr++;
211 }
212 }
213 }
214
215 /* In the caseful case, we can just compare the bytes, whether or not we
216 are in UTF-8 mode. */
217
218 else
219 {
220 while (length-- > 0)
221 {
222 if (eptr >= md->end_subject) return -2; /* Partial match */
223 if (*p++ != *eptr++) return -1;
224 }
225 }
226
227 return (int)(eptr - eptr_start);
228 }
229
230
231
232 /***************************************************************************
233 ****************************************************************************
234 RECURSION IN THE match() FUNCTION
235
236 The match() function is highly recursive, though not every recursive call
237 increases the recursive depth. Nevertheless, some regular expressions can cause
238 it to recurse to a great depth. I was writing for Unix, so I just let it call
239 itself recursively. This uses the stack for saving everything that has to be
240 saved for a recursive call. On Unix, the stack can be large, and this works
241 fine.
242
243 It turns out that on some non-Unix-like systems there are problems with
244 programs that use a lot of stack. (This despite the fact that every last chip
245 has oodles of memory these days, and techniques for extending the stack have
246 been known for decades.) So....
247
There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
calls by keeping local variables that need to be preserved in blocks of memory
obtained from malloc() instead of on the stack. Macros are used to achieve
this so that the actual code doesn't look very different to what it always
used to.
253
254 The original heap-recursive code used longjmp(). However, it seems that this
255 can be very slow on some operating systems. Following a suggestion from Stan
256 Switzer, the use of longjmp() has been abolished, at the cost of having to
257 provide a unique number for each call to RMATCH. There is no way of generating
258 a sequence of numbers at compile time in C. I have given them names, to make
259 them stand out more clearly.
260
261 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
262 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
263 tests. Furthermore, not using longjmp() means that local dynamic variables
264 don't have indeterminate values; this has meant that the frame size can be
265 reduced because the result can be "passed back" by straight setting of the
266 variable instead of being passed in the frame.
267 ****************************************************************************
268 ***************************************************************************/
269
/* Identifying numbers for the individual RMATCH calls, running consecutively
from 1 to 66. When this list is changed, the code at HEAP_RETURN below must be
updated in sync. The first name on each row carries its value explicitly so
that the numbering is easy to check by eye. */

enum {
  RM1  = 1,  RM2,  RM3,  RM4,  RM5,  RM6,
  RM7  = 7,  RM8,  RM9,  RM10, RM11, RM12,
  RM13 = 13, RM14, RM15, RM16, RM17, RM18,
  RM19 = 19, RM20, RM21, RM22, RM23, RM24,
  RM25 = 25, RM26, RM27, RM28, RM29, RM30,
  RM31 = 31, RM32, RM33, RM34, RM35, RM36,
  RM37 = 37, RM38, RM39, RM40, RM41, RM42,
  RM43 = 43, RM44, RM45, RM46, RM47, RM48,
  RM49 = 49, RM50, RM51, RM52, RM53, RM54,
  RM55 = 55, RM56, RM57, RM58, RM59, RM60,
  RM61 = 61, RM62, RM63, RM64, RM65, RM66
};
280
281 /* These versions of the macros use the stack, as normal. There are debugging
282 versions and production versions. Note that the "rw" argument of RMATCH isn't
283 actually used in this definition. */
284
285 #ifndef NO_RECURSE
286
/* RMATCH makes a genuine recursive call of match(), one level deeper, passing
the current mstart, and leaves the result in rrc. RRETURN is a plain return.
The last argument of RMATCH (the call-site number) is accepted but not used.
When PCRE_DEBUG is defined, both macros also report the source line with
printf(); note that the debugging RRETURN evaluates its argument twice. */

#ifndef PCRE_DEBUG

#define RMATCH(r_eptr,r_ecode,r_top,r_md,r_eptrb,r_where) \
  rrc = match(r_eptr,r_ecode,mstart,r_top,r_md,r_eptrb,rdepth+1)
#define RRETURN(r_value) return r_value

#else  /* PCRE_DEBUG */

#define RMATCH(r_eptr,r_ecode,r_top,r_md,r_eptrb,r_where) \
  { \
  printf("match() called in line %d\n", __LINE__); \
  rrc = match(r_eptr,r_ecode,mstart,r_top,r_md,r_eptrb,rdepth+1); \
  printf("to line %d\n", __LINE__); \
  }
#define RRETURN(r_value) \
  { \
  printf("match() returned %d from line %d ", r_value, __LINE__); \
  return r_value; \
  }

#endif  /* PCRE_DEBUG */
304
305 #else
306
307
/* These versions of the macros manage a private stack of heapframe blocks on
the heap, and are used when NO_RECURSE is defined.

RMATCH takes the frame that follows the current one in the chain, obtaining it
through the stack_malloc hook if there is none yet, records the call-site
number in the current frame's Xwhere, fills in the "argument" fields of the
new frame, makes it current, and jumps to HEAP_RECURSE. The label L_<number>
planted at the end is where execution is expected to resume afterwards
(presumably by a dispatch on Xwhere at HEAP_RETURN, which is outside this
view). The fourth argument is not used: it is the md argument of match(),
which never changes.

RRETURN steps back to the previous frame. If there is one, the result goes
into rrc and control jumps to HEAP_RETURN; otherwise the value is returned from
match() itself. Frames are not released here; they stay linked so that RMATCH
can use them again. Note that the value expression is evaluated only after
"frame" has been switched back. */

#define RMATCH(r_eptr,r_ecode,r_top,r_md,r_eptrb,r_where)\
  {\
  heapframe *rm_callee = frame->Xnextframe;\
  if (rm_callee == NULL)\
    {\
    rm_callee = (heapframe *)(PUBL(stack_malloc))(sizeof(heapframe));\
    if (rm_callee == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
    rm_callee->Xnextframe = NULL;\
    frame->Xnextframe = rm_callee;\
    }\
  frame->Xwhere = r_where;\
  rm_callee->Xeptr = r_eptr;\
  rm_callee->Xecode = r_ecode;\
  rm_callee->Xmstart = mstart;\
  rm_callee->Xoffset_top = r_top;\
  rm_callee->Xeptrb = r_eptrb;\
  rm_callee->Xrdepth = frame->Xrdepth + 1;\
  rm_callee->Xprevframe = frame;\
  frame = rm_callee;\
  DPRINTF(("restarting from line %d\n", __LINE__));\
  goto HEAP_RECURSE;\
  L_##r_where:\
  DPRINTF(("jumped back to line %d\n", __LINE__));\
  }

#define RRETURN(r_value)\
  {\
  heapframe *rr_finished = frame;\
  frame = rr_finished->Xprevframe;\
  if (frame != NULL)\
    {\
    rrc = r_value;\
    goto HEAP_RETURN;\
    }\
  return r_value;\
  }
348
349
350 /* Structure for remembering the local variables in a private frame */
351
352 typedef struct heapframe {
353 struct heapframe *Xprevframe;
354 struct heapframe *Xnextframe;
355
356 /* Function arguments that may change */
357
358 PCRE_PUCHAR Xeptr;
359 const pcre_uchar *Xecode;
360 PCRE_PUCHAR Xmstart;
361 int Xoffset_top;
362 eptrblock *Xeptrb;
363 unsigned int Xrdepth;
364
365 /* Function local variables */
366
367 PCRE_PUCHAR Xcallpat;
368 #ifdef SUPPORT_UTF
369 PCRE_PUCHAR Xcharptr;
370 #endif
371 PCRE_PUCHAR Xdata;
372 PCRE_PUCHAR Xnext;
373 PCRE_PUCHAR Xpp;
374 PCRE_PUCHAR Xprev;
375 PCRE_PUCHAR Xsaved_eptr;
376
377 recursion_info Xnew_recursive;
378
379 BOOL Xcur_is_word;
380 BOOL Xcondition;
381 BOOL Xprev_is_word;
382
383 #ifdef SUPPORT_UCP
384 int Xprop_type;
385 int Xprop_value;
386 int Xprop_fail_result;
387 int Xoclength;
388 pcre_uchar Xocchars[6];
389 #endif
390
391 int Xcodelink;
392 int Xctype;
393 unsigned int Xfc;
394 int Xfi;
395 int Xlength;
396 int Xmax;
397 int Xmin;
398 int Xnumber;
399 int Xoffset;
400 int Xop;
401 int Xsave_capture_last;
402 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
403 int Xstacksave[REC_STACK_SAVE_MAX];
404
405 eptrblock Xnewptrb;
406
407 /* Where to jump back to */
408
409 int Xwhere;
410
411 } heapframe;
412
413 #endif
414
415
416 /***************************************************************************
417 ***************************************************************************/
418
419
420
421 /*************************************************
422 * Match from current position *
423 *************************************************/
424
425 /* This function is called recursively in many circumstances. Whenever it
426 returns a negative (error) response, the outer incarnation must also return the
427 same response. */
428
/* These macros pack up the tests for partial matching that appear several
times in the code. CHECK_PARTIAL applies when partial matching is enabled, the
subject pointer has reached the end of the subject, and it is also beyond
md->start_used_ptr (i.e. something has been inspected): it sets the "hit end"
flag, and for hard partial matching (md->partial greater than 1) it returns
PCRE_ERROR_PARTIAL immediately. SCHECK_PARTIAL is the same without the
end-of-subject test, for use where the pointer is already known to be at or
past the end of the subject. */

#define CHECK_PARTIAL() \
  if (md->partial != 0 && \
      md->end_subject <= eptr && \
      md->start_used_ptr < eptr) \
    { \
    md->hitend = TRUE; \
    if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
    }

#define SCHECK_PARTIAL() \
  if (md->partial != 0 && \
      md->start_used_ptr < eptr) \
    { \
    md->hitend = TRUE; \
    if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
    }
450
451
452 /* Performance note: It might be tempting to extract commonly used fields from
453 the md structure (e.g. utf, end_subject) into individual variables to improve
454 performance. Tests using gcc on a SPARC disproved this; in the first case, it
455 made performance worse.
456
457 Arguments:
458 eptr pointer to current character in subject
459 ecode pointer to current position in compiled code
460 mstart pointer to the current match start position (can be modified
461 by encountering \K)
462 offset_top current top pointer
463 md pointer to "static" info for the match
464 eptrb pointer to chain of blocks containing eptr at start of
465 brackets - for testing for empty matches
466 rdepth the recursion depth
467
468 Returns: MATCH_MATCH if matched ) these values are >= 0
469 MATCH_NOMATCH if failed to match )
470 a negative MATCH_xxx value for PRUNE, SKIP, etc
471 a negative PCRE_ERROR_xxx value if aborted by an error condition
472 (e.g. stopped by repeated call or recursion limit)
473 */
474
475 static int
match(PCRE_PUCHAR eptr,const pcre_uchar * ecode,PCRE_PUCHAR mstart,int offset_top,match_data * md,eptrblock * eptrb,unsigned int rdepth)476 match(PCRE_PUCHAR eptr, const pcre_uchar *ecode,
477 PCRE_PUCHAR mstart, int offset_top, match_data *md, eptrblock *eptrb,
478 unsigned int rdepth)
479 {
480 /* These variables do not need to be preserved over recursion in this function,
481 so they can be ordinary variables in all cases. Mark some of them with
482 "register" because they are used a lot in loops. */
483
484 int rrc; /* Returns from recursive calls */
485 int i; /* Used for loops not involving calls to RMATCH() */
486 unsigned int c; /* Character values not kept over RMATCH() calls */
487 BOOL utf; /* Local copy of UTF flag for speed */
488
489 BOOL minimize, possessive; /* Quantifier options */
490 BOOL caseless;
491 int condcode;
492
493 /* When recursion is not being used, all "local" variables that have to be
494 preserved over calls to RMATCH() are part of a "frame". We set up the top-level
495 frame on the stack here; subsequent instantiations are obtained from the heap
496 whenever RMATCH() does a "recursion". See the macro definitions above. Putting
497 the top-level on the stack rather than malloc-ing them all gives a performance
498 boost in many cases where there is not much "recursion". */
499
500 #ifdef NO_RECURSE
501 heapframe *frame = (heapframe *)md->match_frames_base;
502
503 /* Copy in the original argument variables */
504
505 frame->Xeptr = eptr;
506 frame->Xecode = ecode;
507 frame->Xmstart = mstart;
508 frame->Xoffset_top = offset_top;
509 frame->Xeptrb = eptrb;
510 frame->Xrdepth = rdepth;
511
/* This is where control jumps back to in order to effect "recursion" */
513
514 HEAP_RECURSE:
515
516 /* Macros make the argument variables come from the current frame */
517
518 #define eptr frame->Xeptr
519 #define ecode frame->Xecode
520 #define mstart frame->Xmstart
521 #define offset_top frame->Xoffset_top
522 #define eptrb frame->Xeptrb
523 #define rdepth frame->Xrdepth
524
525 /* Ditto for the local variables */
526
527 #ifdef SUPPORT_UTF
528 #define charptr frame->Xcharptr
529 #endif
530 #define callpat frame->Xcallpat
531 #define codelink frame->Xcodelink
532 #define data frame->Xdata
533 #define next frame->Xnext
534 #define pp frame->Xpp
535 #define prev frame->Xprev
536 #define saved_eptr frame->Xsaved_eptr
537
538 #define new_recursive frame->Xnew_recursive
539
540 #define cur_is_word frame->Xcur_is_word
541 #define condition frame->Xcondition
542 #define prev_is_word frame->Xprev_is_word
543
544 #ifdef SUPPORT_UCP
545 #define prop_type frame->Xprop_type
546 #define prop_value frame->Xprop_value
547 #define prop_fail_result frame->Xprop_fail_result
548 #define oclength frame->Xoclength
549 #define occhars frame->Xocchars
550 #endif
551
552 #define ctype frame->Xctype
553 #define fc frame->Xfc
554 #define fi frame->Xfi
555 #define length frame->Xlength
556 #define max frame->Xmax
557 #define min frame->Xmin
558 #define number frame->Xnumber
559 #define offset frame->Xoffset
560 #define op frame->Xop
561 #define save_capture_last frame->Xsave_capture_last
562 #define save_offset1 frame->Xsave_offset1
563 #define save_offset2 frame->Xsave_offset2
564 #define save_offset3 frame->Xsave_offset3
565 #define stacksave frame->Xstacksave
566
567 #define newptrb frame->Xnewptrb
568
569 /* When recursion is being used, local variables are allocated on the stack and
570 get preserved during recursion in the normal way. In this environment, fi and
571 i, and fc and c, can be the same variables. */
572
573 #else /* NO_RECURSE not defined */
574 #define fi i
575 #define fc c
576
577 /* Many of the following variables are used only in small blocks of the code.
578 My normal style of coding would have declared them within each of those blocks.
579 However, in order to accommodate the version of this code that uses an external
580 "stack" implemented on the heap, it is easier to declare them all here, so the
581 declarations can be cut out in a block. The only declarations within blocks
582 below are for variables that do not have to be preserved over a recursive call
583 to RMATCH(). */
584
585 #ifdef SUPPORT_UTF
586 const pcre_uchar *charptr;
587 #endif
588 const pcre_uchar *callpat;
589 const pcre_uchar *data;
590 const pcre_uchar *next;
591 PCRE_PUCHAR pp;
592 const pcre_uchar *prev;
593 PCRE_PUCHAR saved_eptr;
594
595 recursion_info new_recursive;
596
597 BOOL cur_is_word;
598 BOOL condition;
599 BOOL prev_is_word;
600
601 #ifdef SUPPORT_UCP
602 int prop_type;
603 int prop_value;
604 int prop_fail_result;
605 int oclength;
606 pcre_uchar occhars[6];
607 #endif
608
609 int codelink;
610 int ctype;
611 int length;
612 int max;
613 int min;
614 int number;
615 int offset;
616 int op;
617 int save_capture_last;
618 int save_offset1, save_offset2, save_offset3;
619 int stacksave[REC_STACK_SAVE_MAX];
620
621 eptrblock newptrb;
622
623 /* There is a special fudge for calling match() in a way that causes it to
624 measure the size of its basic stack frame when the stack is being used for
625 recursion. The second argument (ecode) being NULL triggers this behaviour. It
626 cannot normally ever be NULL. The return is the negated value of the frame
627 size. */
628
629 if (ecode == NULL)
630 {
631 if (rdepth == 0)
632 return match((PCRE_PUCHAR)&rdepth, NULL, NULL, 0, NULL, NULL, 1);
633 else
634 {
635 int len = (char *)&rdepth - (char *)eptr;
636 return (len > 0)? -len : len;
637 }
638 }
639 #endif /* NO_RECURSE */
640
641 /* To save space on the stack and in the heap frame, I have doubled up on some
642 of the local variables that are used only in localised parts of the code, but
643 still need to be preserved over recursive calls of match(). These macros define
644 the alternative names that are used. */
645
646 #define allow_zero cur_is_word
647 #define cbegroup condition
648 #define code_offset codelink
649 #define condassert condition
650 #define matched_once prev_is_word
651 #define foc number
652 #define save_mark data
653
/* These statements are here to stop the compiler complaining about
uninitialized variables. */
656
657 #ifdef SUPPORT_UCP
658 prop_value = 0;
659 prop_fail_result = 0;
660 #endif
661
662
663 /* This label is used for tail recursion, which is used in a few cases even
664 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
665 used. Thanks to Ian Taylor for noticing this possibility and sending the
666 original patch. */
667
668 TAIL_RECURSE:
669
670 /* OK, now we can get on with the real code of the function. Recursive calls
671 are specified by the macro RMATCH and RRETURN is used to return. When
672 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
673 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
674 defined). However, RMATCH isn't like a function call because it's quite a
675 complicated macro. It has to be used in one particular way. This shouldn't,
676 however, impact performance when true recursion is being used. */
677
678 #ifdef SUPPORT_UTF
679 utf = md->utf; /* Local copy of the flag */
680 #else
681 utf = FALSE;
682 #endif
683
684 /* First check that we haven't called match() too many times, or that we
685 haven't exceeded the recursive call limit. */
686
687 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
688 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
689
690 /* At the start of a group with an unlimited repeat that may match an empty
691 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
692 done this way to save having to use another function argument, which would take
693 up space on the stack. See also MATCH_CONDASSERT below.
694
695 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
696 such remembered pointers, to be checked when we hit the closing ket, in order
697 to break infinite loops that match no characters. When match() is called in
698 other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
699 NOT be used with tail recursion, because the memory block that is used is on
700 the stack, so a new one may be required for each match(). */
701
702 if (md->match_function_type == MATCH_CBEGROUP)
703 {
704 newptrb.epb_saved_eptr = eptr;
705 newptrb.epb_prev = eptrb;
706 eptrb = &newptrb;
707 md->match_function_type = 0;
708 }
709
710 /* Now start processing the opcodes. */
711
712 for (;;)
713 {
714 minimize = possessive = FALSE;
715 op = *ecode;
716
717 switch(op)
718 {
719 case OP_MARK:
720 md->nomatch_mark = ecode + 2;
721 md->mark = NULL; /* In case previously set by assertion */
722 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
723 eptrb, RM55);
724 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
725 md->mark == NULL) md->mark = ecode + 2;
726
727 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
728 argument, and we must check whether that argument matches this MARK's
729 argument. It is passed back in md->start_match_ptr (an overloading of that
730 variable). If it does match, we reset that variable to the current subject
731 position and return MATCH_SKIP. Otherwise, pass back the return code
732 unaltered. */
733
734 else if (rrc == MATCH_SKIP_ARG &&
735 STRCMP_UC_UC(ecode + 2, md->start_match_ptr) == 0)
736 {
737 md->start_match_ptr = eptr;
738 RRETURN(MATCH_SKIP);
739 }
740 RRETURN(rrc);
741
742 case OP_FAIL:
743 RRETURN(MATCH_NOMATCH);
744
745 /* COMMIT overrides PRUNE, SKIP, and THEN */
746
747 case OP_COMMIT:
748 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
749 eptrb, RM52);
750 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE &&
751 rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG &&
752 rrc != MATCH_THEN)
753 RRETURN(rrc);
754 RRETURN(MATCH_COMMIT);
755
756 /* PRUNE overrides THEN */
757
758 case OP_PRUNE:
759 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
760 eptrb, RM51);
761 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
762 RRETURN(MATCH_PRUNE);
763
764 case OP_PRUNE_ARG:
765 md->nomatch_mark = ecode + 2;
766 md->mark = NULL; /* In case previously set by assertion */
767 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
768 eptrb, RM56);
769 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
770 md->mark == NULL) md->mark = ecode + 2;
771 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
772 RRETURN(MATCH_PRUNE);
773
774 /* SKIP overrides PRUNE and THEN */
775
776 case OP_SKIP:
777 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
778 eptrb, RM53);
779 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
780 RRETURN(rrc);
781 md->start_match_ptr = eptr; /* Pass back current position */
782 RRETURN(MATCH_SKIP);
783
784 /* Note that, for Perl compatibility, SKIP with an argument does NOT set
785 nomatch_mark. There is a flag that disables this opcode when re-matching a
786 pattern that ended with a SKIP for which there was not a matching MARK. */
787
788 case OP_SKIP_ARG:
789 if (md->ignore_skip_arg)
790 {
791 ecode += PRIV(OP_lengths)[*ecode] + ecode[1];
792 break;
793 }
794 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
795 eptrb, RM57);
796 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
797 RRETURN(rrc);
798
799 /* Pass back the current skip name by overloading md->start_match_ptr and
800 returning the special MATCH_SKIP_ARG return code. This will either be
801 caught by a matching MARK, or get to the top, where it causes a rematch
802 with the md->ignore_skip_arg flag set. */
803
804 md->start_match_ptr = ecode + 2;
805 RRETURN(MATCH_SKIP_ARG);
806
807 /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that
808 the branch in which it occurs can be determined. Overload the start of
809 match pointer to do this. */
810
811 case OP_THEN:
812 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
813 eptrb, RM54);
814 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
815 md->start_match_ptr = ecode;
816 RRETURN(MATCH_THEN);
817
818 case OP_THEN_ARG:
819 md->nomatch_mark = ecode + 2;
820 md->mark = NULL; /* In case previously set by assertion */
821 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top,
822 md, eptrb, RM58);
823 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
824 md->mark == NULL) md->mark = ecode + 2;
825 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
826 md->start_match_ptr = ecode;
827 RRETURN(MATCH_THEN);
828
829 /* Handle an atomic group that does not contain any capturing parentheses.
830 This can be handled like an assertion. Prior to 8.13, all atomic groups
831 were handled this way. In 8.13, the code was changed as below for ONCE, so
832 that backups pass through the group and thereby reset captured values.
833 However, this uses a lot more stack, so in 8.20, atomic groups that do not
834 contain any captures generate OP_ONCE_NC, which can be handled in the old,
835 less stack intensive way.
836
837 Check the alternative branches in turn - the matching won't pass the KET
838 for this kind of subpattern. If any one branch matches, we carry on as at
839 the end of a normal bracket, leaving the subject pointer, but resetting
840 the start-of-match value in case it was changed by \K. */
841
842 case OP_ONCE_NC:
843 prev = ecode;
844 saved_eptr = eptr;
845 save_mark = md->mark;
846 do
847 {
848 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM64);
849 if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
850 {
851 mstart = md->start_match_ptr;
852 break;
853 }
854 if (rrc == MATCH_THEN)
855 {
856 next = ecode + GET(ecode,1);
857 if (md->start_match_ptr < next &&
858 (*ecode == OP_ALT || *next == OP_ALT))
859 rrc = MATCH_NOMATCH;
860 }
861
862 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
863 ecode += GET(ecode,1);
864 md->mark = save_mark;
865 }
866 while (*ecode == OP_ALT);
867
868 /* If hit the end of the group (which could be repeated), fail */
869
870 if (*ecode != OP_ONCE_NC && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
871
872 /* Continue as from after the group, updating the offsets high water
873 mark, since extracts may have been taken. */
874
875 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
876
877 offset_top = md->end_offset_top;
878 eptr = md->end_match_ptr;
879
880 /* For a non-repeating ket, just continue at this level. This also
881 happens for a repeating ket if no characters were matched in the group.
882 This is the forcible breaking of infinite loops as implemented in Perl
883 5.005. */
884
885 if (*ecode == OP_KET || eptr == saved_eptr)
886 {
887 ecode += 1+LINK_SIZE;
888 break;
889 }
890
891 /* The repeating kets try the rest of the pattern or restart from the
892 preceding bracket, in the appropriate order. The second "call" of match()
893 uses tail recursion, to avoid using another stack frame. */
894
895 if (*ecode == OP_KETRMIN)
896 {
897 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM65);
898 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
899 ecode = prev;
900 goto TAIL_RECURSE;
901 }
902 else /* OP_KETRMAX */
903 {
904 RMATCH(eptr, prev, offset_top, md, eptrb, RM66);
905 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
906 ecode += 1 + LINK_SIZE;
907 goto TAIL_RECURSE;
908 }
909 /* Control never gets here */
910
911 /* Handle a capturing bracket, other than those that are possessive with an
912 unlimited repeat. If there is space in the offset vector, save the current
913 subject position in the working slot at the top of the vector. We mustn't
914 change the current values of the data slot, because they may be set from a
915 previous iteration of this group, and be referred to by a reference inside
916 the group. A failure to match might occur after the group has succeeded,
917 if something later on doesn't match. For this reason, we need to restore
918 the working value and also the values of the final offsets, in case they
919 were set by a previous iteration of the same bracket.
920
921 If there isn't enough space in the offset vector, treat this as if it were
922 a non-capturing bracket. Don't worry about setting the flag for the error
923 case here; that is handled in the code for KET. */
924
925 case OP_CBRA:
926 case OP_SCBRA:
927 number = GET2(ecode, 1+LINK_SIZE);
928 offset = number << 1;
929
930 #ifdef PCRE_DEBUG
931 printf("start bracket %d\n", number);
932 printf("subject=");
933 pchars(eptr, 16, TRUE, md);
934 printf("\n");
935 #endif
936
937 if (offset < md->offset_max)
938 {
939 save_offset1 = md->offset_vector[offset];
940 save_offset2 = md->offset_vector[offset+1];
941 save_offset3 = md->offset_vector[md->offset_end - number];
942 save_capture_last = md->capture_last;
943 save_mark = md->mark;
944
945 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
946 md->offset_vector[md->offset_end - number] =
947 (int)(eptr - md->start_subject);
948
949 for (;;)
950 {
951 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
952 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
953 eptrb, RM1);
954 if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */
955
956 /* If we backed up to a THEN, check whether it is within the current
957 branch by comparing the address of the THEN that is passed back with
958 the end of the branch. If it is within the current branch, and the
959 branch is one of two or more alternatives (it either starts or ends
960 with OP_ALT), we have reached the limit of THEN's action, so convert
961 the return code to NOMATCH, which will cause normal backtracking to
962 happen from now on. Otherwise, THEN is passed back to an outer
963 alternative. This implements Perl's treatment of parenthesized groups,
964 where a group not containing | does not affect the current alternative,
965 that is, (X) is NOT the same as (X|(*F)). */
966
967 if (rrc == MATCH_THEN)
968 {
969 next = ecode + GET(ecode,1);
970 if (md->start_match_ptr < next &&
971 (*ecode == OP_ALT || *next == OP_ALT))
972 rrc = MATCH_NOMATCH;
973 }
974
975 /* Anything other than NOMATCH is passed back. */
976
977 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
978 md->capture_last = save_capture_last;
979 ecode += GET(ecode, 1);
980 md->mark = save_mark;
981 if (*ecode != OP_ALT) break;
982 }
983
984 DPRINTF(("bracket %d failed\n", number));
985 md->offset_vector[offset] = save_offset1;
986 md->offset_vector[offset+1] = save_offset2;
987 md->offset_vector[md->offset_end - number] = save_offset3;
988
989 /* At this point, rrc will be one of MATCH_ONCE or MATCH_NOMATCH. */
990
991 RRETURN(rrc);
992 }
993
994 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
995 as a non-capturing bracket. */
996
997 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
998 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
999
1000 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1001
1002 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1003 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1004
1005 /* Non-capturing or atomic group, except for possessive with unlimited
1006 repeat and ONCE group with no captures. Loop for all the alternatives.
1007
1008 When we get to the final alternative within the brackets, we used to return
1009 the result of a recursive call to match() whatever happened so it was
1010 possible to reduce stack usage by turning this into a tail recursion,
1011 except in the case of a possibly empty group. However, now that there is
1012 the possibility of (*THEN) occurring in the final alternative, this
1013 optimization is no longer always possible.
1014
1015 We can optimize if we know there are no (*THEN)s in the pattern; at present
1016 this is the best that can be done.
1017
1018 MATCH_ONCE is returned when the end of an atomic group is successfully
1019 reached, but subsequent matching fails. It passes back up the tree (causing
1020 captured values to be reset) until the original atomic group level is
1021 reached. This is tested by comparing md->once_target with the start of the
1022 group. At this point, the return is converted into MATCH_NOMATCH so that
1023 previous backup points can be taken. */
1024
1025 case OP_ONCE:
1026 case OP_BRA:
1027 case OP_SBRA:
1028 DPRINTF(("start non-capturing bracket\n"));
1029
1030 for (;;)
1031 {
1032 if (op >= OP_SBRA || op == OP_ONCE)
1033 md->match_function_type = MATCH_CBEGROUP;
1034
1035 /* If this is not a possibly empty group, and there are no (*THEN)s in
1036 the pattern, and this is the final alternative, optimize as described
1037 above. */
1038
1039 else if (!md->hasthen && ecode[GET(ecode, 1)] != OP_ALT)
1040 {
1041 ecode += PRIV(OP_lengths)[*ecode];
1042 goto TAIL_RECURSE;
1043 }
1044
1045 /* In all other cases, we have to make another call to match(). */
1046
1047 save_mark = md->mark;
1048 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb,
1049 RM2);
1050
1051 /* See comment in the code for capturing groups above about handling
1052 THEN. */
1053
1054 if (rrc == MATCH_THEN)
1055 {
1056 next = ecode + GET(ecode,1);
1057 if (md->start_match_ptr < next &&
1058 (*ecode == OP_ALT || *next == OP_ALT))
1059 rrc = MATCH_NOMATCH;
1060 }
1061
1062 if (rrc != MATCH_NOMATCH)
1063 {
1064 if (rrc == MATCH_ONCE)
1065 {
1066 const pcre_uchar *scode = ecode;
1067 if (*scode != OP_ONCE) /* If not at start, find it */
1068 {
1069 while (*scode == OP_ALT) scode += GET(scode, 1);
1070 scode -= GET(scode, 1);
1071 }
1072 if (md->once_target == scode) rrc = MATCH_NOMATCH;
1073 }
1074 RRETURN(rrc);
1075 }
1076 ecode += GET(ecode, 1);
1077 md->mark = save_mark;
1078 if (*ecode != OP_ALT) break;
1079 }
1080
1081 RRETURN(MATCH_NOMATCH);
1082
1083 /* Handle possessive capturing brackets with an unlimited repeat. We come
1084 here from BRAZERO with allow_zero set TRUE. The offset_vector values are
1085 handled similarly to the normal case above. However, the matching is
1086 different. The end of these brackets will always be OP_KETRPOS, which
1087 returns MATCH_KETRPOS without going further in the pattern. By this means
1088 we can handle the group by iteration rather than recursion, thereby
1089 reducing the amount of stack needed. */
1090
1091 case OP_CBRAPOS:
1092 case OP_SCBRAPOS:
1093 allow_zero = FALSE;
1094
1095 POSSESSIVE_CAPTURE:
1096 number = GET2(ecode, 1+LINK_SIZE);
1097 offset = number << 1;
1098
1099 #ifdef PCRE_DEBUG
1100 printf("start possessive bracket %d\n", number);
1101 printf("subject=");
1102 pchars(eptr, 16, TRUE, md);
1103 printf("\n");
1104 #endif
1105
1106 if (offset < md->offset_max)
1107 {
1108 matched_once = FALSE;
1109 code_offset = (int)(ecode - md->start_code);
1110
1111 save_offset1 = md->offset_vector[offset];
1112 save_offset2 = md->offset_vector[offset+1];
1113 save_offset3 = md->offset_vector[md->offset_end - number];
1114 save_capture_last = md->capture_last;
1115
1116 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
1117
1118 /* Each time round the loop, save the current subject position for use
1119 when the group matches. For MATCH_MATCH, the group has matched, so we
1120 restart it with a new subject starting position, remembering that we had
1121 at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
1122 usual. If we haven't matched any alternatives in any iteration, check to
1123 see if a previous iteration matched. If so, the group has matched;
1124 continue from afterwards. Otherwise it has failed; restore the previous
1125 capture values before returning NOMATCH. */
1126
1127 for (;;)
1128 {
1129 md->offset_vector[md->offset_end - number] =
1130 (int)(eptr - md->start_subject);
1131 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1132 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1133 eptrb, RM63);
1134 if (rrc == MATCH_KETRPOS)
1135 {
1136 offset_top = md->end_offset_top;
1137 eptr = md->end_match_ptr;
1138 ecode = md->start_code + code_offset;
1139 save_capture_last = md->capture_last;
1140 matched_once = TRUE;
1141 continue;
1142 }
1143
1144 /* See comment in the code for capturing groups above about handling
1145 THEN. */
1146
1147 if (rrc == MATCH_THEN)
1148 {
1149 next = ecode + GET(ecode,1);
1150 if (md->start_match_ptr < next &&
1151 (*ecode == OP_ALT || *next == OP_ALT))
1152 rrc = MATCH_NOMATCH;
1153 }
1154
1155 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1156 md->capture_last = save_capture_last;
1157 ecode += GET(ecode, 1);
1158 if (*ecode != OP_ALT) break;
1159 }
1160
1161 if (!matched_once)
1162 {
1163 md->offset_vector[offset] = save_offset1;
1164 md->offset_vector[offset+1] = save_offset2;
1165 md->offset_vector[md->offset_end - number] = save_offset3;
1166 }
1167
1168 if (allow_zero || matched_once)
1169 {
1170 ecode += 1 + LINK_SIZE;
1171 break;
1172 }
1173
1174 RRETURN(MATCH_NOMATCH);
1175 }
1176
1177 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1178 as a non-capturing bracket. */
1179
1180 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1181 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1182
1183 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1184
1185 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1186 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1187
1188 /* Non-capturing possessive bracket with unlimited repeat. We come here
1189 from BRAZERO with allow_zero = TRUE. The code is similar to the above,
1190 without the capturing complication. It is written out separately for speed
1191 and cleanliness. */
1192
1193 case OP_BRAPOS:
1194 case OP_SBRAPOS:
1195 allow_zero = FALSE;
1196
1197 POSSESSIVE_NON_CAPTURE:
1198 matched_once = FALSE;
1199 code_offset = (int)(ecode - md->start_code);
1200
1201 for (;;)
1202 {
1203 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1204 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1205 eptrb, RM48);
1206 if (rrc == MATCH_KETRPOS)
1207 {
1208 offset_top = md->end_offset_top;
1209 eptr = md->end_match_ptr;
1210 ecode = md->start_code + code_offset;
1211 matched_once = TRUE;
1212 continue;
1213 }
1214
1215 /* See comment in the code for capturing groups above about handling
1216 THEN. */
1217
1218 if (rrc == MATCH_THEN)
1219 {
1220 next = ecode + GET(ecode,1);
1221 if (md->start_match_ptr < next &&
1222 (*ecode == OP_ALT || *next == OP_ALT))
1223 rrc = MATCH_NOMATCH;
1224 }
1225
1226 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1227 ecode += GET(ecode, 1);
1228 if (*ecode != OP_ALT) break;
1229 }
1230
1231 if (matched_once || allow_zero)
1232 {
1233 ecode += 1 + LINK_SIZE;
1234 break;
1235 }
1236 RRETURN(MATCH_NOMATCH);
1237
1238 /* Control never reaches here. */
1239
1240 /* Conditional group: compilation checked that there are no more than
1241 two branches. If the condition is false, skipping the first branch takes us
1242 past the end if there is only one branch, but that's OK because that is
1243 exactly what going to the ket would do. */
1244
1245 case OP_COND:
1246 case OP_SCOND:
1247 codelink = GET(ecode, 1);
1248
1249 /* Because of the way auto-callout works during compile, a callout item is
1250 inserted between OP_COND and an assertion condition. */
1251
1252 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
1253 {
1254 if (PUBL(callout) != NULL)
1255 {
1256 PUBL(callout_block) cb;
1257 cb.version = 2; /* Version 2 of the callout block */
1258 cb.callout_number = ecode[LINK_SIZE+2];
1259 cb.offset_vector = md->offset_vector;
1260 #ifdef COMPILE_PCRE8
1261 cb.subject = (PCRE_SPTR)md->start_subject;
1262 #else
1263 cb.subject = (PCRE_SPTR16)md->start_subject;
1264 #endif
1265 cb.subject_length = (int)(md->end_subject - md->start_subject);
1266 cb.start_match = (int)(mstart - md->start_subject);
1267 cb.current_position = (int)(eptr - md->start_subject);
1268 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
1269 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
1270 cb.capture_top = offset_top/2;
1271 cb.capture_last = md->capture_last;
1272 cb.callout_data = md->callout_data;
1273 cb.mark = md->nomatch_mark;
1274 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1275 if (rrc < 0) RRETURN(rrc);
1276 }
1277 ecode += PRIV(OP_lengths)[OP_CALLOUT];
1278 }
1279
1280 condcode = ecode[LINK_SIZE+1];
1281
1282 /* Now see what the actual condition is */
1283
1284 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
1285 {
1286 if (md->recursive == NULL) /* Not recursing => FALSE */
1287 {
1288 condition = FALSE;
1289 ecode += GET(ecode, 1);
1290 }
1291 else
1292 {
1293 int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
1294 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1295
1296 /* If the test is for recursion into a specific subpattern, and it is
1297 false, but the test was set up by name, scan the table to see if the
1298 name refers to any other numbers, and test them. The condition is true
1299 if any one is set. */
1300
1301 if (!condition && condcode == OP_NRREF)
1302 {
1303 pcre_uchar *slotA = md->name_table;
1304 for (i = 0; i < md->name_count; i++)
1305 {
1306 if (GET2(slotA, 0) == recno) break;
1307 slotA += md->name_entry_size;
1308 }
1309
1310 /* Found a name for the number - there can be only one; duplicate
1311 names for different numbers are allowed, but not vice versa. First
1312 scan down for duplicates. */
1313
1314 if (i < md->name_count)
1315 {
1316 pcre_uchar *slotB = slotA;
1317 while (slotB > md->name_table)
1318 {
1319 slotB -= md->name_entry_size;
1320 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1321 {
1322 condition = GET2(slotB, 0) == md->recursive->group_num;
1323 if (condition) break;
1324 }
1325 else break;
1326 }
1327
1328 /* Scan up for duplicates */
1329
1330 if (!condition)
1331 {
1332 slotB = slotA;
1333 for (i++; i < md->name_count; i++)
1334 {
1335 slotB += md->name_entry_size;
1336 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1337 {
1338 condition = GET2(slotB, 0) == md->recursive->group_num;
1339 if (condition) break;
1340 }
1341 else break;
1342 }
1343 }
1344 }
1345 }
1346
1347 /* Choose branch according to the condition */
1348
1349 ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1350 }
1351 }
1352
1353 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
1354 {
1355 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
1356 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1357
1358 /* If the numbered capture is unset, but the reference was by name,
1359 scan the table to see if the name refers to any other numbers, and test
1360 them. The condition is true if any one is set. This is tediously similar
1361 to the code above, but not close enough to try to amalgamate. */
1362
1363 if (!condition && condcode == OP_NCREF)
1364 {
1365 int refno = offset >> 1;
1366 pcre_uchar *slotA = md->name_table;
1367
1368 for (i = 0; i < md->name_count; i++)
1369 {
1370 if (GET2(slotA, 0) == refno) break;
1371 slotA += md->name_entry_size;
1372 }
1373
1374 /* Found a name for the number - there can be only one; duplicate names
1375 for different numbers are allowed, but not vice versa. First scan down
1376 for duplicates. */
1377
1378 if (i < md->name_count)
1379 {
1380 pcre_uchar *slotB = slotA;
1381 while (slotB > md->name_table)
1382 {
1383 slotB -= md->name_entry_size;
1384 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1385 {
1386 offset = GET2(slotB, 0) << 1;
1387 condition = offset < offset_top &&
1388 md->offset_vector[offset] >= 0;
1389 if (condition) break;
1390 }
1391 else break;
1392 }
1393
1394 /* Scan up for duplicates */
1395
1396 if (!condition)
1397 {
1398 slotB = slotA;
1399 for (i++; i < md->name_count; i++)
1400 {
1401 slotB += md->name_entry_size;
1402 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1403 {
1404 offset = GET2(slotB, 0) << 1;
1405 condition = offset < offset_top &&
1406 md->offset_vector[offset] >= 0;
1407 if (condition) break;
1408 }
1409 else break;
1410 }
1411 }
1412 }
1413 }
1414
1415 /* Choose branch according to the condition */
1416
1417 ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1418 }
1419
1420 else if (condcode == OP_DEF) /* DEFINE - always false */
1421 {
1422 condition = FALSE;
1423 ecode += GET(ecode, 1);
1424 }
1425
1426 /* The condition is an assertion. Call match() to evaluate it - setting
1427 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end of
1428 an assertion. */
1429
1430 else
1431 {
1432 md->match_function_type = MATCH_CONDASSERT;
1433 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM3);
1434 if (rrc == MATCH_MATCH)
1435 {
1436 if (md->end_offset_top > offset_top)
1437 offset_top = md->end_offset_top; /* Captures may have happened */
1438 condition = TRUE;
1439 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1440 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1441 }
1442
1443 /* PCRE doesn't allow the effect of (*THEN) to escape beyond an
1444 assertion; it is therefore treated as NOMATCH. */
1445
1446 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1447 {
1448 RRETURN(rrc); /* Need braces because of following else */
1449 }
1450 else
1451 {
1452 condition = FALSE;
1453 ecode += codelink;
1454 }
1455 }
1456
1457 /* We are now at the branch that is to be obeyed. As there is only one, can
1458 use tail recursion to avoid using another stack frame, except when there is
1459 unlimited repeat of a possibly empty group. In the latter case, a recursive
1460 call to match() is always required, unless the second alternative doesn't
1461 exist, in which case we can just plough on. Note that, for compatibility
1462 with Perl, the | in a conditional group is NOT treated as creating two
1463 alternatives. If a THEN is encountered in the branch, it propagates out to
1464 the enclosing alternative (unless nested in a deeper set of alternatives,
1465 of course). */
1466
1467 if (condition || *ecode == OP_ALT)
1468 {
1469 if (op != OP_SCOND)
1470 {
1471 ecode += 1 + LINK_SIZE;
1472 goto TAIL_RECURSE;
1473 }
1474
1475 md->match_function_type = MATCH_CBEGROUP;
1476 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);
1477 RRETURN(rrc);
1478 }
1479
1480 /* Condition false & no alternative; continue after the group. */
1481
1482 else
1483 {
1484 ecode += 1 + LINK_SIZE;
1485 }
1486 break;
1487
1488
1489 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1490 to close any currently open capturing brackets. */
1491
1492 case OP_CLOSE:
1493 number = GET2(ecode, 1);
1494 offset = number << 1;
1495
1496 #ifdef PCRE_DEBUG
1497 printf("end bracket %d at *ACCEPT", number);
1498 printf("\n");
1499 #endif
1500
1501 md->capture_last = number;
1502 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1503 {
1504 md->offset_vector[offset] =
1505 md->offset_vector[md->offset_end - number];
1506 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1507 if (offset_top <= offset) offset_top = offset + 2;
1508 }
1509 ecode += 1 + IMM2_SIZE;
1510 break;
1511
1512
1513 /* End of the pattern, either real or forced. */
1514
1515 case OP_END:
1516 case OP_ACCEPT:
1517 case OP_ASSERT_ACCEPT:
1518
1519 /* If we have matched an empty string, fail if not in an assertion and not
1520 in a recursion if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1521 is set and we have matched at the start of the subject. In both cases,
1522 backtracking will then try other alternatives, if any. */
1523
1524 if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1525 md->recursive == NULL &&
1526 (md->notempty ||
1527 (md->notempty_atstart &&
1528 mstart == md->start_subject + md->start_offset)))
1529 RRETURN(MATCH_NOMATCH);
1530
1531 /* Otherwise, we have a match. */
1532
1533 md->end_match_ptr = eptr; /* Record where we ended */
1534 md->end_offset_top = offset_top; /* and how many extracts were taken */
1535 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1536
1537 /* For some reason, the macros don't work properly if an expression is
1538 given as the argument to RRETURN when the heap is in use. */
1539
1540 rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1541 RRETURN(rrc);
1542
1543 /* Assertion brackets. Check the alternative branches in turn - the
1544 matching won't pass the KET for an assertion. If any one branch matches,
1545 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1546 start of each branch to move the current point backwards, so the code at
1547 this level is identical to the lookahead case. When the assertion is part
1548 of a condition, we want to return immediately afterwards. The caller of
1549 this incarnation of the match() function will have set MATCH_CONDASSERT in
1550 md->match_function_type, and one of these opcodes will be the first opcode
1551 that is processed. We use a local variable that is preserved over calls to
1552 match() to remember this case. */
1553
1554 case OP_ASSERT:
1555 case OP_ASSERTBACK:
1556 save_mark = md->mark;
1557 if (md->match_function_type == MATCH_CONDASSERT)
1558 {
1559 condassert = TRUE;
1560 md->match_function_type = 0;
1561 }
1562 else condassert = FALSE;
1563
1564 do
1565 {
1566 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1567 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1568 {
1569 mstart = md->start_match_ptr; /* In case \K reset it */
1570 break;
1571 }
1572 md->mark = save_mark;
1573
1574 /* A COMMIT failure must fail the entire assertion, without trying any
1575 subsequent branches. */
1576
1577 if (rrc == MATCH_COMMIT) RRETURN(MATCH_NOMATCH);
1578
1579 /* PCRE does not allow THEN to escape beyond an assertion; it
1580 is treated as NOMATCH. */
1581
1582 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1583 ecode += GET(ecode, 1);
1584 }
1585 while (*ecode == OP_ALT);
1586
1587 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
1588
1589 /* If checking an assertion for a condition, return MATCH_MATCH. */
1590
1591 if (condassert) RRETURN(MATCH_MATCH);
1592
1593 /* Continue from after the assertion, updating the offsets high water
1594 mark, since extracts may have been taken during the assertion. */
1595
1596 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1597 ecode += 1 + LINK_SIZE;
1598 offset_top = md->end_offset_top;
1599 continue;
1600
1601 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1602 PRUNE, or COMMIT means we must assume failure without checking subsequent
1603 branches. */
1604
1605 case OP_ASSERT_NOT:
1606 case OP_ASSERTBACK_NOT:
1607 save_mark = md->mark;
1608 if (md->match_function_type == MATCH_CONDASSERT)
1609 {
1610 condassert = TRUE;
1611 md->match_function_type = 0;
1612 }
1613 else condassert = FALSE;
1614
1615 do
1616 {
1617 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1618 md->mark = save_mark;
1619 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) RRETURN(MATCH_NOMATCH);
1620 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1621 {
1622 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1623 break;
1624 }
1625
1626 /* PCRE does not allow THEN to escape beyond an assertion; it is treated
1627 as NOMATCH. */
1628
1629 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1630 ecode += GET(ecode,1);
1631 }
1632 while (*ecode == OP_ALT);
1633
1634 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1635
1636 ecode += 1 + LINK_SIZE;
1637 continue;
1638
1639 /* Move the subject pointer back. This occurs only at the start of
1640 each branch of a lookbehind assertion. If we are too close to the start to
1641 move back, this match function fails. When working with UTF-8 we move
1642 back a number of characters, not bytes. */
1643
1644 case OP_REVERSE:
1645 #ifdef SUPPORT_UTF
1646 if (utf)
1647 {
1648 i = GET(ecode, 1);
1649 while (i-- > 0)
1650 {
1651 eptr--;
1652 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1653 BACKCHAR(eptr);
1654 }
1655 }
1656 else
1657 #endif
1658
1659 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1660
1661 {
1662 eptr -= GET(ecode, 1);
1663 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1664 }
1665
1666 /* Save the earliest consulted character, then skip to next op code */
1667
1668 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1669 ecode += 1 + LINK_SIZE;
1670 break;
1671
1672 /* The callout item calls an external function, if one is provided, passing
1673 details of the match so far. This is mainly for debugging, though the
1674 function is able to force a failure. */
1675
1676 case OP_CALLOUT:
1677 if (PUBL(callout) != NULL)
1678 {
1679 PUBL(callout_block) cb;
1680 cb.version = 2; /* Version 2 of the callout block */
1681 cb.callout_number = ecode[1];
1682 cb.offset_vector = md->offset_vector;
1683 #ifdef COMPILE_PCRE8
1684 cb.subject = (PCRE_SPTR)md->start_subject;
1685 #else
1686 cb.subject = (PCRE_SPTR16)md->start_subject;
1687 #endif
1688 cb.subject_length = (int)(md->end_subject - md->start_subject);
1689 cb.start_match = (int)(mstart - md->start_subject);
1690 cb.current_position = (int)(eptr - md->start_subject);
1691 cb.pattern_position = GET(ecode, 2);
1692 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1693 cb.capture_top = offset_top/2;
1694 cb.capture_last = md->capture_last;
1695 cb.callout_data = md->callout_data;
1696 cb.mark = md->nomatch_mark;
1697 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1698 if (rrc < 0) RRETURN(rrc);
1699 }
1700 ecode += 2 + 2*LINK_SIZE;
1701 break;
1702
1703 /* Recursion either matches the current regex, or some subexpression. The
1704 offset data is the offset to the starting bracket from the start of the
1705 whole pattern. (This is so that it works from duplicated subpatterns.)
1706
1707 The state of the capturing groups is preserved over recursion, and
1708 re-instated afterwards. We don't know how many are started and not yet
1709 finished (offset_top records the completed total) so we just have to save
1710 all the potential data. There may be up to 65535 such values, which is too
1711 large to put on the stack, but using malloc for small numbers seems
1712 expensive. As a compromise, the stack is used when there are no more than
1713 REC_STACK_SAVE_MAX values to store; otherwise malloc is used.
1714
1715 There are also other values that have to be saved. We use a chained
1716 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1717 for the original version of this logic. It has, however, been hacked around
1718 a lot, so he is not to blame for the current way it works. */
1719
1720 case OP_RECURSE:
1721 {
1722 recursion_info *ri;
1723 int recno;
1724
1725 callpat = md->start_code + GET(ecode, 1);
1726 recno = (callpat == md->start_code)? 0 :
1727 GET2(callpat, 1 + LINK_SIZE);
1728
1729 /* Check for repeating a recursion without advancing the subject pointer.
1730 This should catch convoluted mutual recursions. (Some simple cases are
1731 caught at compile time.) */
1732
1733 for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
1734 if (recno == ri->group_num && eptr == ri->subject_position)
1735 RRETURN(PCRE_ERROR_RECURSELOOP);
1736
1737 /* Add to "recursing stack" */
1738
1739 new_recursive.group_num = recno;
1740 new_recursive.subject_position = eptr;
1741 new_recursive.prevrec = md->recursive;
1742 md->recursive = &new_recursive;
1743
1744 /* Where to continue from afterwards */
1745
1746 ecode += 1 + LINK_SIZE;
1747
1748 /* Now save the offset data */
1749
1750 new_recursive.saved_max = md->offset_end;
1751 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1752 new_recursive.offset_save = stacksave;
1753 else
1754 {
1755 new_recursive.offset_save =
1756 (int *)(PUBL(malloc))(new_recursive.saved_max * sizeof(int));
1757 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1758 }
1759 memcpy(new_recursive.offset_save, md->offset_vector,
1760 new_recursive.saved_max * sizeof(int));
1761
1762 /* OK, now we can do the recursion. After processing each alternative,
1763 restore the offset data. If there were nested recursions, md->recursive
1764 might be changed, so reset it before looping. */
1765
1766 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1767 cbegroup = (*callpat >= OP_SBRA);
1768 do
1769 {
1770 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1771 RMATCH(eptr, callpat + PRIV(OP_lengths)[*callpat], offset_top,
1772 md, eptrb, RM6);
1773 memcpy(md->offset_vector, new_recursive.offset_save,
1774 new_recursive.saved_max * sizeof(int));
1775 md->recursive = new_recursive.prevrec;
1776 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1777 {
1778 DPRINTF(("Recursion matched\n"));
1779 if (new_recursive.offset_save != stacksave)
1780 (PUBL(free))(new_recursive.offset_save);
1781
1782 /* Set where we got to in the subject, and reset the start in case
1783 it was changed by \K. This *is* propagated back out of a recursion,
1784 for Perl compatibility. */
1785
1786 eptr = md->end_match_ptr;
1787 mstart = md->start_match_ptr;
1788 goto RECURSION_MATCHED; /* Exit loop; end processing */
1789 }
1790
1791 /* PCRE does not allow THEN or COMMIT to escape beyond a recursion; each
1792 is treated as NOMATCH. */
1793
1794 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN &&
1795 rrc != MATCH_COMMIT)
1796 {
1797 DPRINTF(("Recursion gave error %d\n", rrc));
1798 if (new_recursive.offset_save != stacksave)
1799 (PUBL(free))(new_recursive.offset_save);
1800 RRETURN(rrc);
1801 }
1802
1803 md->recursive = &new_recursive;
1804 callpat += GET(callpat, 1);
1805 }
1806 while (*callpat == OP_ALT);
1807
1808 DPRINTF(("Recursion didn't match\n"));
1809 md->recursive = new_recursive.prevrec;
1810 if (new_recursive.offset_save != stacksave)
1811 (PUBL(free))(new_recursive.offset_save);
1812 RRETURN(MATCH_NOMATCH);
1813 }
1814
1815 RECURSION_MATCHED:
1816 break;
1817
1818 /* An alternation is the end of a branch; scan along to find the end of the
1819 bracketed group and go to there. */
1820
1821 case OP_ALT:
1822 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1823 break;
1824
1825 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1826 indicating that it may occur zero times. It may repeat infinitely, or not
1827 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1828 with fixed upper repeat limits are compiled as a number of copies, with the
1829 optional ones preceded by BRAZERO or BRAMINZERO. */
1830
1831 case OP_BRAZERO:
1832 next = ecode + 1;
1833 RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1834 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1835 do next += GET(next, 1); while (*next == OP_ALT);
1836 ecode = next + 1 + LINK_SIZE;
1837 break;
1838
1839 case OP_BRAMINZERO:
1840 next = ecode + 1;
1841 do next += GET(next, 1); while (*next == OP_ALT);
1842 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1843 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1844 ecode++;
1845 break;
1846
1847 case OP_SKIPZERO:
1848 next = ecode+1;
1849 do next += GET(next,1); while (*next == OP_ALT);
1850 ecode = next + 1 + LINK_SIZE;
1851 break;
1852
1853 /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1854 here; just jump to the group, with allow_zero set TRUE. */
1855
1856 case OP_BRAPOSZERO:
1857 op = *(++ecode);
1858 allow_zero = TRUE;
1859 if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1860 goto POSSESSIVE_NON_CAPTURE;
1861
1862 /* End of a group, repeated or non-repeating. */
1863
1864 case OP_KET:
1865 case OP_KETRMIN:
1866 case OP_KETRMAX:
1867 case OP_KETRPOS:
1868 prev = ecode - GET(ecode, 1);
1869
1870 /* If this was a group that remembered the subject start, in order to break
1871 infinite repeats of empty string matches, retrieve the subject start from
1872 the chain. Otherwise, set it NULL. */
1873
1874 if (*prev >= OP_SBRA || *prev == OP_ONCE)
1875 {
1876 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1877 eptrb = eptrb->epb_prev; /* Backup to previous group */
1878 }
1879 else saved_eptr = NULL;
1880
1881 /* If we are at the end of an assertion group or a non-capturing atomic
1882 group, stop matching and return MATCH_MATCH, but record the current high
1883 water mark for use by positive assertions. We also need to record the match
1884 start in case it was changed by \K. */
1885
1886 if ((*prev >= OP_ASSERT && *prev <= OP_ASSERTBACK_NOT) ||
1887 *prev == OP_ONCE_NC)
1888 {
1889 md->end_match_ptr = eptr; /* For ONCE_NC */
1890 md->end_offset_top = offset_top;
1891 md->start_match_ptr = mstart;
1892 RRETURN(MATCH_MATCH); /* Sets md->mark */
1893 }
1894
1895 /* For capturing groups we have to check the group number back at the start
1896 and if necessary complete handling an extraction by setting the offsets and
1897 bumping the high water mark. Whole-pattern recursion is coded as a recurse
1898 into group 0, so it won't be picked up here. Instead, we catch it when the
1899 OP_END is reached. Other recursion is handled here. We just have to record
1900 the current subject position and start match pointer and give a MATCH
1901 return. */
1902
1903 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1904 *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1905 {
1906 number = GET2(prev, 1+LINK_SIZE);
1907 offset = number << 1;
1908
1909 #ifdef PCRE_DEBUG
1910 printf("end bracket %d", number);
1911 printf("\n");
1912 #endif
1913
1914 /* Handle a recursively called group. */
1915
1916 if (md->recursive != NULL && md->recursive->group_num == number)
1917 {
1918 md->end_match_ptr = eptr;
1919 md->start_match_ptr = mstart;
1920 RRETURN(MATCH_MATCH);
1921 }
1922
1923 /* Deal with capturing */
1924
1925 md->capture_last = number;
1926 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1927 {
1928 /* If offset is greater than offset_top, it means that we are
1929 "skipping" a capturing group, and that group's offsets must be marked
1930 unset. In earlier versions of PCRE, all the offsets were unset at the
1931 start of matching, but this doesn't work because atomic groups and
1932 assertions can cause a value to be set that should later be unset.
1933 Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
1934 part of the atomic group, but this is not on the final matching path,
1935 so must be unset when 2 is set. (If there is no group 2, there is no
1936 problem, because offset_top will then be 2, indicating no capture.) */
1937
1938 if (offset > offset_top)
1939 {
1940 int *iptr = md->offset_vector + offset_top;
1941 int *iend = md->offset_vector + offset;
1942 while (iptr < iend) *iptr++ = -1;
1943 }
1944
1945 /* Now make the extraction */
1946
1947 md->offset_vector[offset] =
1948 md->offset_vector[md->offset_end - number];
1949 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1950 if (offset_top <= offset) offset_top = offset + 2;
1951 }
1952 }
1953
1954 /* For an ordinary non-repeating ket, just continue at this level. This
1955 also happens for a repeating ket if no characters were matched in the
1956 group. This is the forcible breaking of infinite loops as implemented in
1957 Perl 5.005. For a non-repeating atomic group that includes captures,
1958 establish a backup point by processing the rest of the pattern at a lower
1959 level. If this results in a NOMATCH return, pass MATCH_ONCE back to the
1960 original OP_ONCE level, thereby bypassing intermediate backup points, but
1961 resetting any captures that happened along the way. */
1962
1963 if (*ecode == OP_KET || eptr == saved_eptr)
1964 {
1965 if (*prev == OP_ONCE)
1966 {
1967 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
1968 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1969 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1970 RRETURN(MATCH_ONCE);
1971 }
1972 ecode += 1 + LINK_SIZE; /* Carry on at this level */
1973 break;
1974 }
1975
1976 /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
1977 and return the MATCH_KETRPOS. This makes it possible to do the repeats one
1978 at a time from the outer level, thus saving stack. */
1979
1980 if (*ecode == OP_KETRPOS)
1981 {
1982 md->end_match_ptr = eptr;
1983 md->end_offset_top = offset_top;
1984 RRETURN(MATCH_KETRPOS);
1985 }
1986
1987 /* The normal repeating kets try the rest of the pattern or restart from
1988 the preceding bracket, in the appropriate order. In the second case, we can
1989 use tail recursion to avoid using another stack frame, unless we have an
1990 atomic group or an unlimited repeat of a group that can match an empty
1991 string. */
1992
1993 if (*ecode == OP_KETRMIN)
1994 {
1995 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
1996 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1997 if (*prev == OP_ONCE)
1998 {
1999 RMATCH(eptr, prev, offset_top, md, eptrb, RM8);
2000 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2001 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
2002 RRETURN(MATCH_ONCE);
2003 }
2004 if (*prev >= OP_SBRA) /* Could match an empty string */
2005 {
2006 RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
2007 RRETURN(rrc);
2008 }
2009 ecode = prev;
2010 goto TAIL_RECURSE;
2011 }
2012 else /* OP_KETRMAX */
2013 {
2014 RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
2015 if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH;
2016 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2017 if (*prev == OP_ONCE)
2018 {
2019 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM9);
2020 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2021 md->once_target = prev;
2022 RRETURN(MATCH_ONCE);
2023 }
2024 ecode += 1 + LINK_SIZE;
2025 goto TAIL_RECURSE;
2026 }
2027 /* Control never gets here */
2028
2029 /* Not multiline mode: start of subject assertion, unless notbol. */
2030
2031 case OP_CIRC:
2032 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2033
2034 /* Start of subject assertion */
2035
2036 case OP_SOD:
2037 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
2038 ecode++;
2039 break;
2040
2041 /* Multiline mode: start of subject unless notbol, or after any newline. */
2042
2043 case OP_CIRCM:
2044 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2045 if (eptr != md->start_subject &&
2046 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
2047 RRETURN(MATCH_NOMATCH);
2048 ecode++;
2049 break;
2050
2051 /* Start of match assertion */
2052
2053 case OP_SOM:
2054 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
2055 ecode++;
2056 break;
2057
2058 /* Reset the start of match point */
2059
2060 case OP_SET_SOM:
2061 mstart = eptr;
2062 ecode++;
2063 break;
2064
2065 /* Multiline mode: assert before any newline, or before end of subject
2066 unless noteol is set. */
2067
2068 case OP_DOLLM:
2069 if (eptr < md->end_subject)
2070 {
2071 if (!IS_NEWLINE(eptr))
2072 {
2073 if (md->partial != 0 &&
2074 eptr + 1 >= md->end_subject &&
2075 NLBLOCK->nltype == NLTYPE_FIXED &&
2076 NLBLOCK->nllen == 2 &&
2077 *eptr == NLBLOCK->nl[0])
2078 {
2079 md->hitend = TRUE;
2080 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2081 }
2082 RRETURN(MATCH_NOMATCH);
2083 }
2084 }
2085 else
2086 {
2087 if (md->noteol) RRETURN(MATCH_NOMATCH);
2088 SCHECK_PARTIAL();
2089 }
2090 ecode++;
2091 break;
2092
2093 /* Not multiline mode: assert before a terminating newline or before end of
2094 subject unless noteol is set. */
2095
2096 case OP_DOLL:
2097 if (md->noteol) RRETURN(MATCH_NOMATCH);
2098 if (!md->endonly) goto ASSERT_NL_OR_EOS;
2099
2100 /* ... else fall through for endonly */
2101
2102 /* End of subject assertion (\z) */
2103
2104 case OP_EOD:
2105 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
2106 SCHECK_PARTIAL();
2107 ecode++;
2108 break;
2109
2110 /* End of subject or ending \n assertion (\Z) */
2111
2112 case OP_EODN:
2113 ASSERT_NL_OR_EOS:
2114 if (eptr < md->end_subject &&
2115 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
2116 {
2117 if (md->partial != 0 &&
2118 eptr + 1 >= md->end_subject &&
2119 NLBLOCK->nltype == NLTYPE_FIXED &&
2120 NLBLOCK->nllen == 2 &&
2121 *eptr == NLBLOCK->nl[0])
2122 {
2123 md->hitend = TRUE;
2124 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2125 }
2126 RRETURN(MATCH_NOMATCH);
2127 }
2128
2129 /* Either at end of string or \n before end. */
2130
2131 SCHECK_PARTIAL();
2132 ecode++;
2133 break;
2134
2135 /* Word boundary assertions */
2136
2137 case OP_NOT_WORD_BOUNDARY:
2138 case OP_WORD_BOUNDARY:
2139 {
2140
2141 /* Find out if the previous and current characters are "word" characters.
2142 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
2143 be "non-word" characters. Remember the earliest consulted character for
2144 partial matching. */
2145
2146 #ifdef SUPPORT_UTF
2147 if (utf)
2148 {
2149 /* Get status of previous character */
2150
2151 if (eptr == md->start_subject) prev_is_word = FALSE; else
2152 {
2153 PCRE_PUCHAR lastptr = eptr - 1;
2154 BACKCHAR(lastptr);
2155 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
2156 GETCHAR(c, lastptr);
2157 #ifdef SUPPORT_UCP
2158 if (md->use_ucp)
2159 {
2160 if (c == '_') prev_is_word = TRUE; else
2161 {
2162 int cat = UCD_CATEGORY(c);
2163 prev_is_word = (cat == ucp_L || cat == ucp_N);
2164 }
2165 }
2166 else
2167 #endif
2168 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2169 }
2170
2171 /* Get status of next character */
2172
2173 if (eptr >= md->end_subject)
2174 {
2175 SCHECK_PARTIAL();
2176 cur_is_word = FALSE;
2177 }
2178 else
2179 {
2180 GETCHAR(c, eptr);
2181 #ifdef SUPPORT_UCP
2182 if (md->use_ucp)
2183 {
2184 if (c == '_') cur_is_word = TRUE; else
2185 {
2186 int cat = UCD_CATEGORY(c);
2187 cur_is_word = (cat == ucp_L || cat == ucp_N);
2188 }
2189 }
2190 else
2191 #endif
2192 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2193 }
2194 }
2195 else
2196 #endif
2197
2198 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
2199 consistency with the behaviour of \w we do use it in this case. */
2200
2201 {
2202 /* Get status of previous character */
2203
2204 if (eptr == md->start_subject) prev_is_word = FALSE; else
2205 {
2206 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
2207 #ifdef SUPPORT_UCP
2208 if (md->use_ucp)
2209 {
2210 c = eptr[-1];
2211 if (c == '_') prev_is_word = TRUE; else
2212 {
2213 int cat = UCD_CATEGORY(c);
2214 prev_is_word = (cat == ucp_L || cat == ucp_N);
2215 }
2216 }
2217 else
2218 #endif
2219 prev_is_word = MAX_255(eptr[-1])
2220 && ((md->ctypes[eptr[-1]] & ctype_word) != 0);
2221 }
2222
2223 /* Get status of next character */
2224
2225 if (eptr >= md->end_subject)
2226 {
2227 SCHECK_PARTIAL();
2228 cur_is_word = FALSE;
2229 }
2230 else
2231 #ifdef SUPPORT_UCP
2232 if (md->use_ucp)
2233 {
2234 c = *eptr;
2235 if (c == '_') cur_is_word = TRUE; else
2236 {
2237 int cat = UCD_CATEGORY(c);
2238 cur_is_word = (cat == ucp_L || cat == ucp_N);
2239 }
2240 }
2241 else
2242 #endif
2243 cur_is_word = MAX_255(*eptr)
2244 && ((md->ctypes[*eptr] & ctype_word) != 0);
2245 }
2246
2247 /* Now see if the situation is what we want */
2248
2249 if ((*ecode++ == OP_WORD_BOUNDARY)?
2250 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2251 RRETURN(MATCH_NOMATCH);
2252 }
2253 break;
2254
2255 /* Match any single character type except newline; have to take care with
2256 CRLF newlines and partial matching. */
2257
2258 case OP_ANY:
2259 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
2260 if (md->partial != 0 &&
2261 eptr + 1 >= md->end_subject &&
2262 NLBLOCK->nltype == NLTYPE_FIXED &&
2263 NLBLOCK->nllen == 2 &&
2264 *eptr == NLBLOCK->nl[0])
2265 {
2266 md->hitend = TRUE;
2267 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2268 }
2269
2270 /* Fall through */
2271
2272 /* Match any single character whatsoever. */
2273
2274 case OP_ALLANY:
2275 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2276 { /* not be updated before SCHECK_PARTIAL. */
2277 SCHECK_PARTIAL();
2278 RRETURN(MATCH_NOMATCH);
2279 }
2280 eptr++;
2281 #ifdef SUPPORT_UTF
2282 if (utf) ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
2283 #endif
2284 ecode++;
2285 break;
2286
2287 /* Match a single byte, even in UTF-8 mode. This opcode really does match
2288 any byte, even newline, independent of the setting of PCRE_DOTALL. */
2289
2290 case OP_ANYBYTE:
2291 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2292 { /* not be updated before SCHECK_PARTIAL. */
2293 SCHECK_PARTIAL();
2294 RRETURN(MATCH_NOMATCH);
2295 }
2296 eptr++;
2297 ecode++;
2298 break;
2299
2300 case OP_NOT_DIGIT:
2301 if (eptr >= md->end_subject)
2302 {
2303 SCHECK_PARTIAL();
2304 RRETURN(MATCH_NOMATCH);
2305 }
2306 GETCHARINCTEST(c, eptr);
2307 if (
2308 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2309 c < 256 &&
2310 #endif
2311 (md->ctypes[c] & ctype_digit) != 0
2312 )
2313 RRETURN(MATCH_NOMATCH);
2314 ecode++;
2315 break;
2316
2317 case OP_DIGIT:
2318 if (eptr >= md->end_subject)
2319 {
2320 SCHECK_PARTIAL();
2321 RRETURN(MATCH_NOMATCH);
2322 }
2323 GETCHARINCTEST(c, eptr);
2324 if (
2325 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2326 c > 255 ||
2327 #endif
2328 (md->ctypes[c] & ctype_digit) == 0
2329 )
2330 RRETURN(MATCH_NOMATCH);
2331 ecode++;
2332 break;
2333
2334 case OP_NOT_WHITESPACE:
2335 if (eptr >= md->end_subject)
2336 {
2337 SCHECK_PARTIAL();
2338 RRETURN(MATCH_NOMATCH);
2339 }
2340 GETCHARINCTEST(c, eptr);
2341 if (
2342 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2343 c < 256 &&
2344 #endif
2345 (md->ctypes[c] & ctype_space) != 0
2346 )
2347 RRETURN(MATCH_NOMATCH);
2348 ecode++;
2349 break;
2350
2351 case OP_WHITESPACE:
2352 if (eptr >= md->end_subject)
2353 {
2354 SCHECK_PARTIAL();
2355 RRETURN(MATCH_NOMATCH);
2356 }
2357 GETCHARINCTEST(c, eptr);
2358 if (
2359 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2360 c > 255 ||
2361 #endif
2362 (md->ctypes[c] & ctype_space) == 0
2363 )
2364 RRETURN(MATCH_NOMATCH);
2365 ecode++;
2366 break;
2367
2368 case OP_NOT_WORDCHAR:
2369 if (eptr >= md->end_subject)
2370 {
2371 SCHECK_PARTIAL();
2372 RRETURN(MATCH_NOMATCH);
2373 }
2374 GETCHARINCTEST(c, eptr);
2375 if (
2376 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2377 c < 256 &&
2378 #endif
2379 (md->ctypes[c] & ctype_word) != 0
2380 )
2381 RRETURN(MATCH_NOMATCH);
2382 ecode++;
2383 break;
2384
2385 case OP_WORDCHAR:
2386 if (eptr >= md->end_subject)
2387 {
2388 SCHECK_PARTIAL();
2389 RRETURN(MATCH_NOMATCH);
2390 }
2391 GETCHARINCTEST(c, eptr);
2392 if (
2393 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2394 c > 255 ||
2395 #endif
2396 (md->ctypes[c] & ctype_word) == 0
2397 )
2398 RRETURN(MATCH_NOMATCH);
2399 ecode++;
2400 break;
2401
2402 case OP_ANYNL:
2403 if (eptr >= md->end_subject)
2404 {
2405 SCHECK_PARTIAL();
2406 RRETURN(MATCH_NOMATCH);
2407 }
2408 GETCHARINCTEST(c, eptr);
2409 switch(c)
2410 {
2411 default: RRETURN(MATCH_NOMATCH);
2412
2413 case 0x000d:
2414 if (eptr >= md->end_subject)
2415 {
2416 SCHECK_PARTIAL();
2417 }
2418 else if (*eptr == 0x0a) eptr++;
2419 break;
2420
2421 case 0x000a:
2422 break;
2423
2424 case 0x000b:
2425 case 0x000c:
2426 case 0x0085:
2427 case 0x2028:
2428 case 0x2029:
2429 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
2430 break;
2431 }
2432 ecode++;
2433 break;
2434
2435 case OP_NOT_HSPACE:
2436 if (eptr >= md->end_subject)
2437 {
2438 SCHECK_PARTIAL();
2439 RRETURN(MATCH_NOMATCH);
2440 }
2441 GETCHARINCTEST(c, eptr);
2442 switch(c)
2443 {
2444 default: break;
2445 case 0x09: /* HT */
2446 case 0x20: /* SPACE */
2447 case 0xa0: /* NBSP */
2448 case 0x1680: /* OGHAM SPACE MARK */
2449 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2450 case 0x2000: /* EN QUAD */
2451 case 0x2001: /* EM QUAD */
2452 case 0x2002: /* EN SPACE */
2453 case 0x2003: /* EM SPACE */
2454 case 0x2004: /* THREE-PER-EM SPACE */
2455 case 0x2005: /* FOUR-PER-EM SPACE */
2456 case 0x2006: /* SIX-PER-EM SPACE */
2457 case 0x2007: /* FIGURE SPACE */
2458 case 0x2008: /* PUNCTUATION SPACE */
2459 case 0x2009: /* THIN SPACE */
2460 case 0x200A: /* HAIR SPACE */
2461 case 0x202f: /* NARROW NO-BREAK SPACE */
2462 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2463 case 0x3000: /* IDEOGRAPHIC SPACE */
2464 RRETURN(MATCH_NOMATCH);
2465 }
2466 ecode++;
2467 break;
2468
2469 case OP_HSPACE:
2470 if (eptr >= md->end_subject)
2471 {
2472 SCHECK_PARTIAL();
2473 RRETURN(MATCH_NOMATCH);
2474 }
2475 GETCHARINCTEST(c, eptr);
2476 switch(c)
2477 {
2478 default: RRETURN(MATCH_NOMATCH);
2479 case 0x09: /* HT */
2480 case 0x20: /* SPACE */
2481 case 0xa0: /* NBSP */
2482 case 0x1680: /* OGHAM SPACE MARK */
2483 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2484 case 0x2000: /* EN QUAD */
2485 case 0x2001: /* EM QUAD */
2486 case 0x2002: /* EN SPACE */
2487 case 0x2003: /* EM SPACE */
2488 case 0x2004: /* THREE-PER-EM SPACE */
2489 case 0x2005: /* FOUR-PER-EM SPACE */
2490 case 0x2006: /* SIX-PER-EM SPACE */
2491 case 0x2007: /* FIGURE SPACE */
2492 case 0x2008: /* PUNCTUATION SPACE */
2493 case 0x2009: /* THIN SPACE */
2494 case 0x200A: /* HAIR SPACE */
2495 case 0x202f: /* NARROW NO-BREAK SPACE */
2496 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2497 case 0x3000: /* IDEOGRAPHIC SPACE */
2498 break;
2499 }
2500 ecode++;
2501 break;
2502
2503 case OP_NOT_VSPACE:
2504 if (eptr >= md->end_subject)
2505 {
2506 SCHECK_PARTIAL();
2507 RRETURN(MATCH_NOMATCH);
2508 }
2509 GETCHARINCTEST(c, eptr);
2510 switch(c)
2511 {
2512 default: break;
2513 case 0x0a: /* LF */
2514 case 0x0b: /* VT */
2515 case 0x0c: /* FF */
2516 case 0x0d: /* CR */
2517 case 0x85: /* NEL */
2518 case 0x2028: /* LINE SEPARATOR */
2519 case 0x2029: /* PARAGRAPH SEPARATOR */
2520 RRETURN(MATCH_NOMATCH);
2521 }
2522 ecode++;
2523 break;
2524
2525 case OP_VSPACE:
2526 if (eptr >= md->end_subject)
2527 {
2528 SCHECK_PARTIAL();
2529 RRETURN(MATCH_NOMATCH);
2530 }
2531 GETCHARINCTEST(c, eptr);
2532 switch(c)
2533 {
2534 default: RRETURN(MATCH_NOMATCH);
2535 case 0x0a: /* LF */
2536 case 0x0b: /* VT */
2537 case 0x0c: /* FF */
2538 case 0x0d: /* CR */
2539 case 0x85: /* NEL */
2540 case 0x2028: /* LINE SEPARATOR */
2541 case 0x2029: /* PARAGRAPH SEPARATOR */
2542 break;
2543 }
2544 ecode++;
2545 break;
2546
2547 #ifdef SUPPORT_UCP
2548 /* Check the next character by Unicode property. We will get here only
2549 if the support is in the binary; otherwise a compile-time error occurs. */
2550
2551 case OP_PROP:
2552 case OP_NOTPROP:
2553 if (eptr >= md->end_subject)
2554 {
2555 SCHECK_PARTIAL();
2556 RRETURN(MATCH_NOMATCH);
2557 }
2558 GETCHARINCTEST(c, eptr);
2559 {
2560 const pcre_uint8 chartype = UCD_CHARTYPE(c);
2561
2562 switch(ecode[1])
2563 {
2564 case PT_ANY:
2565 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2566 break;
2567
2568 case PT_LAMP:
2569 if ((chartype == ucp_Lu ||
2570 chartype == ucp_Ll ||
2571 chartype == ucp_Lt) == (op == OP_NOTPROP))
2572 RRETURN(MATCH_NOMATCH);
2573 break;
2574
2575 case PT_GC:
2576 if ((ecode[2] != PRIV(ucp_gentype)[chartype]) == (op == OP_PROP))
2577 RRETURN(MATCH_NOMATCH);
2578 break;
2579
2580 case PT_PC:
2581 if ((ecode[2] != chartype) == (op == OP_PROP))
2582 RRETURN(MATCH_NOMATCH);
2583 break;
2584
2585 case PT_SC:
2586 if ((ecode[2] != UCD_SCRIPT(c)) == (op == OP_PROP))
2587 RRETURN(MATCH_NOMATCH);
2588 break;
2589
2590 /* These are specials */
2591
2592 case PT_ALNUM:
2593 if ((PRIV(ucp_gentype)[chartype] == ucp_L ||
2594 PRIV(ucp_gentype)[chartype] == ucp_N) == (op == OP_NOTPROP))
2595 RRETURN(MATCH_NOMATCH);
2596 break;
2597
2598 case PT_SPACE: /* Perl space */
2599 if ((PRIV(ucp_gentype)[chartype] == ucp_Z ||
2600 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2601 == (op == OP_NOTPROP))
2602 RRETURN(MATCH_NOMATCH);
2603 break;
2604
2605 case PT_PXSPACE: /* POSIX space */
2606 if ((PRIV(ucp_gentype)[chartype] == ucp_Z ||
2607 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2608 c == CHAR_FF || c == CHAR_CR)
2609 == (op == OP_NOTPROP))
2610 RRETURN(MATCH_NOMATCH);
2611 break;
2612
2613 case PT_WORD:
2614 if ((PRIV(ucp_gentype)[chartype] == ucp_L ||
2615 PRIV(ucp_gentype)[chartype] == ucp_N ||
2616 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2617 RRETURN(MATCH_NOMATCH);
2618 break;
2619
2620 /* This should never occur */
2621
2622 default:
2623 RRETURN(PCRE_ERROR_INTERNAL);
2624 }
2625
2626 ecode += 3;
2627 }
2628 break;
2629
2630 /* Match an extended Unicode sequence. We will get here only if the support
2631 is in the binary; otherwise a compile-time error occurs. */
2632
2633 case OP_EXTUNI:
2634 if (eptr >= md->end_subject)
2635 {
2636 SCHECK_PARTIAL();
2637 RRETURN(MATCH_NOMATCH);
2638 }
2639 GETCHARINCTEST(c, eptr);
2640 if (UCD_CATEGORY(c) == ucp_M) RRETURN(MATCH_NOMATCH);
2641 while (eptr < md->end_subject)
2642 {
2643 int len = 1;
2644 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
2645 if (UCD_CATEGORY(c) != ucp_M) break;
2646 eptr += len;
2647 }
2648 CHECK_PARTIAL();
2649 ecode++;
2650 break;
2651 #endif
2652
2653
2654 /* Match a back reference, possibly repeatedly. Look past the end of the
2655 item to see if there is repeat information following. The code is similar
2656 to that for character classes, but repeated for efficiency. Then obey
2657 similar code to character type repeats - written out again for speed.
2658 However, if the referenced string is the empty string, always treat
2659 it as matched, any number of times (otherwise there could be infinite
2660 loops). */
2661
2662 case OP_REF:
2663 case OP_REFI:
2664 caseless = op == OP_REFI;
2665 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2666 ecode += 1 + IMM2_SIZE;
2667
2668 /* If the reference is unset, there are two possibilities:
2669
2670 (a) In the default, Perl-compatible state, set the length negative;
2671 this ensures that every attempt at a match fails. We can't just fail
2672 here, because of the possibility of quantifiers with zero minima.
2673
2674 (b) If the JavaScript compatibility flag is set, set the length to zero
2675 so that the back reference matches an empty string.
2676
2677 Otherwise, set the length to the length of what was matched by the
2678 referenced subpattern. */
2679
2680 if (offset >= offset_top || md->offset_vector[offset] < 0)
2681 length = (md->jscript_compat)? 0 : -1;
2682 else
2683 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2684
2685 /* Set up for repetition, or handle the non-repeated case */
2686
2687 switch (*ecode)
2688 {
2689 case OP_CRSTAR:
2690 case OP_CRMINSTAR:
2691 case OP_CRPLUS:
2692 case OP_CRMINPLUS:
2693 case OP_CRQUERY:
2694 case OP_CRMINQUERY:
2695 c = *ecode++ - OP_CRSTAR;
2696 minimize = (c & 1) != 0;
2697 min = rep_min[c]; /* Pick up values from tables; */
2698 max = rep_max[c]; /* zero for max => infinity */
2699 if (max == 0) max = INT_MAX;
2700 break;
2701
2702 case OP_CRRANGE:
2703 case OP_CRMINRANGE:
2704 minimize = (*ecode == OP_CRMINRANGE);
2705 min = GET2(ecode, 1);
2706 max = GET2(ecode, 1 + IMM2_SIZE);
2707 if (max == 0) max = INT_MAX;
2708 ecode += 1 + 2 * IMM2_SIZE;
2709 break;
2710
2711 default: /* No repeat follows */
2712 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2713 {
2714 if (length == -2) eptr = md->end_subject; /* Partial match */
2715 CHECK_PARTIAL();
2716 RRETURN(MATCH_NOMATCH);
2717 }
2718 eptr += length;
2719 continue; /* With the main loop */
2720 }
2721
2722 /* Handle repeated back references. If the length of the reference is
2723 zero, just continue with the main loop. If the length is negative, it
2724 means the reference is unset in non-JavaScript-compatible mode. If the minimum is
2725 zero, we can continue at the same level without recursion. For any other
2726 minimum, carrying on will result in NOMATCH. */
2727
2728 if (length == 0) continue;
2729 if (length < 0 && min == 0) continue;
2730
2731 /* First, ensure the minimum number of matches are present. We get back
2732 the length of the reference string explicitly rather than passing the
2733 address of eptr, so that eptr can be a register variable. */
2734
2735 for (i = 1; i <= min; i++)
2736 {
2737 int slength;
2738 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2739 {
2740 if (slength == -2) eptr = md->end_subject; /* Partial match */
2741 CHECK_PARTIAL();
2742 RRETURN(MATCH_NOMATCH);
2743 }
2744 eptr += slength;
2745 }
2746
2747 /* If min = max, continue at the same level without recursion.
2748 They are not both allowed to be zero. */
2749
2750 if (min == max) continue;
2751
2752 /* If minimizing, keep trying and advancing the pointer */
2753
2754 if (minimize)
2755 {
2756 for (fi = min;; fi++)
2757 {
2758 int slength;
2759 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2760 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2761 if (fi >= max) RRETURN(MATCH_NOMATCH);
2762 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2763 {
2764 if (slength == -2) eptr = md->end_subject; /* Partial match */
2765 CHECK_PARTIAL();
2766 RRETURN(MATCH_NOMATCH);
2767 }
2768 eptr += slength;
2769 }
2770 /* Control never gets here */
2771 }
2772
2773 /* If maximizing, find the longest string and work backwards */
2774
2775 else
2776 {
2777 pp = eptr;
2778 for (i = min; i < max; i++)
2779 {
2780 int slength;
2781 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2782 {
2783 /* Can't use CHECK_PARTIAL because we don't want to update eptr in
2784 the soft partial matching case. */
2785
2786 if (slength == -2 && md->partial != 0 &&
2787 md->end_subject > md->start_used_ptr)
2788 {
2789 md->hitend = TRUE;
2790 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2791 }
2792 break;
2793 }
2794 eptr += slength;
2795 }
2796
2797 while (eptr >= pp)
2798 {
2799 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2800 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2801 eptr -= length;
2802 }
2803 RRETURN(MATCH_NOMATCH);
2804 }
2805 /* Control never gets here */
2806
2807 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2808 used when all the characters in the class have values in the range 0-255,
2809 and either the matching is caseful, or the characters are in the range
2810 0-127 when UTF-8 processing is enabled. The only difference between
2811 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2812 encountered.
2813
2814 First, look past the end of the item to see if there is repeat information
2815 following. Then obey similar code to character type repeats - written out
2816 again for speed. */
2817
2818 case OP_NCLASS:
2819 case OP_CLASS:
2820 {
2821 /* The data variable is saved across frames, so the byte map needs to
2822 be stored there. */
2823 #define BYTE_MAP ((pcre_uint8 *)data)
2824 data = ecode + 1; /* Save for matching */
2825 ecode += 1 + (32 / sizeof(pcre_uchar)); /* Advance past the item */
2826
2827 switch (*ecode)
2828 {
2829 case OP_CRSTAR:
2830 case OP_CRMINSTAR:
2831 case OP_CRPLUS:
2832 case OP_CRMINPLUS:
2833 case OP_CRQUERY:
2834 case OP_CRMINQUERY:
2835 c = *ecode++ - OP_CRSTAR;
2836 minimize = (c & 1) != 0;
2837 min = rep_min[c]; /* Pick up values from tables; */
2838 max = rep_max[c]; /* zero for max => infinity */
2839 if (max == 0) max = INT_MAX;
2840 break;
2841
2842 case OP_CRRANGE:
2843 case OP_CRMINRANGE:
2844 minimize = (*ecode == OP_CRMINRANGE);
2845 min = GET2(ecode, 1);
2846 max = GET2(ecode, 1 + IMM2_SIZE);
2847 if (max == 0) max = INT_MAX;
2848 ecode += 1 + 2 * IMM2_SIZE;
2849 break;
2850
2851 default: /* No repeat follows */
2852 min = max = 1;
2853 break;
2854 }
2855
2856 /* First, ensure the minimum number of matches are present. */
2857
2858 #ifdef SUPPORT_UTF
2859 if (utf)
2860 {
2861 for (i = 1; i <= min; i++)
2862 {
2863 if (eptr >= md->end_subject)
2864 {
2865 SCHECK_PARTIAL();
2866 RRETURN(MATCH_NOMATCH);
2867 }
2868 GETCHARINC(c, eptr);
2869 if (c > 255)
2870 {
2871 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2872 }
2873 else
2874 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2875 }
2876 }
2877 else
2878 #endif
2879 /* Not UTF mode */
2880 {
2881 for (i = 1; i <= min; i++)
2882 {
2883 if (eptr >= md->end_subject)
2884 {
2885 SCHECK_PARTIAL();
2886 RRETURN(MATCH_NOMATCH);
2887 }
2888 c = *eptr++;
2889 #ifndef COMPILE_PCRE8
2890 if (c > 255)
2891 {
2892 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2893 }
2894 else
2895 #endif
2896 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2897 }
2898 }
2899
2900 /* If max == min we can continue with the main loop without the
2901 need to recurse. */
2902
2903 if (min == max) continue;
2904
2905 /* If minimizing, keep testing the rest of the expression and advancing
2906 the pointer while it matches the class. */
2907
2908 if (minimize)
2909 {
2910 #ifdef SUPPORT_UTF
2911 if (utf)
2912 {
2913 for (fi = min;; fi++)
2914 {
2915 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2916 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2917 if (fi >= max) RRETURN(MATCH_NOMATCH);
2918 if (eptr >= md->end_subject)
2919 {
2920 SCHECK_PARTIAL();
2921 RRETURN(MATCH_NOMATCH);
2922 }
2923 GETCHARINC(c, eptr);
2924 if (c > 255)
2925 {
2926 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2927 }
2928 else
2929 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2930 }
2931 }
2932 else
2933 #endif
2934 /* Not UTF mode */
2935 {
2936 for (fi = min;; fi++)
2937 {
2938 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
2939 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2940 if (fi >= max) RRETURN(MATCH_NOMATCH);
2941 if (eptr >= md->end_subject)
2942 {
2943 SCHECK_PARTIAL();
2944 RRETURN(MATCH_NOMATCH);
2945 }
2946 c = *eptr++;
2947 #ifndef COMPILE_PCRE8
2948 if (c > 255)
2949 {
2950 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2951 }
2952 else
2953 #endif
2954 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2955 }
2956 }
2957 /* Control never gets here */
2958 }
2959
2960 /* If maximizing, find the longest possible run, then work backwards. */
2961
2962 else
2963 {
2964 pp = eptr;
2965
2966 #ifdef SUPPORT_UTF
2967 if (utf)
2968 {
2969 for (i = min; i < max; i++)
2970 {
2971 int len = 1;
2972 if (eptr >= md->end_subject)
2973 {
2974 SCHECK_PARTIAL();
2975 break;
2976 }
2977 GETCHARLEN(c, eptr, len);
2978 if (c > 255)
2979 {
2980 if (op == OP_CLASS) break;
2981 }
2982 else
2983 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
2984 eptr += len;
2985 }
2986 for (;;)
2987 {
2988 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
2989 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2990 if (eptr-- == pp) break; /* Stop if tried at original pos */
2991 BACKCHAR(eptr);
2992 }
2993 }
2994 else
2995 #endif
2996 /* Not UTF mode */
2997 {
2998 for (i = min; i < max; i++)
2999 {
3000 if (eptr >= md->end_subject)
3001 {
3002 SCHECK_PARTIAL();
3003 break;
3004 }
3005 c = *eptr;
3006 #ifndef COMPILE_PCRE8
3007 if (c > 255)
3008 {
3009 if (op == OP_CLASS) break;
3010 }
3011 else
3012 #endif
3013 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
3014 eptr++;
3015 }
3016 while (eptr >= pp)
3017 {
3018 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
3019 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3020 eptr--;
3021 }
3022 }
3023
3024 RRETURN(MATCH_NOMATCH);
3025 }
3026 #undef BYTE_MAP
3027 }
3028 /* Control never gets here */
3029
3030
3031 /* Match an extended character class. This opcode is encountered only
3032 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
3033 mode, because Unicode properties are supported in non-UTF-8 mode. */
3034
3035 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3036 case OP_XCLASS:
3037 {
3038 data = ecode + 1 + LINK_SIZE; /* Save for matching */
3039 ecode += GET(ecode, 1); /* Advance past the item */
3040
3041 switch (*ecode)
3042 {
3043 case OP_CRSTAR:
3044 case OP_CRMINSTAR:
3045 case OP_CRPLUS:
3046 case OP_CRMINPLUS:
3047 case OP_CRQUERY:
3048 case OP_CRMINQUERY:
3049 c = *ecode++ - OP_CRSTAR;
3050 minimize = (c & 1) != 0;
3051 min = rep_min[c]; /* Pick up values from tables; */
3052 max = rep_max[c]; /* zero for max => infinity */
3053 if (max == 0) max = INT_MAX;
3054 break;
3055
3056 case OP_CRRANGE:
3057 case OP_CRMINRANGE:
3058 minimize = (*ecode == OP_CRMINRANGE);
3059 min = GET2(ecode, 1);
3060 max = GET2(ecode, 1 + IMM2_SIZE);
3061 if (max == 0) max = INT_MAX;
3062 ecode += 1 + 2 * IMM2_SIZE;
3063 break;
3064
3065 default: /* No repeat follows */
3066 min = max = 1;
3067 break;
3068 }
3069
3070 /* First, ensure the minimum number of matches are present. */
3071
3072 for (i = 1; i <= min; i++)
3073 {
3074 if (eptr >= md->end_subject)
3075 {
3076 SCHECK_PARTIAL();
3077 RRETURN(MATCH_NOMATCH);
3078 }
3079 GETCHARINCTEST(c, eptr);
3080 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3081 }
3082
3083 /* If max == min we can continue with the main loop without the
3084 need to recurse. */
3085
3086 if (min == max) continue;
3087
3088 /* If minimizing, keep testing the rest of the expression and advancing
3089 the pointer while it matches the class. */
3090
3091 if (minimize)
3092 {
3093 for (fi = min;; fi++)
3094 {
3095 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
3096 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3097 if (fi >= max) RRETURN(MATCH_NOMATCH);
3098 if (eptr >= md->end_subject)
3099 {
3100 SCHECK_PARTIAL();
3101 RRETURN(MATCH_NOMATCH);
3102 }
3103 GETCHARINCTEST(c, eptr);
3104 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3105 }
3106 /* Control never gets here */
3107 }
3108
3109 /* If maximizing, find the longest possible run, then work backwards. */
3110
3111 else
3112 {
3113 pp = eptr;
3114 for (i = min; i < max; i++)
3115 {
3116 int len = 1;
3117 if (eptr >= md->end_subject)
3118 {
3119 SCHECK_PARTIAL();
3120 break;
3121 }
3122 #ifdef SUPPORT_UTF
3123 GETCHARLENTEST(c, eptr, len);
3124 #else
3125 c = *eptr;
3126 #endif
3127 if (!PRIV(xclass)(c, data, utf)) break;
3128 eptr += len;
3129 }
3130 for(;;)
3131 {
3132 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
3133 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3134 if (eptr-- == pp) break; /* Stop if tried at original pos */
3135 #ifdef SUPPORT_UTF
3136 if (utf) BACKCHAR(eptr);
3137 #endif
3138 }
3139 RRETURN(MATCH_NOMATCH);
3140 }
3141
3142 /* Control never gets here */
3143 }
3144 #endif /* End of XCLASS */
3145
3146 /* Match a single character, casefully */
3147
3148 case OP_CHAR:
3149 #ifdef SUPPORT_UTF
3150 if (utf)
3151 {
3152 length = 1;
3153 ecode++;
3154 GETCHARLEN(fc, ecode, length);
3155 if (length > md->end_subject - eptr)
3156 {
3157 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
3158 RRETURN(MATCH_NOMATCH);
3159 }
3160 while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
3161 }
3162 else
3163 #endif
3164 /* Not UTF mode */
3165 {
3166 if (md->end_subject - eptr < 1)
3167 {
3168 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
3169 RRETURN(MATCH_NOMATCH);
3170 }
3171 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
3172 ecode += 2;
3173 }
3174 break;
3175
3176 /* Match a single character, caselessly. If we are at the end of the
3177 subject, give up immediately. */
3178
3179 case OP_CHARI:
3180 if (eptr >= md->end_subject)
3181 {
3182 SCHECK_PARTIAL();
3183 RRETURN(MATCH_NOMATCH);
3184 }
3185
3186 #ifdef SUPPORT_UTF
3187 if (utf)
3188 {
3189 length = 1;
3190 ecode++;
3191 GETCHARLEN(fc, ecode, length);
3192
3193 /* If the pattern character's value is < 128, we have only one byte, and
3194 we know that its other case must also be one byte long, so we can use the
3195 fast lookup table. We know that there is at least one byte left in the
3196 subject. */
3197
3198 if (fc < 128)
3199 {
3200 if (md->lcc[fc]
3201 != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH);
3202 ecode++;
3203 eptr++;
3204 }
3205
3206 /* Otherwise we must pick up the subject character. Note that we cannot
3207 use the value of "length" to check for sufficient bytes left, because the
3208 other case of the character may have more or fewer bytes. */
3209
3210 else
3211 {
3212 unsigned int dc;
3213 GETCHARINC(dc, eptr);
3214 ecode += length;
3215
3216 /* If we have Unicode property support, we can use it to test the other
3217 case of the character, if there is one. */
3218
3219 if (fc != dc)
3220 {
3221 #ifdef SUPPORT_UCP
3222 if (dc != UCD_OTHERCASE(fc))
3223 #endif
3224 RRETURN(MATCH_NOMATCH);
3225 }
3226 }
3227 }
3228 else
3229 #endif /* SUPPORT_UTF */
3230
3231 /* Not UTF mode */
3232 {
3233 if (TABLE_GET(ecode[1], md->lcc, ecode[1])
3234 != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH);
3235 eptr++;
3236 ecode += 2;
3237 }
3238 break;
3239
3240 /* Match a single character repeatedly. */
3241
3242 case OP_EXACT:
3243 case OP_EXACTI:
3244 min = max = GET2(ecode, 1);
3245 ecode += 1 + IMM2_SIZE;
3246 goto REPEATCHAR;
3247
3248 case OP_POSUPTO:
3249 case OP_POSUPTOI:
3250 possessive = TRUE;
3251 /* Fall through */
3252
3253 case OP_UPTO:
3254 case OP_UPTOI:
3255 case OP_MINUPTO:
3256 case OP_MINUPTOI:
3257 min = 0;
3258 max = GET2(ecode, 1);
3259 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
3260 ecode += 1 + IMM2_SIZE;
3261 goto REPEATCHAR;
3262
3263 case OP_POSSTAR:
3264 case OP_POSSTARI:
3265 possessive = TRUE;
3266 min = 0;
3267 max = INT_MAX;
3268 ecode++;
3269 goto REPEATCHAR;
3270
3271 case OP_POSPLUS:
3272 case OP_POSPLUSI:
3273 possessive = TRUE;
3274 min = 1;
3275 max = INT_MAX;
3276 ecode++;
3277 goto REPEATCHAR;
3278
3279 case OP_POSQUERY:
3280 case OP_POSQUERYI:
3281 possessive = TRUE;
3282 min = 0;
3283 max = 1;
3284 ecode++;
3285 goto REPEATCHAR;
3286
3287 case OP_STAR:
3288 case OP_STARI:
3289 case OP_MINSTAR:
3290 case OP_MINSTARI:
3291 case OP_PLUS:
3292 case OP_PLUSI:
3293 case OP_MINPLUS:
3294 case OP_MINPLUSI:
3295 case OP_QUERY:
3296 case OP_QUERYI:
3297 case OP_MINQUERY:
3298 case OP_MINQUERYI:
3299 c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
3300 minimize = (c & 1) != 0;
3301 min = rep_min[c]; /* Pick up values from tables; */
3302 max = rep_max[c]; /* zero for max => infinity */
3303 if (max == 0) max = INT_MAX;
3304
3305 /* Common code for all repeated single-character matches. */
3306
3307 REPEATCHAR:
3308 #ifdef SUPPORT_UTF
3309 if (utf)
3310 {
3311 length = 1;
3312 charptr = ecode;
3313 GETCHARLEN(fc, ecode, length);
3314 ecode += length;
3315
3316 /* Handle multibyte character matching specially here. There is
3317 support for caseless matching if UCP support is present. */
3318
3319 if (length > 1)
3320 {
3321 #ifdef SUPPORT_UCP
3322 unsigned int othercase;
3323 if (op >= OP_STARI && /* Caseless */
3324 (othercase = UCD_OTHERCASE(fc)) != fc)
3325 oclength = PRIV(ord2utf)(othercase, occhars);
3326 else oclength = 0;
3327 #endif /* SUPPORT_UCP */
3328
3329 for (i = 1; i <= min; i++)
3330 {
3331 if (eptr <= md->end_subject - length &&
3332 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3333 #ifdef SUPPORT_UCP
3334 else if (oclength > 0 &&
3335 eptr <= md->end_subject - oclength &&
3336 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3337 #endif /* SUPPORT_UCP */
3338 else
3339 {
3340 CHECK_PARTIAL();
3341 RRETURN(MATCH_NOMATCH);
3342 }
3343 }
3344
3345 if (min == max) continue;
3346
3347 if (minimize)
3348 {
3349 for (fi = min;; fi++)
3350 {
3351 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3352 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3353 if (fi >= max) RRETURN(MATCH_NOMATCH);
3354 if (eptr <= md->end_subject - length &&
3355 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3356 #ifdef SUPPORT_UCP
3357 else if (oclength > 0 &&
3358 eptr <= md->end_subject - oclength &&
3359 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3360 #endif /* SUPPORT_UCP */
3361 else
3362 {
3363 CHECK_PARTIAL();
3364 RRETURN(MATCH_NOMATCH);
3365 }
3366 }
3367 /* Control never gets here */
3368 }
3369
3370 else /* Maximize */
3371 {
3372 pp = eptr;
3373 for (i = min; i < max; i++)
3374 {
3375 if (eptr <= md->end_subject - length &&
3376 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3377 #ifdef SUPPORT_UCP
3378 else if (oclength > 0 &&
3379 eptr <= md->end_subject - oclength &&
3380 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3381 #endif /* SUPPORT_UCP */
3382 else
3383 {
3384 CHECK_PARTIAL();
3385 break;
3386 }
3387 }
3388
3389 if (possessive) continue;
3390
3391 for(;;)
3392 {
3393 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3394 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3395 if (eptr == pp) { RRETURN(MATCH_NOMATCH); }
3396 #ifdef SUPPORT_UCP
3397 eptr--;
3398 BACKCHAR(eptr);
3399 #else /* without SUPPORT_UCP */
3400 eptr -= length;
3401 #endif /* SUPPORT_UCP */
3402 }
3403 }
3404 /* Control never gets here */
3405 }
3406
3407 /* If the length of a UTF-8 character is 1, we fall through here, and
3408 obey the code as for non-UTF-8 characters below, though in this case the
3409 value of fc will always be < 128. */
3410 }
3411 else
3412 #endif /* SUPPORT_UTF */
3413 /* When not in UTF-8 mode, load a single-byte character. */
3414 fc = *ecode++;
3415
3416 /* The value of fc at this point is always one character, though we may
3417 or may not be in UTF mode. The code is duplicated for the caseless and
3418 caseful cases, for speed, since matching characters is likely to be quite
3419 common. First, ensure the minimum number of matches are present. If min =
3420 max, continue at the same level without recursing. Otherwise, if
3421 minimizing, keep trying the rest of the expression and advancing one
3422 matching character if failing, up to the maximum. Alternatively, if
3423 maximizing, find the maximum number of characters and work backwards. */
3424
3425 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3426 max, (char *)eptr));
3427
3428 if (op >= OP_STARI) /* Caseless */
3429 {
3430 #ifdef COMPILE_PCRE8
3431 /* fc must be < 128 if UTF is enabled. */
3432 foc = md->fcc[fc];
3433 #else
3434 #ifdef SUPPORT_UTF
3435 #ifdef SUPPORT_UCP
3436 if (utf && fc > 127)
3437 foc = UCD_OTHERCASE(fc);
3438 #else
3439 if (utf && fc > 127)
3440 foc = fc;
3441 #endif /* SUPPORT_UCP */
3442 else
3443 #endif /* SUPPORT_UTF */
3444 foc = TABLE_GET(fc, md->fcc, fc);
3445 #endif /* COMPILE_PCRE8 */
3446
3447 for (i = 1; i <= min; i++)
3448 {
3449 if (eptr >= md->end_subject)
3450 {
3451 SCHECK_PARTIAL();
3452 RRETURN(MATCH_NOMATCH);
3453 }
3454 if (fc != *eptr && foc != *eptr) RRETURN(MATCH_NOMATCH);
3455 eptr++;
3456 }
3457 if (min == max) continue;
3458 if (minimize)
3459 {
3460 for (fi = min;; fi++)
3461 {
3462 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3463 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3464 if (fi >= max) RRETURN(MATCH_NOMATCH);
3465 if (eptr >= md->end_subject)
3466 {
3467 SCHECK_PARTIAL();
3468 RRETURN(MATCH_NOMATCH);
3469 }
3470 if (fc != *eptr && foc != *eptr) RRETURN(MATCH_NOMATCH);
3471 eptr++;
3472 }
3473 /* Control never gets here */
3474 }
3475 else /* Maximize */
3476 {
3477 pp = eptr;
3478 for (i = min; i < max; i++)
3479 {
3480 if (eptr >= md->end_subject)
3481 {
3482 SCHECK_PARTIAL();
3483 break;
3484 }
3485 if (fc != *eptr && foc != *eptr) break;
3486 eptr++;
3487 }
3488
3489 if (possessive) continue;
3490
3491 while (eptr >= pp)
3492 {
3493 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3494 eptr--;
3495 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3496 }
3497 RRETURN(MATCH_NOMATCH);
3498 }
3499 /* Control never gets here */
3500 }
3501
3502 /* Caseful comparisons (includes all multi-byte characters) */
3503
3504 else
3505 {
3506 for (i = 1; i <= min; i++)
3507 {
3508 if (eptr >= md->end_subject)
3509 {
3510 SCHECK_PARTIAL();
3511 RRETURN(MATCH_NOMATCH);
3512 }
3513 if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
3514 }
3515
3516 if (min == max) continue;
3517
3518 if (minimize)
3519 {
3520 for (fi = min;; fi++)
3521 {
3522 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3523 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3524 if (fi >= max) RRETURN(MATCH_NOMATCH);
3525 if (eptr >= md->end_subject)
3526 {
3527 SCHECK_PARTIAL();
3528 RRETURN(MATCH_NOMATCH);
3529 }
3530 if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
3531 }
3532 /* Control never gets here */
3533 }
3534 else /* Maximize */
3535 {
3536 pp = eptr;
3537 for (i = min; i < max; i++)
3538 {
3539 if (eptr >= md->end_subject)
3540 {
3541 SCHECK_PARTIAL();
3542 break;
3543 }
3544 if (fc != *eptr) break;
3545 eptr++;
3546 }
3547 if (possessive) continue;
3548
3549 while (eptr >= pp)
3550 {
3551 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3552 eptr--;
3553 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3554 }
3555 RRETURN(MATCH_NOMATCH);
3556 }
3557 }
3558 /* Control never gets here */
3559
3560 /* Match a negated single one-byte character. The character we are
3561 checking can be multibyte. */
3562
3563 case OP_NOT:
3564 case OP_NOTI:
3565 if (eptr >= md->end_subject)
3566 {
3567 SCHECK_PARTIAL();
3568 RRETURN(MATCH_NOMATCH);
3569 }
3570 #ifdef SUPPORT_UTF
3571 if (utf)
3572 {
3573 unsigned int ch, och;
3574
3575 ecode++;
3576 GETCHARINC(ch, ecode);
3577 GETCHARINC(c, eptr);
3578
3579 if (op == OP_NOT)
3580 {
3581 if (ch == c) RRETURN(MATCH_NOMATCH);
3582 }
3583 else
3584 {
3585 #ifdef SUPPORT_UCP
3586 if (ch > 127)
3587 och = UCD_OTHERCASE(ch);
3588 #else
3589 if (ch > 127)
3590 och = ch;
3591 #endif /* SUPPORT_UCP */
3592 else
3593 och = TABLE_GET(ch, md->fcc, ch);
3594 if (ch == c || och == c) RRETURN(MATCH_NOMATCH);
3595 }
3596 }
3597 else
3598 #endif
3599 {
3600 unsigned int ch = ecode[1];
3601 c = *eptr++;
3602 if (ch == c || (op == OP_NOTI && TABLE_GET(ch, md->fcc, ch) == c))
3603 RRETURN(MATCH_NOMATCH);
3604 ecode += 2;
3605 }
3606 break;
3607
3608 /* Match a negated single one-byte character repeatedly. This is almost a
3609 repeat of the code for a repeated single character, but I haven't found a
3610 nice way of commoning these up that doesn't require a test of the
3611 positive/negative option for each character match. Maybe that wouldn't add
3612 very much to the time taken, but character matching *is* what this is all
3613 about... */
3614
3615 case OP_NOTEXACT:
3616 case OP_NOTEXACTI:
3617 min = max = GET2(ecode, 1);
3618 ecode += 1 + IMM2_SIZE;
3619 goto REPEATNOTCHAR;
3620
3621 case OP_NOTUPTO:
3622 case OP_NOTUPTOI:
3623 case OP_NOTMINUPTO:
3624 case OP_NOTMINUPTOI:
3625 min = 0;
3626 max = GET2(ecode, 1);
3627 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3628 ecode += 1 + IMM2_SIZE;
3629 goto REPEATNOTCHAR;
3630
3631 case OP_NOTPOSSTAR:
3632 case OP_NOTPOSSTARI:
3633 possessive = TRUE;
3634 min = 0;
3635 max = INT_MAX;
3636 ecode++;
3637 goto REPEATNOTCHAR;
3638
3639 case OP_NOTPOSPLUS:
3640 case OP_NOTPOSPLUSI:
3641 possessive = TRUE;
3642 min = 1;
3643 max = INT_MAX;
3644 ecode++;
3645 goto REPEATNOTCHAR;
3646
3647 case OP_NOTPOSQUERY:
3648 case OP_NOTPOSQUERYI:
3649 possessive = TRUE;
3650 min = 0;
3651 max = 1;
3652 ecode++;
3653 goto REPEATNOTCHAR;
3654
3655 case OP_NOTPOSUPTO:
3656 case OP_NOTPOSUPTOI:
3657 possessive = TRUE;
3658 min = 0;
3659 max = GET2(ecode, 1);
3660 ecode += 1 + IMM2_SIZE;
3661 goto REPEATNOTCHAR;
3662
3663 case OP_NOTSTAR:
3664 case OP_NOTSTARI:
3665 case OP_NOTMINSTAR:
3666 case OP_NOTMINSTARI:
3667 case OP_NOTPLUS:
3668 case OP_NOTPLUSI:
3669 case OP_NOTMINPLUS:
3670 case OP_NOTMINPLUSI:
3671 case OP_NOTQUERY:
3672 case OP_NOTQUERYI:
3673 case OP_NOTMINQUERY:
3674 case OP_NOTMINQUERYI:
3675 c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3676 minimize = (c & 1) != 0;
3677 min = rep_min[c]; /* Pick up values from tables; */
3678 max = rep_max[c]; /* zero for max => infinity */
3679 if (max == 0) max = INT_MAX;
3680
3681 /* Common code for all repeated single-byte matches. */
3682
3683 REPEATNOTCHAR:
3684 GETCHARINCTEST(fc, ecode);
3685
3686 /* The code is duplicated for the caseless and caseful cases, for speed,
3687 since matching characters is likely to be quite common. First, ensure the
3688 minimum number of matches are present. If min = max, continue at the same
3689 level without recursing. Otherwise, if minimizing, keep trying the rest of
3690 the expression and advancing one matching character if failing, up to the
3691 maximum. Alternatively, if maximizing, find the maximum number of
3692 characters and work backwards. */
3693
3694 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3695 max, (char *)eptr));
3696
3697 if (op >= OP_NOTSTARI) /* Caseless */
3698 {
3699 #ifdef SUPPORT_UTF
3700 #ifdef SUPPORT_UCP
3701 if (utf && fc > 127)
3702 foc = UCD_OTHERCASE(fc);
3703 #else
3704 if (utf && fc > 127)
3705 foc = fc;
3706 #endif /* SUPPORT_UCP */
3707 else
3708 #endif /* SUPPORT_UTF */
3709 foc = TABLE_GET(fc, md->fcc, fc);
3710
3711 #ifdef SUPPORT_UTF
3712 if (utf)
3713 {
3714 unsigned int d;
3715 for (i = 1; i <= min; i++)
3716 {
3717 if (eptr >= md->end_subject)
3718 {
3719 SCHECK_PARTIAL();
3720 RRETURN(MATCH_NOMATCH);
3721 }
3722 GETCHARINC(d, eptr);
3723 if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH);
3724 }
3725 }
3726 else
3727 #endif
3728 /* Not UTF mode */
3729 {
3730 for (i = 1; i <= min; i++)
3731 {
3732 if (eptr >= md->end_subject)
3733 {
3734 SCHECK_PARTIAL();
3735 RRETURN(MATCH_NOMATCH);
3736 }
3737 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3738 eptr++;
3739 }
3740 }
3741
3742 if (min == max) continue;
3743
3744 if (minimize)
3745 {
3746 #ifdef SUPPORT_UTF
3747 if (utf)
3748 {
3749 unsigned int d;
3750 for (fi = min;; fi++)
3751 {
3752 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3753 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3754 if (fi >= max) RRETURN(MATCH_NOMATCH);
3755 if (eptr >= md->end_subject)
3756 {
3757 SCHECK_PARTIAL();
3758 RRETURN(MATCH_NOMATCH);
3759 }
3760 GETCHARINC(d, eptr);
3761 if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH);
3762 }
3763 }
3764 else
3765 #endif
3766 /* Not UTF mode */
3767 {
3768 for (fi = min;; fi++)
3769 {
3770 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3771 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3772 if (fi >= max) RRETURN(MATCH_NOMATCH);
3773 if (eptr >= md->end_subject)
3774 {
3775 SCHECK_PARTIAL();
3776 RRETURN(MATCH_NOMATCH);
3777 }
3778 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3779 eptr++;
3780 }
3781 }
3782 /* Control never gets here */
3783 }
3784
3785 /* Maximize case */
3786
3787 else
3788 {
3789 pp = eptr;
3790
3791 #ifdef SUPPORT_UTF
3792 if (utf)
3793 {
3794 unsigned int d;
3795 for (i = min; i < max; i++)
3796 {
3797 int len = 1;
3798 if (eptr >= md->end_subject)
3799 {
3800 SCHECK_PARTIAL();
3801 break;
3802 }
3803 GETCHARLEN(d, eptr, len);
3804 if (fc == d || (unsigned int)foc == d) break;
3805 eptr += len;
3806 }
3807 if (possessive) continue;
3808 for(;;)
3809 {
3810 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3811 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3812 if (eptr-- == pp) break; /* Stop if tried at original pos */
3813 BACKCHAR(eptr);
3814 }
3815 }
3816 else
3817 #endif
3818 /* Not UTF mode */
3819 {
3820 for (i = min; i < max; i++)
3821 {
3822 if (eptr >= md->end_subject)
3823 {
3824 SCHECK_PARTIAL();
3825 break;
3826 }
3827 if (fc == *eptr || foc == *eptr) break;
3828 eptr++;
3829 }
3830 if (possessive) continue;
3831 while (eptr >= pp)
3832 {
3833 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3834 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3835 eptr--;
3836 }
3837 }
3838
3839 RRETURN(MATCH_NOMATCH);
3840 }
3841 /* Control never gets here */
3842 }
3843
3844 /* Caseful comparisons */
3845
3846 else
3847 {
3848 #ifdef SUPPORT_UTF
3849 if (utf)
3850 {
3851 unsigned int d;
3852 for (i = 1; i <= min; i++)
3853 {
3854 if (eptr >= md->end_subject)
3855 {
3856 SCHECK_PARTIAL();
3857 RRETURN(MATCH_NOMATCH);
3858 }
3859 GETCHARINC(d, eptr);
3860 if (fc == d) RRETURN(MATCH_NOMATCH);
3861 }
3862 }
3863 else
3864 #endif
3865 /* Not UTF mode */
3866 {
3867 for (i = 1; i <= min; i++)
3868 {
3869 if (eptr >= md->end_subject)
3870 {
3871 SCHECK_PARTIAL();
3872 RRETURN(MATCH_NOMATCH);
3873 }
3874 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3875 }
3876 }
3877
3878 if (min == max) continue;
3879
3880 if (minimize)
3881 {
3882 #ifdef SUPPORT_UTF
3883 if (utf)
3884 {
3885 unsigned int d;
3886 for (fi = min;; fi++)
3887 {
3888 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3889 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3890 if (fi >= max) RRETURN(MATCH_NOMATCH);
3891 if (eptr >= md->end_subject)
3892 {
3893 SCHECK_PARTIAL();
3894 RRETURN(MATCH_NOMATCH);
3895 }
3896 GETCHARINC(d, eptr);
3897 if (fc == d) RRETURN(MATCH_NOMATCH);
3898 }
3899 }
3900 else
3901 #endif
3902 /* Not UTF mode */
3903 {
3904 for (fi = min;; fi++)
3905 {
3906 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
3907 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3908 if (fi >= max) RRETURN(MATCH_NOMATCH);
3909 if (eptr >= md->end_subject)
3910 {
3911 SCHECK_PARTIAL();
3912 RRETURN(MATCH_NOMATCH);
3913 }
3914 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3915 }
3916 }
3917 /* Control never gets here */
3918 }
3919
3920 /* Maximize case */
3921
3922 else
3923 {
3924 pp = eptr;
3925
3926 #ifdef SUPPORT_UTF
3927 if (utf)
3928 {
3929 unsigned int d;
3930 for (i = min; i < max; i++)
3931 {
3932 int len = 1;
3933 if (eptr >= md->end_subject)
3934 {
3935 SCHECK_PARTIAL();
3936 break;
3937 }
3938 GETCHARLEN(d, eptr, len);
3939 if (fc == d) break;
3940 eptr += len;
3941 }
3942 if (possessive) continue;
3943 for(;;)
3944 {
3945 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
3946 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3947 if (eptr-- == pp) break; /* Stop if tried at original pos */
3948 BACKCHAR(eptr);
3949 }
3950 }
3951 else
3952 #endif
3953 /* Not UTF mode */
3954 {
3955 for (i = min; i < max; i++)
3956 {
3957 if (eptr >= md->end_subject)
3958 {
3959 SCHECK_PARTIAL();
3960 break;
3961 }
3962 if (fc == *eptr) break;
3963 eptr++;
3964 }
3965 if (possessive) continue;
3966 while (eptr >= pp)
3967 {
3968 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
3969 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3970 eptr--;
3971 }
3972 }
3973
3974 RRETURN(MATCH_NOMATCH);
3975 }
3976 }
3977 /* Control never gets here */
3978
3979 /* Match a single character type repeatedly; several different opcodes
3980 share code. This is very similar to the code for single characters, but we
3981 repeat it in the interests of efficiency. */
3982
3983 case OP_TYPEEXACT:
3984 min = max = GET2(ecode, 1);
3985 minimize = TRUE;
3986 ecode += 1 + IMM2_SIZE;
3987 goto REPEATTYPE;
3988
3989 case OP_TYPEUPTO:
3990 case OP_TYPEMINUPTO:
3991 min = 0;
3992 max = GET2(ecode, 1);
3993 minimize = *ecode == OP_TYPEMINUPTO;
3994 ecode += 1 + IMM2_SIZE;
3995 goto REPEATTYPE;
3996
3997 case OP_TYPEPOSSTAR:
3998 possessive = TRUE;
3999 min = 0;
4000 max = INT_MAX;
4001 ecode++;
4002 goto REPEATTYPE;
4003
4004 case OP_TYPEPOSPLUS:
4005 possessive = TRUE;
4006 min = 1;
4007 max = INT_MAX;
4008 ecode++;
4009 goto REPEATTYPE;
4010
4011 case OP_TYPEPOSQUERY:
4012 possessive = TRUE;
4013 min = 0;
4014 max = 1;
4015 ecode++;
4016 goto REPEATTYPE;
4017
4018 case OP_TYPEPOSUPTO:
4019 possessive = TRUE;
4020 min = 0;
4021 max = GET2(ecode, 1);
4022 ecode += 1 + IMM2_SIZE;
4023 goto REPEATTYPE;
4024
4025 case OP_TYPESTAR:
4026 case OP_TYPEMINSTAR:
4027 case OP_TYPEPLUS:
4028 case OP_TYPEMINPLUS:
4029 case OP_TYPEQUERY:
4030 case OP_TYPEMINQUERY:
4031 c = *ecode++ - OP_TYPESTAR;
4032 minimize = (c & 1) != 0;
4033 min = rep_min[c]; /* Pick up values from tables; */
4034 max = rep_max[c]; /* zero for max => infinity */
4035 if (max == 0) max = INT_MAX;
4036
4037 /* Common code for all repeated single character type matches. Note that
4038 in UTF-8 mode, '.' matches a character of any length, but for the other
4039 character types, the valid characters are all one-byte long. */
4040
4041 REPEATTYPE:
4042 ctype = *ecode++; /* Code for the character type */
4043
4044 #ifdef SUPPORT_UCP
4045 if (ctype == OP_PROP || ctype == OP_NOTPROP)
4046 {
4047 prop_fail_result = ctype == OP_NOTPROP;
4048 prop_type = *ecode++;
4049 prop_value = *ecode++;
4050 }
4051 else prop_type = -1;
4052 #endif
4053
4054 /* First, ensure the minimum number of matches are present. Use inline
4055 code for maximizing the speed, and do the type test once at the start
4056 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
4057 is tidier. Also separate the UCP code, which can be the same for both UTF-8
4058 and single-bytes. */
4059
4060 if (min > 0)
4061 {
4062 #ifdef SUPPORT_UCP
4063 if (prop_type >= 0)
4064 {
4065 switch(prop_type)
4066 {
4067 case PT_ANY:
4068 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4069 for (i = 1; i <= min; i++)
4070 {
4071 if (eptr >= md->end_subject)
4072 {
4073 SCHECK_PARTIAL();
4074 RRETURN(MATCH_NOMATCH);
4075 }
4076 GETCHARINCTEST(c, eptr);
4077 }
4078 break;
4079
4080 case PT_LAMP:
4081 for (i = 1; i <= min; i++)
4082 {
4083 int chartype;
4084 if (eptr >= md->end_subject)
4085 {
4086 SCHECK_PARTIAL();
4087 RRETURN(MATCH_NOMATCH);
4088 }
4089 GETCHARINCTEST(c, eptr);
4090 chartype = UCD_CHARTYPE(c);
4091 if ((chartype == ucp_Lu ||
4092 chartype == ucp_Ll ||
4093 chartype == ucp_Lt) == prop_fail_result)
4094 RRETURN(MATCH_NOMATCH);
4095 }
4096 break;
4097
4098 case PT_GC:
4099 for (i = 1; i <= min; i++)
4100 {
4101 if (eptr >= md->end_subject)
4102 {
4103 SCHECK_PARTIAL();
4104 RRETURN(MATCH_NOMATCH);
4105 }
4106 GETCHARINCTEST(c, eptr);
4107 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4108 RRETURN(MATCH_NOMATCH);
4109 }
4110 break;
4111
4112 case PT_PC:
4113 for (i = 1; i <= min; i++)
4114 {
4115 if (eptr >= md->end_subject)
4116 {
4117 SCHECK_PARTIAL();
4118 RRETURN(MATCH_NOMATCH);
4119 }
4120 GETCHARINCTEST(c, eptr);
4121 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4122 RRETURN(MATCH_NOMATCH);
4123 }
4124 break;
4125
4126 case PT_SC:
4127 for (i = 1; i <= min; i++)
4128 {
4129 if (eptr >= md->end_subject)
4130 {
4131 SCHECK_PARTIAL();
4132 RRETURN(MATCH_NOMATCH);
4133 }
4134 GETCHARINCTEST(c, eptr);
4135 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4136 RRETURN(MATCH_NOMATCH);
4137 }
4138 break;
4139
4140 case PT_ALNUM:
4141 for (i = 1; i <= min; i++)
4142 {
4143 int category;
4144 if (eptr >= md->end_subject)
4145 {
4146 SCHECK_PARTIAL();
4147 RRETURN(MATCH_NOMATCH);
4148 }
4149 GETCHARINCTEST(c, eptr);
4150 category = UCD_CATEGORY(c);
4151 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4152 RRETURN(MATCH_NOMATCH);
4153 }
4154 break;
4155
4156 case PT_SPACE: /* Perl space */
4157 for (i = 1; i <= min; i++)
4158 {
4159 if (eptr >= md->end_subject)
4160 {
4161 SCHECK_PARTIAL();
4162 RRETURN(MATCH_NOMATCH);
4163 }
4164 GETCHARINCTEST(c, eptr);
4165 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4166 c == CHAR_FF || c == CHAR_CR)
4167 == prop_fail_result)
4168 RRETURN(MATCH_NOMATCH);
4169 }
4170 break;
4171
4172 case PT_PXSPACE: /* POSIX space */
4173 for (i = 1; i <= min; i++)
4174 {
4175 if (eptr >= md->end_subject)
4176 {
4177 SCHECK_PARTIAL();
4178 RRETURN(MATCH_NOMATCH);
4179 }
4180 GETCHARINCTEST(c, eptr);
4181 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4182 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4183 == prop_fail_result)
4184 RRETURN(MATCH_NOMATCH);
4185 }
4186 break;
4187
4188 case PT_WORD:
4189 for (i = 1; i <= min; i++)
4190 {
4191 int category;
4192 if (eptr >= md->end_subject)
4193 {
4194 SCHECK_PARTIAL();
4195 RRETURN(MATCH_NOMATCH);
4196 }
4197 GETCHARINCTEST(c, eptr);
4198 category = UCD_CATEGORY(c);
4199 if ((category == ucp_L || category == ucp_N || c == CHAR_UNDERSCORE)
4200 == prop_fail_result)
4201 RRETURN(MATCH_NOMATCH);
4202 }
4203 break;
4204
4205 /* This should not occur */
4206
4207 default:
4208 RRETURN(PCRE_ERROR_INTERNAL);
4209 }
4210 }
4211
4212 /* Match extended Unicode sequences. We will get here only if the
4213 support is in the binary; otherwise a compile-time error occurs. */
4214
4215 else if (ctype == OP_EXTUNI)
4216 {
4217 for (i = 1; i <= min; i++)
4218 {
4219 if (eptr >= md->end_subject)
4220 {
4221 SCHECK_PARTIAL();
4222 RRETURN(MATCH_NOMATCH);
4223 }
4224 GETCHARINCTEST(c, eptr);
4225 if (UCD_CATEGORY(c) == ucp_M) RRETURN(MATCH_NOMATCH);
4226 while (eptr < md->end_subject)
4227 {
4228 int len = 1;
4229 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4230 if (UCD_CATEGORY(c) != ucp_M) break;
4231 eptr += len;
4232 }
4233 CHECK_PARTIAL();
4234 }
4235 }
4236
4237 else
4238 #endif /* SUPPORT_UCP */
4239
4240 /* Handle all other cases when the coding is UTF-8 */
4241
4242 #ifdef SUPPORT_UTF
4243 if (utf) switch(ctype)
4244 {
4245 case OP_ANY:
4246 for (i = 1; i <= min; i++)
4247 {
4248 if (eptr >= md->end_subject)
4249 {
4250 SCHECK_PARTIAL();
4251 RRETURN(MATCH_NOMATCH);
4252 }
4253 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4254 if (md->partial != 0 &&
4255 eptr + 1 >= md->end_subject &&
4256 NLBLOCK->nltype == NLTYPE_FIXED &&
4257 NLBLOCK->nllen == 2 &&
4258 *eptr == NLBLOCK->nl[0])
4259 {
4260 md->hitend = TRUE;
4261 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
4262 }
4263 eptr++;
4264 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4265 }
4266 break;
4267
4268 case OP_ALLANY:
4269 for (i = 1; i <= min; i++)
4270 {
4271 if (eptr >= md->end_subject)
4272 {
4273 SCHECK_PARTIAL();
4274 RRETURN(MATCH_NOMATCH);
4275 }
4276 eptr++;
4277 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4278 }
4279 break;
4280
4281 case OP_ANYBYTE:
4282 if (eptr > md->end_subject - min) RRETURN(MATCH_NOMATCH);
4283 eptr += min;
4284 break;
4285
4286 case OP_ANYNL:
4287 for (i = 1; i <= min; i++)
4288 {
4289 if (eptr >= md->end_subject)
4290 {
4291 SCHECK_PARTIAL();
4292 RRETURN(MATCH_NOMATCH);
4293 }
4294 GETCHARINC(c, eptr);
4295 switch(c)
4296 {
4297 default: RRETURN(MATCH_NOMATCH);
4298
4299 case 0x000d:
4300 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4301 break;
4302
4303 case 0x000a:
4304 break;
4305
4306 case 0x000b:
4307 case 0x000c:
4308 case 0x0085:
4309 case 0x2028:
4310 case 0x2029:
4311 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4312 break;
4313 }
4314 }
4315 break;
4316
4317 case OP_NOT_HSPACE:
4318 for (i = 1; i <= min; i++)
4319 {
4320 if (eptr >= md->end_subject)
4321 {
4322 SCHECK_PARTIAL();
4323 RRETURN(MATCH_NOMATCH);
4324 }
4325 GETCHARINC(c, eptr);
4326 switch(c)
4327 {
4328 default: break;
4329 case 0x09: /* HT */
4330 case 0x20: /* SPACE */
4331 case 0xa0: /* NBSP */
4332 case 0x1680: /* OGHAM SPACE MARK */
4333 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4334 case 0x2000: /* EN QUAD */
4335 case 0x2001: /* EM QUAD */
4336 case 0x2002: /* EN SPACE */
4337 case 0x2003: /* EM SPACE */
4338 case 0x2004: /* THREE-PER-EM SPACE */
4339 case 0x2005: /* FOUR-PER-EM SPACE */
4340 case 0x2006: /* SIX-PER-EM SPACE */
4341 case 0x2007: /* FIGURE SPACE */
4342 case 0x2008: /* PUNCTUATION SPACE */
4343 case 0x2009: /* THIN SPACE */
4344 case 0x200A: /* HAIR SPACE */
4345 case 0x202f: /* NARROW NO-BREAK SPACE */
4346 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4347 case 0x3000: /* IDEOGRAPHIC SPACE */
4348 RRETURN(MATCH_NOMATCH);
4349 }
4350 }
4351 break;
4352
4353 case OP_HSPACE:
4354 for (i = 1; i <= min; i++)
4355 {
4356 if (eptr >= md->end_subject)
4357 {
4358 SCHECK_PARTIAL();
4359 RRETURN(MATCH_NOMATCH);
4360 }
4361 GETCHARINC(c, eptr);
4362 switch(c)
4363 {
4364 default: RRETURN(MATCH_NOMATCH);
4365 case 0x09: /* HT */
4366 case 0x20: /* SPACE */
4367 case 0xa0: /* NBSP */
4368 case 0x1680: /* OGHAM SPACE MARK */
4369 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4370 case 0x2000: /* EN QUAD */
4371 case 0x2001: /* EM QUAD */
4372 case 0x2002: /* EN SPACE */
4373 case 0x2003: /* EM SPACE */
4374 case 0x2004: /* THREE-PER-EM SPACE */
4375 case 0x2005: /* FOUR-PER-EM SPACE */
4376 case 0x2006: /* SIX-PER-EM SPACE */
4377 case 0x2007: /* FIGURE SPACE */
4378 case 0x2008: /* PUNCTUATION SPACE */
4379 case 0x2009: /* THIN SPACE */
4380 case 0x200A: /* HAIR SPACE */
4381 case 0x202f: /* NARROW NO-BREAK SPACE */
4382 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4383 case 0x3000: /* IDEOGRAPHIC SPACE */
4384 break;
4385 }
4386 }
4387 break;
4388
4389 case OP_NOT_VSPACE:
4390 for (i = 1; i <= min; i++)
4391 {
4392 if (eptr >= md->end_subject)
4393 {
4394 SCHECK_PARTIAL();
4395 RRETURN(MATCH_NOMATCH);
4396 }
4397 GETCHARINC(c, eptr);
4398 switch(c)
4399 {
4400 default: break;
4401 case 0x0a: /* LF */
4402 case 0x0b: /* VT */
4403 case 0x0c: /* FF */
4404 case 0x0d: /* CR */
4405 case 0x85: /* NEL */
4406 case 0x2028: /* LINE SEPARATOR */
4407 case 0x2029: /* PARAGRAPH SEPARATOR */
4408 RRETURN(MATCH_NOMATCH);
4409 }
4410 }
4411 break;
4412
4413 case OP_VSPACE:
4414 for (i = 1; i <= min; i++)
4415 {
4416 if (eptr >= md->end_subject)
4417 {
4418 SCHECK_PARTIAL();
4419 RRETURN(MATCH_NOMATCH);
4420 }
4421 GETCHARINC(c, eptr);
4422 switch(c)
4423 {
4424 default: RRETURN(MATCH_NOMATCH);
4425 case 0x0a: /* LF */
4426 case 0x0b: /* VT */
4427 case 0x0c: /* FF */
4428 case 0x0d: /* CR */
4429 case 0x85: /* NEL */
4430 case 0x2028: /* LINE SEPARATOR */
4431 case 0x2029: /* PARAGRAPH SEPARATOR */
4432 break;
4433 }
4434 }
4435 break;
4436
4437 case OP_NOT_DIGIT:
4438 for (i = 1; i <= min; i++)
4439 {
4440 if (eptr >= md->end_subject)
4441 {
4442 SCHECK_PARTIAL();
4443 RRETURN(MATCH_NOMATCH);
4444 }
4445 GETCHARINC(c, eptr);
4446 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
4447 RRETURN(MATCH_NOMATCH);
4448 }
4449 break;
4450
4451 case OP_DIGIT:
4452 for (i = 1; i <= min; i++)
4453 {
4454 if (eptr >= md->end_subject)
4455 {
4456 SCHECK_PARTIAL();
4457 RRETURN(MATCH_NOMATCH);
4458 }
4459 if (*eptr >= 128 || (md->ctypes[*eptr] & ctype_digit) == 0)
4460 RRETURN(MATCH_NOMATCH);
4461 eptr++;
4462 /* No need to skip more bytes - we know it's a 1-byte character */
4463 }
4464 break;
4465
4466 case OP_NOT_WHITESPACE:
4467 for (i = 1; i <= min; i++)
4468 {
4469 if (eptr >= md->end_subject)
4470 {
4471 SCHECK_PARTIAL();
4472 RRETURN(MATCH_NOMATCH);
4473 }
4474 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
4475 RRETURN(MATCH_NOMATCH);
4476 eptr++;
4477 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4478 }
4479 break;
4480
4481 case OP_WHITESPACE:
4482 for (i = 1; i <= min; i++)
4483 {
4484 if (eptr >= md->end_subject)
4485 {
4486 SCHECK_PARTIAL();
4487 RRETURN(MATCH_NOMATCH);
4488 }
4489 if (*eptr >= 128 || (md->ctypes[*eptr] & ctype_space) == 0)
4490 RRETURN(MATCH_NOMATCH);
4491 eptr++;
4492 /* No need to skip more bytes - we know it's a 1-byte character */
4493 }
4494 break;
4495
4496 case OP_NOT_WORDCHAR:
4497 for (i = 1; i <= min; i++)
4498 {
4499 if (eptr >= md->end_subject)
4500 {
4501 SCHECK_PARTIAL();
4502 RRETURN(MATCH_NOMATCH);
4503 }
4504 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0)
4505 RRETURN(MATCH_NOMATCH);
4506 eptr++;
4507 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4508 }
4509 break;
4510
4511 case OP_WORDCHAR:
4512 for (i = 1; i <= min; i++)
4513 {
4514 if (eptr >= md->end_subject)
4515 {
4516 SCHECK_PARTIAL();
4517 RRETURN(MATCH_NOMATCH);
4518 }
4519 if (*eptr >= 128 || (md->ctypes[*eptr] & ctype_word) == 0)
4520 RRETURN(MATCH_NOMATCH);
4521 eptr++;
4522 /* No need to skip more bytes - we know it's a 1-byte character */
4523 }
4524 break;
4525
4526 default:
4527 RRETURN(PCRE_ERROR_INTERNAL);
4528 } /* End switch(ctype) */
4529
4530 else
4531 #endif /* SUPPORT_UTF */
4532
4533 /* Code for the non-UTF-8 case for minimum matching of operators other
4534 than OP_PROP and OP_NOTPROP. */
4535
4536 switch(ctype)
4537 {
4538 case OP_ANY:
4539 for (i = 1; i <= min; i++)
4540 {
4541 if (eptr >= md->end_subject)
4542 {
4543 SCHECK_PARTIAL();
4544 RRETURN(MATCH_NOMATCH);
4545 }
4546 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4547 if (md->partial != 0 &&
4548 eptr + 1 >= md->end_subject &&
4549 NLBLOCK->nltype == NLTYPE_FIXED &&
4550 NLBLOCK->nllen == 2 &&
4551 *eptr == NLBLOCK->nl[0])
4552 {
4553 md->hitend = TRUE;
4554 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
4555 }
4556 eptr++;
4557 }
4558 break;
4559
4560 case OP_ALLANY:
4561 if (eptr > md->end_subject - min)
4562 {
4563 SCHECK_PARTIAL();
4564 RRETURN(MATCH_NOMATCH);
4565 }
4566 eptr += min;
4567 break;
4568
4569 case OP_ANYBYTE:
4570 if (eptr > md->end_subject - min)
4571 {
4572 SCHECK_PARTIAL();
4573 RRETURN(MATCH_NOMATCH);
4574 }
4575 eptr += min;
4576 break;
4577
4578 case OP_ANYNL:
4579 for (i = 1; i <= min; i++)
4580 {
4581 if (eptr >= md->end_subject)
4582 {
4583 SCHECK_PARTIAL();
4584 RRETURN(MATCH_NOMATCH);
4585 }
4586 switch(*eptr++)
4587 {
4588 default: RRETURN(MATCH_NOMATCH);
4589
4590 case 0x000d:
4591 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4592 break;
4593
4594 case 0x000a:
4595 break;
4596
4597 case 0x000b:
4598 case 0x000c:
4599 case 0x0085:
4600 #ifdef COMPILE_PCRE16
4601 case 0x2028:
4602 case 0x2029:
4603 #endif
4604 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4605 break;
4606 }
4607 }
4608 break;
4609
4610 case OP_NOT_HSPACE:
4611 for (i = 1; i <= min; i++)
4612 {
4613 if (eptr >= md->end_subject)
4614 {
4615 SCHECK_PARTIAL();
4616 RRETURN(MATCH_NOMATCH);
4617 }
4618 switch(*eptr++)
4619 {
4620 default: break;
4621 case 0x09: /* HT */
4622 case 0x20: /* SPACE */
4623 case 0xa0: /* NBSP */
4624 #ifdef COMPILE_PCRE16
4625 case 0x1680: /* OGHAM SPACE MARK */
4626 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4627 case 0x2000: /* EN QUAD */
4628 case 0x2001: /* EM QUAD */
4629 case 0x2002: /* EN SPACE */
4630 case 0x2003: /* EM SPACE */
4631 case 0x2004: /* THREE-PER-EM SPACE */
4632 case 0x2005: /* FOUR-PER-EM SPACE */
4633 case 0x2006: /* SIX-PER-EM SPACE */
4634 case 0x2007: /* FIGURE SPACE */
4635 case 0x2008: /* PUNCTUATION SPACE */
4636 case 0x2009: /* THIN SPACE */
4637 case 0x200A: /* HAIR SPACE */
4638 case 0x202f: /* NARROW NO-BREAK SPACE */
4639 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4640 case 0x3000: /* IDEOGRAPHIC SPACE */
4641 #endif
4642 RRETURN(MATCH_NOMATCH);
4643 }
4644 }
4645 break;
4646
4647 case OP_HSPACE:
4648 for (i = 1; i <= min; i++)
4649 {
4650 if (eptr >= md->end_subject)
4651 {
4652 SCHECK_PARTIAL();
4653 RRETURN(MATCH_NOMATCH);
4654 }
4655 switch(*eptr++)
4656 {
4657 default: RRETURN(MATCH_NOMATCH);
4658 case 0x09: /* HT */
4659 case 0x20: /* SPACE */
4660 case 0xa0: /* NBSP */
4661 #ifdef COMPILE_PCRE16
4662 case 0x1680: /* OGHAM SPACE MARK */
4663 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4664 case 0x2000: /* EN QUAD */
4665 case 0x2001: /* EM QUAD */
4666 case 0x2002: /* EN SPACE */
4667 case 0x2003: /* EM SPACE */
4668 case 0x2004: /* THREE-PER-EM SPACE */
4669 case 0x2005: /* FOUR-PER-EM SPACE */
4670 case 0x2006: /* SIX-PER-EM SPACE */
4671 case 0x2007: /* FIGURE SPACE */
4672 case 0x2008: /* PUNCTUATION SPACE */
4673 case 0x2009: /* THIN SPACE */
4674 case 0x200A: /* HAIR SPACE */
4675 case 0x202f: /* NARROW NO-BREAK SPACE */
4676 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4677 case 0x3000: /* IDEOGRAPHIC SPACE */
4678 #endif
4679 break;
4680 }
4681 }
4682 break;
4683
4684 case OP_NOT_VSPACE:
4685 for (i = 1; i <= min; i++)
4686 {
4687 if (eptr >= md->end_subject)
4688 {
4689 SCHECK_PARTIAL();
4690 RRETURN(MATCH_NOMATCH);
4691 }
4692 switch(*eptr++)
4693 {
4694 default: break;
4695 case 0x0a: /* LF */
4696 case 0x0b: /* VT */
4697 case 0x0c: /* FF */
4698 case 0x0d: /* CR */
4699 case 0x85: /* NEL */
4700 #ifdef COMPILE_PCRE16
4701 case 0x2028: /* LINE SEPARATOR */
4702 case 0x2029: /* PARAGRAPH SEPARATOR */
4703 #endif
4704 RRETURN(MATCH_NOMATCH);
4705 }
4706 }
4707 break;
4708
4709 case OP_VSPACE:
4710 for (i = 1; i <= min; i++)
4711 {
4712 if (eptr >= md->end_subject)
4713 {
4714 SCHECK_PARTIAL();
4715 RRETURN(MATCH_NOMATCH);
4716 }
4717 switch(*eptr++)
4718 {
4719 default: RRETURN(MATCH_NOMATCH);
4720 case 0x0a: /* LF */
4721 case 0x0b: /* VT */
4722 case 0x0c: /* FF */
4723 case 0x0d: /* CR */
4724 case 0x85: /* NEL */
4725 #ifdef COMPILE_PCRE16
4726 case 0x2028: /* LINE SEPARATOR */
4727 case 0x2029: /* PARAGRAPH SEPARATOR */
4728 #endif
4729 break;
4730 }
4731 }
4732 break;
4733
4734 case OP_NOT_DIGIT:
4735 for (i = 1; i <= min; i++)
4736 {
4737 if (eptr >= md->end_subject)
4738 {
4739 SCHECK_PARTIAL();
4740 RRETURN(MATCH_NOMATCH);
4741 }
4742 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0)
4743 RRETURN(MATCH_NOMATCH);
4744 eptr++;
4745 }
4746 break;
4747
4748 case OP_DIGIT:
4749 for (i = 1; i <= min; i++)
4750 {
4751 if (eptr >= md->end_subject)
4752 {
4753 SCHECK_PARTIAL();
4754 RRETURN(MATCH_NOMATCH);
4755 }
4756 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0)
4757 RRETURN(MATCH_NOMATCH);
4758 eptr++;
4759 }
4760 break;
4761
4762 case OP_NOT_WHITESPACE:
4763 for (i = 1; i <= min; i++)
4764 {
4765 if (eptr >= md->end_subject)
4766 {
4767 SCHECK_PARTIAL();
4768 RRETURN(MATCH_NOMATCH);
4769 }
4770 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0)
4771 RRETURN(MATCH_NOMATCH);
4772 eptr++;
4773 }
4774 break;
4775
4776 case OP_WHITESPACE:
4777 for (i = 1; i <= min; i++)
4778 {
4779 if (eptr >= md->end_subject)
4780 {
4781 SCHECK_PARTIAL();
4782 RRETURN(MATCH_NOMATCH);
4783 }
4784 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0)
4785 RRETURN(MATCH_NOMATCH);
4786 eptr++;
4787 }
4788 break;
4789
4790 case OP_NOT_WORDCHAR:
4791 for (i = 1; i <= min; i++)
4792 {
4793 if (eptr >= md->end_subject)
4794 {
4795 SCHECK_PARTIAL();
4796 RRETURN(MATCH_NOMATCH);
4797 }
4798 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0)
4799 RRETURN(MATCH_NOMATCH);
4800 eptr++;
4801 }
4802 break;
4803
4804 case OP_WORDCHAR:
4805 for (i = 1; i <= min; i++)
4806 {
4807 if (eptr >= md->end_subject)
4808 {
4809 SCHECK_PARTIAL();
4810 RRETURN(MATCH_NOMATCH);
4811 }
4812 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0)
4813 RRETURN(MATCH_NOMATCH);
4814 eptr++;
4815 }
4816 break;
4817
4818 default:
4819 RRETURN(PCRE_ERROR_INTERNAL);
4820 }
4821 }
4822
4823 /* If min = max, continue at the same level without recursing */
4824
4825 if (min == max) continue;
4826
4827 /* If minimizing, we have to test the rest of the pattern before each
4828 subsequent match. Again, separate the UTF-8 case for speed, and also
4829 separate the UCP cases. */
4830
4831 if (minimize)
4832 {
4833 #ifdef SUPPORT_UCP
4834 if (prop_type >= 0)
4835 {
4836 switch(prop_type)
4837 {
4838 case PT_ANY:
4839 for (fi = min;; fi++)
4840 {
4841 RMATCH(eptr, ecode, offset_top, md, eptrb, RM36);
4842 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4843 if (fi >= max) RRETURN(MATCH_NOMATCH);
4844 if (eptr >= md->end_subject)
4845 {
4846 SCHECK_PARTIAL();
4847 RRETURN(MATCH_NOMATCH);
4848 }
4849 GETCHARINCTEST(c, eptr);
4850 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4851 }
4852 /* Control never gets here */
4853
4854 case PT_LAMP:
4855 for (fi = min;; fi++)
4856 {
4857 int chartype;
4858 RMATCH(eptr, ecode, offset_top, md, eptrb, RM37);
4859 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4860 if (fi >= max) RRETURN(MATCH_NOMATCH);
4861 if (eptr >= md->end_subject)
4862 {
4863 SCHECK_PARTIAL();
4864 RRETURN(MATCH_NOMATCH);
4865 }
4866 GETCHARINCTEST(c, eptr);
4867 chartype = UCD_CHARTYPE(c);
4868 if ((chartype == ucp_Lu ||
4869 chartype == ucp_Ll ||
4870 chartype == ucp_Lt) == prop_fail_result)
4871 RRETURN(MATCH_NOMATCH);
4872 }
4873 /* Control never gets here */
4874
4875 case PT_GC:
4876 for (fi = min;; fi++)
4877 {
4878 RMATCH(eptr, ecode, offset_top, md, eptrb, RM38);
4879 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4880 if (fi >= max) RRETURN(MATCH_NOMATCH);
4881 if (eptr >= md->end_subject)
4882 {
4883 SCHECK_PARTIAL();
4884 RRETURN(MATCH_NOMATCH);
4885 }
4886 GETCHARINCTEST(c, eptr);
4887 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4888 RRETURN(MATCH_NOMATCH);
4889 }
4890 /* Control never gets here */
4891
4892 case PT_PC:
4893 for (fi = min;; fi++)
4894 {
4895 RMATCH(eptr, ecode, offset_top, md, eptrb, RM39);
4896 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4897 if (fi >= max) RRETURN(MATCH_NOMATCH);
4898 if (eptr >= md->end_subject)
4899 {
4900 SCHECK_PARTIAL();
4901 RRETURN(MATCH_NOMATCH);
4902 }
4903 GETCHARINCTEST(c, eptr);
4904 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4905 RRETURN(MATCH_NOMATCH);
4906 }
4907 /* Control never gets here */
4908
4909 case PT_SC:
4910 for (fi = min;; fi++)
4911 {
4912 RMATCH(eptr, ecode, offset_top, md, eptrb, RM40);
4913 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4914 if (fi >= max) RRETURN(MATCH_NOMATCH);
4915 if (eptr >= md->end_subject)
4916 {
4917 SCHECK_PARTIAL();
4918 RRETURN(MATCH_NOMATCH);
4919 }
4920 GETCHARINCTEST(c, eptr);
4921 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4922 RRETURN(MATCH_NOMATCH);
4923 }
4924 /* Control never gets here */
4925
4926 case PT_ALNUM:
4927 for (fi = min;; fi++)
4928 {
4929 int category;
4930 RMATCH(eptr, ecode, offset_top, md, eptrb, RM59);
4931 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4932 if (fi >= max) RRETURN(MATCH_NOMATCH);
4933 if (eptr >= md->end_subject)
4934 {
4935 SCHECK_PARTIAL();
4936 RRETURN(MATCH_NOMATCH);
4937 }
4938 GETCHARINCTEST(c, eptr);
4939 category = UCD_CATEGORY(c);
4940 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4941 RRETURN(MATCH_NOMATCH);
4942 }
4943 /* Control never gets here */
4944
4945 case PT_SPACE: /* Perl space */
4946 for (fi = min;; fi++)
4947 {
4948 RMATCH(eptr, ecode, offset_top, md, eptrb, RM60);
4949 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4950 if (fi >= max) RRETURN(MATCH_NOMATCH);
4951 if (eptr >= md->end_subject)
4952 {
4953 SCHECK_PARTIAL();
4954 RRETURN(MATCH_NOMATCH);
4955 }
4956 GETCHARINCTEST(c, eptr);
4957 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4958 c == CHAR_FF || c == CHAR_CR)
4959 == prop_fail_result)
4960 RRETURN(MATCH_NOMATCH);
4961 }
4962 /* Control never gets here */
4963
4964 case PT_PXSPACE: /* POSIX space */
4965 for (fi = min;; fi++)
4966 {
4967 RMATCH(eptr, ecode, offset_top, md, eptrb, RM61);
4968 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4969 if (fi >= max) RRETURN(MATCH_NOMATCH);
4970 if (eptr >= md->end_subject)
4971 {
4972 SCHECK_PARTIAL();
4973 RRETURN(MATCH_NOMATCH);
4974 }
4975 GETCHARINCTEST(c, eptr);
4976 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4977 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4978 == prop_fail_result)
4979 RRETURN(MATCH_NOMATCH);
4980 }
4981 /* Control never gets here */
4982
4983 case PT_WORD:
4984 for (fi = min;; fi++)
4985 {
4986 int category;
4987 RMATCH(eptr, ecode, offset_top, md, eptrb, RM62);
4988 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4989 if (fi >= max) RRETURN(MATCH_NOMATCH);
4990 if (eptr >= md->end_subject)
4991 {
4992 SCHECK_PARTIAL();
4993 RRETURN(MATCH_NOMATCH);
4994 }
4995 GETCHARINCTEST(c, eptr);
4996 category = UCD_CATEGORY(c);
4997 if ((category == ucp_L ||
4998 category == ucp_N ||
4999 c == CHAR_UNDERSCORE)
5000 == prop_fail_result)
5001 RRETURN(MATCH_NOMATCH);
5002 }
5003 /* Control never gets here */
5004
5005 /* This should never occur */
5006
5007 default:
5008 RRETURN(PCRE_ERROR_INTERNAL);
5009 }
5010 }
5011
5012 /* Match extended Unicode sequences. We will get here only if the
5013 support is in the binary; otherwise a compile-time error occurs. */
5014
5015 else if (ctype == OP_EXTUNI)
5016 {
5017 for (fi = min;; fi++)
5018 {
5019 RMATCH(eptr, ecode, offset_top, md, eptrb, RM41);
5020 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5021 if (fi >= max) RRETURN(MATCH_NOMATCH);
5022 if (eptr >= md->end_subject)
5023 {
5024 SCHECK_PARTIAL();
5025 RRETURN(MATCH_NOMATCH);
5026 }
5027 GETCHARINCTEST(c, eptr);
5028 if (UCD_CATEGORY(c) == ucp_M) RRETURN(MATCH_NOMATCH);
5029 while (eptr < md->end_subject)
5030 {
5031 int len = 1;
5032 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5033 if (UCD_CATEGORY(c) != ucp_M) break;
5034 eptr += len;
5035 }
5036 CHECK_PARTIAL();
5037 }
5038 }
5039 else
5040 #endif /* SUPPORT_UCP */
5041
5042 #ifdef SUPPORT_UTF
5043 if (utf)
5044 {
5045 for (fi = min;; fi++)
5046 {
5047 RMATCH(eptr, ecode, offset_top, md, eptrb, RM42);
5048 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5049 if (fi >= max) RRETURN(MATCH_NOMATCH);
5050 if (eptr >= md->end_subject)
5051 {
5052 SCHECK_PARTIAL();
5053 RRETURN(MATCH_NOMATCH);
5054 }
5055 if (ctype == OP_ANY && IS_NEWLINE(eptr))
5056 RRETURN(MATCH_NOMATCH);
5057 GETCHARINC(c, eptr);
5058 switch(ctype)
5059 {
5060 case OP_ANY: /* This is the non-NL case */
5061 if (md->partial != 0 && /* Take care with CRLF partial */
5062 eptr >= md->end_subject &&
5063 NLBLOCK->nltype == NLTYPE_FIXED &&
5064 NLBLOCK->nllen == 2 &&
5065 c == NLBLOCK->nl[0])
5066 {
5067 md->hitend = TRUE;
5068 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5069 }
5070 break;
5071
5072 case OP_ALLANY:
5073 case OP_ANYBYTE:
5074 break;
5075
5076 case OP_ANYNL:
5077 switch(c)
5078 {
5079 default: RRETURN(MATCH_NOMATCH);
5080 case 0x000d:
5081 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
5082 break;
5083 case 0x000a:
5084 break;
5085
5086 case 0x000b:
5087 case 0x000c:
5088 case 0x0085:
5089 case 0x2028:
5090 case 0x2029:
5091 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5092 break;
5093 }
5094 break;
5095
5096 case OP_NOT_HSPACE:
5097 switch(c)
5098 {
5099 default: break;
5100 case 0x09: /* HT */
5101 case 0x20: /* SPACE */
5102 case 0xa0: /* NBSP */
5103 case 0x1680: /* OGHAM SPACE MARK */
5104 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5105 case 0x2000: /* EN QUAD */
5106 case 0x2001: /* EM QUAD */
5107 case 0x2002: /* EN SPACE */
5108 case 0x2003: /* EM SPACE */
5109 case 0x2004: /* THREE-PER-EM SPACE */
5110 case 0x2005: /* FOUR-PER-EM SPACE */
5111 case 0x2006: /* SIX-PER-EM SPACE */
5112 case 0x2007: /* FIGURE SPACE */
5113 case 0x2008: /* PUNCTUATION SPACE */
5114 case 0x2009: /* THIN SPACE */
5115 case 0x200A: /* HAIR SPACE */
5116 case 0x202f: /* NARROW NO-BREAK SPACE */
5117 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5118 case 0x3000: /* IDEOGRAPHIC SPACE */
5119 RRETURN(MATCH_NOMATCH);
5120 }
5121 break;
5122
5123 case OP_HSPACE:
5124 switch(c)
5125 {
5126 default: RRETURN(MATCH_NOMATCH);
5127 case 0x09: /* HT */
5128 case 0x20: /* SPACE */
5129 case 0xa0: /* NBSP */
5130 case 0x1680: /* OGHAM SPACE MARK */
5131 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5132 case 0x2000: /* EN QUAD */
5133 case 0x2001: /* EM QUAD */
5134 case 0x2002: /* EN SPACE */
5135 case 0x2003: /* EM SPACE */
5136 case 0x2004: /* THREE-PER-EM SPACE */
5137 case 0x2005: /* FOUR-PER-EM SPACE */
5138 case 0x2006: /* SIX-PER-EM SPACE */
5139 case 0x2007: /* FIGURE SPACE */
5140 case 0x2008: /* PUNCTUATION SPACE */
5141 case 0x2009: /* THIN SPACE */
5142 case 0x200A: /* HAIR SPACE */
5143 case 0x202f: /* NARROW NO-BREAK SPACE */
5144 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5145 case 0x3000: /* IDEOGRAPHIC SPACE */
5146 break;
5147 }
5148 break;
5149
5150 case OP_NOT_VSPACE:
5151 switch(c)
5152 {
5153 default: break;
5154 case 0x0a: /* LF */
5155 case 0x0b: /* VT */
5156 case 0x0c: /* FF */
5157 case 0x0d: /* CR */
5158 case 0x85: /* NEL */
5159 case 0x2028: /* LINE SEPARATOR */
5160 case 0x2029: /* PARAGRAPH SEPARATOR */
5161 RRETURN(MATCH_NOMATCH);
5162 }
5163 break;
5164
5165 case OP_VSPACE:
5166 switch(c)
5167 {
5168 default: RRETURN(MATCH_NOMATCH);
5169 case 0x0a: /* LF */
5170 case 0x0b: /* VT */
5171 case 0x0c: /* FF */
5172 case 0x0d: /* CR */
5173 case 0x85: /* NEL */
5174 case 0x2028: /* LINE SEPARATOR */
5175 case 0x2029: /* PARAGRAPH SEPARATOR */
5176 break;
5177 }
5178 break;
5179
5180 case OP_NOT_DIGIT:
5181 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
5182 RRETURN(MATCH_NOMATCH);
5183 break;
5184
5185 case OP_DIGIT:
5186 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
5187 RRETURN(MATCH_NOMATCH);
5188 break;
5189
5190 case OP_NOT_WHITESPACE:
5191 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
5192 RRETURN(MATCH_NOMATCH);
5193 break;
5194
5195 case OP_WHITESPACE:
5196 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
5197 RRETURN(MATCH_NOMATCH);
5198 break;
5199
5200 case OP_NOT_WORDCHAR:
5201 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
5202 RRETURN(MATCH_NOMATCH);
5203 break;
5204
5205 case OP_WORDCHAR:
5206 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
5207 RRETURN(MATCH_NOMATCH);
5208 break;
5209
5210 default:
5211 RRETURN(PCRE_ERROR_INTERNAL);
5212 }
5213 }
5214 }
5215 else
5216 #endif
5217 /* Not UTF mode */
5218 {
5219 for (fi = min;; fi++)
5220 {
5221 RMATCH(eptr, ecode, offset_top, md, eptrb, RM43);
5222 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5223 if (fi >= max) RRETURN(MATCH_NOMATCH);
5224 if (eptr >= md->end_subject)
5225 {
5226 SCHECK_PARTIAL();
5227 RRETURN(MATCH_NOMATCH);
5228 }
5229 if (ctype == OP_ANY && IS_NEWLINE(eptr))
5230 RRETURN(MATCH_NOMATCH);
5231 c = *eptr++;
5232 switch(ctype)
5233 {
5234 case OP_ANY: /* This is the non-NL case */
5235 if (md->partial != 0 && /* Take care with CRLF partial */
5236 eptr >= md->end_subject &&
5237 NLBLOCK->nltype == NLTYPE_FIXED &&
5238 NLBLOCK->nllen == 2 &&
5239 c == NLBLOCK->nl[0])
5240 {
5241 md->hitend = TRUE;
5242 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5243 }
5244 break;
5245
5246 case OP_ALLANY:
5247 case OP_ANYBYTE:
5248 break;
5249
5250 case OP_ANYNL:
5251 switch(c)
5252 {
5253 default: RRETURN(MATCH_NOMATCH);
5254 case 0x000d:
5255 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
5256 break;
5257
5258 case 0x000a:
5259 break;
5260
5261 case 0x000b:
5262 case 0x000c:
5263 case 0x0085:
5264 #ifdef COMPILE_PCRE16
5265 case 0x2028:
5266 case 0x2029:
5267 #endif
5268 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5269 break;
5270 }
5271 break;
5272
5273 case OP_NOT_HSPACE:
5274 switch(c)
5275 {
5276 default: break;
5277 case 0x09: /* HT */
5278 case 0x20: /* SPACE */
5279 case 0xa0: /* NBSP */
5280 #ifdef COMPILE_PCRE16
5281 case 0x1680: /* OGHAM SPACE MARK */
5282 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5283 case 0x2000: /* EN QUAD */
5284 case 0x2001: /* EM QUAD */
5285 case 0x2002: /* EN SPACE */
5286 case 0x2003: /* EM SPACE */
5287 case 0x2004: /* THREE-PER-EM SPACE */
5288 case 0x2005: /* FOUR-PER-EM SPACE */
5289 case 0x2006: /* SIX-PER-EM SPACE */
5290 case 0x2007: /* FIGURE SPACE */
5291 case 0x2008: /* PUNCTUATION SPACE */
5292 case 0x2009: /* THIN SPACE */
5293 case 0x200A: /* HAIR SPACE */
5294 case 0x202f: /* NARROW NO-BREAK SPACE */
5295 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5296 case 0x3000: /* IDEOGRAPHIC SPACE */
5297 #endif
5298 RRETURN(MATCH_NOMATCH);
5299 }
5300 break;
5301
5302 case OP_HSPACE:
5303 switch(c)
5304 {
5305 default: RRETURN(MATCH_NOMATCH);
5306 case 0x09: /* HT */
5307 case 0x20: /* SPACE */
5308 case 0xa0: /* NBSP */
5309 #ifdef COMPILE_PCRE16
5310 case 0x1680: /* OGHAM SPACE MARK */
5311 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5312 case 0x2000: /* EN QUAD */
5313 case 0x2001: /* EM QUAD */
5314 case 0x2002: /* EN SPACE */
5315 case 0x2003: /* EM SPACE */
5316 case 0x2004: /* THREE-PER-EM SPACE */
5317 case 0x2005: /* FOUR-PER-EM SPACE */
5318 case 0x2006: /* SIX-PER-EM SPACE */
5319 case 0x2007: /* FIGURE SPACE */
5320 case 0x2008: /* PUNCTUATION SPACE */
5321 case 0x2009: /* THIN SPACE */
5322 case 0x200A: /* HAIR SPACE */
5323 case 0x202f: /* NARROW NO-BREAK SPACE */
5324 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5325 case 0x3000: /* IDEOGRAPHIC SPACE */
5326 #endif
5327 break;
5328 }
5329 break;
5330
5331 case OP_NOT_VSPACE:
5332 switch(c)
5333 {
5334 default: break;
5335 case 0x0a: /* LF */
5336 case 0x0b: /* VT */
5337 case 0x0c: /* FF */
5338 case 0x0d: /* CR */
5339 case 0x85: /* NEL */
5340 #ifdef COMPILE_PCRE16
5341 case 0x2028: /* LINE SEPARATOR */
5342 case 0x2029: /* PARAGRAPH SEPARATOR */
5343 #endif
5344 RRETURN(MATCH_NOMATCH);
5345 }
5346 break;
5347
5348 case OP_VSPACE:
5349 switch(c)
5350 {
5351 default: RRETURN(MATCH_NOMATCH);
5352 case 0x0a: /* LF */
5353 case 0x0b: /* VT */
5354 case 0x0c: /* FF */
5355 case 0x0d: /* CR */
5356 case 0x85: /* NEL */
5357 #ifdef COMPILE_PCRE16
5358 case 0x2028: /* LINE SEPARATOR */
5359 case 0x2029: /* PARAGRAPH SEPARATOR */
5360 #endif
5361 break;
5362 }
5363 break;
5364
5365 case OP_NOT_DIGIT:
5366 if (MAX_255(c) && (md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
5367 break;
5368
5369 case OP_DIGIT:
5370 if (!MAX_255(c) || (md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
5371 break;
5372
5373 case OP_NOT_WHITESPACE:
5374 if (MAX_255(c) && (md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
5375 break;
5376
5377 case OP_WHITESPACE:
5378 if (!MAX_255(c) || (md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
5379 break;
5380
5381 case OP_NOT_WORDCHAR:
5382 if (MAX_255(c) && (md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
5383 break;
5384
5385 case OP_WORDCHAR:
5386 if (!MAX_255(c) || (md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
5387 break;
5388
5389 default:
5390 RRETURN(PCRE_ERROR_INTERNAL);
5391 }
5392 }
5393 }
5394 /* Control never gets here */
5395 }
5396
5397 /* If maximizing, it is worth using inline code for speed, doing the type
5398 test once at the start (i.e. keep it out of the loop). Again, keep the
5399 UTF-8 and UCP stuff separate. */
5400
5401 else
5402 {
5403 pp = eptr; /* Remember where we started */
5404
5405 #ifdef SUPPORT_UCP
5406 if (prop_type >= 0)
5407 {
5408 switch(prop_type)
5409 {
5410 case PT_ANY:
5411 for (i = min; i < max; i++)
5412 {
5413 int len = 1;
5414 if (eptr >= md->end_subject)
5415 {
5416 SCHECK_PARTIAL();
5417 break;
5418 }
5419 GETCHARLENTEST(c, eptr, len);
5420 if (prop_fail_result) break;
5421 eptr+= len;
5422 }
5423 break;
5424
5425 case PT_LAMP:
5426 for (i = min; i < max; i++)
5427 {
5428 int chartype;
5429 int len = 1;
5430 if (eptr >= md->end_subject)
5431 {
5432 SCHECK_PARTIAL();
5433 break;
5434 }
5435 GETCHARLENTEST(c, eptr, len);
5436 chartype = UCD_CHARTYPE(c);
5437 if ((chartype == ucp_Lu ||
5438 chartype == ucp_Ll ||
5439 chartype == ucp_Lt) == prop_fail_result)
5440 break;
5441 eptr+= len;
5442 }
5443 break;
5444
5445 case PT_GC:
5446 for (i = min; i < max; i++)
5447 {
5448 int len = 1;
5449 if (eptr >= md->end_subject)
5450 {
5451 SCHECK_PARTIAL();
5452 break;
5453 }
5454 GETCHARLENTEST(c, eptr, len);
5455 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) break;
5456 eptr+= len;
5457 }
5458 break;
5459
5460 case PT_PC:
5461 for (i = min; i < max; i++)
5462 {
5463 int len = 1;
5464 if (eptr >= md->end_subject)
5465 {
5466 SCHECK_PARTIAL();
5467 break;
5468 }
5469 GETCHARLENTEST(c, eptr, len);
5470 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) break;
5471 eptr+= len;
5472 }
5473 break;
5474
5475 case PT_SC:
5476 for (i = min; i < max; i++)
5477 {
5478 int len = 1;
5479 if (eptr >= md->end_subject)
5480 {
5481 SCHECK_PARTIAL();
5482 break;
5483 }
5484 GETCHARLENTEST(c, eptr, len);
5485 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) break;
5486 eptr+= len;
5487 }
5488 break;
5489
5490 case PT_ALNUM:
5491 for (i = min; i < max; i++)
5492 {
5493 int category;
5494 int len = 1;
5495 if (eptr >= md->end_subject)
5496 {
5497 SCHECK_PARTIAL();
5498 break;
5499 }
5500 GETCHARLENTEST(c, eptr, len);
5501 category = UCD_CATEGORY(c);
5502 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
5503 break;
5504 eptr+= len;
5505 }
5506 break;
5507
5508 case PT_SPACE: /* Perl space */
5509 for (i = min; i < max; i++)
5510 {
5511 int len = 1;
5512 if (eptr >= md->end_subject)
5513 {
5514 SCHECK_PARTIAL();
5515 break;
5516 }
5517 GETCHARLENTEST(c, eptr, len);
5518 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5519 c == CHAR_FF || c == CHAR_CR)
5520 == prop_fail_result)
5521 break;
5522 eptr+= len;
5523 }
5524 break;
5525
5526 case PT_PXSPACE: /* POSIX space */
5527 for (i = min; i < max; i++)
5528 {
5529 int len = 1;
5530 if (eptr >= md->end_subject)
5531 {
5532 SCHECK_PARTIAL();
5533 break;
5534 }
5535 GETCHARLENTEST(c, eptr, len);
5536 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5537 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
5538 == prop_fail_result)
5539 break;
5540 eptr+= len;
5541 }
5542 break;
5543
5544 case PT_WORD:
5545 for (i = min; i < max; i++)
5546 {
5547 int category;
5548 int len = 1;
5549 if (eptr >= md->end_subject)
5550 {
5551 SCHECK_PARTIAL();
5552 break;
5553 }
5554 GETCHARLENTEST(c, eptr, len);
5555 category = UCD_CATEGORY(c);
5556 if ((category == ucp_L || category == ucp_N ||
5557 c == CHAR_UNDERSCORE) == prop_fail_result)
5558 break;
5559 eptr+= len;
5560 }
5561 break;
5562
5563 default:
5564 RRETURN(PCRE_ERROR_INTERNAL);
5565 }
5566
5567 /* eptr is now past the end of the maximum run */
5568
5569 if (possessive) continue;
5570 for(;;)
5571 {
5572 RMATCH(eptr, ecode, offset_top, md, eptrb, RM44);
5573 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5574 if (eptr-- == pp) break; /* Stop if tried at original pos */
5575 if (utf) BACKCHAR(eptr);
5576 }
5577 }
5578
5579 /* Match extended Unicode sequences. We will get here only if the
5580 support is in the binary; otherwise a compile-time error occurs. */
5581
5582 else if (ctype == OP_EXTUNI)
5583 {
5584 for (i = min; i < max; i++)
5585 {
5586 int len = 1;
5587 if (eptr >= md->end_subject)
5588 {
5589 SCHECK_PARTIAL();
5590 break;
5591 }
5592 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5593 if (UCD_CATEGORY(c) == ucp_M) break;
5594 eptr += len;
5595 while (eptr < md->end_subject)
5596 {
5597 len = 1;
5598 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5599 if (UCD_CATEGORY(c) != ucp_M) break;
5600 eptr += len;
5601 }
5602 CHECK_PARTIAL();
5603 }
5604
5605 /* eptr is now past the end of the maximum run */
5606
5607 if (possessive) continue;
5608
5609 for(;;)
5610 {
5611 RMATCH(eptr, ecode, offset_top, md, eptrb, RM45);
5612 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5613 if (eptr-- == pp) break; /* Stop if tried at original pos */
5614 for (;;) /* Move back over one extended */
5615 {
5616 if (!utf) c = *eptr; else
5617 {
5618 BACKCHAR(eptr);
5619 GETCHAR(c, eptr);
5620 }
5621 if (UCD_CATEGORY(c) != ucp_M) break;
5622 eptr--;
5623 }
5624 }
5625 }
5626
5627 else
5628 #endif /* SUPPORT_UCP */
5629
5630 #ifdef SUPPORT_UTF
5631 if (utf)
5632 {
5633 switch(ctype)
5634 {
5635 case OP_ANY:
5636 if (max < INT_MAX)
5637 {
5638 for (i = min; i < max; i++)
5639 {
5640 if (eptr >= md->end_subject)
5641 {
5642 SCHECK_PARTIAL();
5643 break;
5644 }
5645 if (IS_NEWLINE(eptr)) break;
5646 if (md->partial != 0 && /* Take care with CRLF partial */
5647 eptr + 1 >= md->end_subject &&
5648 NLBLOCK->nltype == NLTYPE_FIXED &&
5649 NLBLOCK->nllen == 2 &&
5650 *eptr == NLBLOCK->nl[0])
5651 {
5652 md->hitend = TRUE;
5653 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5654 }
5655 eptr++;
5656 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5657 }
5658 }
5659
5660 /* Handle unlimited UTF-8 repeat */
5661
5662 else
5663 {
5664 for (i = min; i < max; i++)
5665 {
5666 if (eptr >= md->end_subject)
5667 {
5668 SCHECK_PARTIAL();
5669 break;
5670 }
5671 if (IS_NEWLINE(eptr)) break;
5672 if (md->partial != 0 && /* Take care with CRLF partial */
5673 eptr + 1 >= md->end_subject &&
5674 NLBLOCK->nltype == NLTYPE_FIXED &&
5675 NLBLOCK->nllen == 2 &&
5676 *eptr == NLBLOCK->nl[0])
5677 {
5678 md->hitend = TRUE;
5679 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5680 }
5681 eptr++;
5682 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5683 }
5684 }
5685 break;
5686
5687 case OP_ALLANY:
5688 if (max < INT_MAX)
5689 {
5690 for (i = min; i < max; i++)
5691 {
5692 if (eptr >= md->end_subject)
5693 {
5694 SCHECK_PARTIAL();
5695 break;
5696 }
5697 eptr++;
5698 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5699 }
5700 }
5701 else
5702 {
5703 eptr = md->end_subject; /* Unlimited UTF-8 repeat */
5704 SCHECK_PARTIAL();
5705 }
5706 break;
5707
5708 /* The byte case is the same as non-UTF8 */
5709
5710 case OP_ANYBYTE:
5711 c = max - min;
5712 if (c > (unsigned int)(md->end_subject - eptr))
5713 {
5714 eptr = md->end_subject;
5715 SCHECK_PARTIAL();
5716 }
5717 else eptr += c;
5718 break;
5719
5720 case OP_ANYNL:
5721 for (i = min; i < max; i++)
5722 {
5723 int len = 1;
5724 if (eptr >= md->end_subject)
5725 {
5726 SCHECK_PARTIAL();
5727 break;
5728 }
5729 GETCHARLEN(c, eptr, len);
5730 if (c == 0x000d)
5731 {
5732 if (++eptr >= md->end_subject) break;
5733 if (*eptr == 0x000a) eptr++;
5734 }
5735 else
5736 {
5737 if (c != 0x000a &&
5738 (md->bsr_anycrlf ||
5739 (c != 0x000b && c != 0x000c &&
5740 c != 0x0085 && c != 0x2028 && c != 0x2029)))
5741 break;
5742 eptr += len;
5743 }
5744 }
5745 break;
5746
5747 case OP_NOT_HSPACE:
5748 case OP_HSPACE:
5749 for (i = min; i < max; i++)
5750 {
5751 BOOL gotspace;
5752 int len = 1;
5753 if (eptr >= md->end_subject)
5754 {
5755 SCHECK_PARTIAL();
5756 break;
5757 }
5758 GETCHARLEN(c, eptr, len);
5759 switch(c)
5760 {
5761 default: gotspace = FALSE; break;
5762 case 0x09: /* HT */
5763 case 0x20: /* SPACE */
5764 case 0xa0: /* NBSP */
5765 case 0x1680: /* OGHAM SPACE MARK */
5766 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5767 case 0x2000: /* EN QUAD */
5768 case 0x2001: /* EM QUAD */
5769 case 0x2002: /* EN SPACE */
5770 case 0x2003: /* EM SPACE */
5771 case 0x2004: /* THREE-PER-EM SPACE */
5772 case 0x2005: /* FOUR-PER-EM SPACE */
5773 case 0x2006: /* SIX-PER-EM SPACE */
5774 case 0x2007: /* FIGURE SPACE */
5775 case 0x2008: /* PUNCTUATION SPACE */
5776 case 0x2009: /* THIN SPACE */
5777 case 0x200A: /* HAIR SPACE */
5778 case 0x202f: /* NARROW NO-BREAK SPACE */
5779 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5780 case 0x3000: /* IDEOGRAPHIC SPACE */
5781 gotspace = TRUE;
5782 break;
5783 }
5784 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5785 eptr += len;
5786 }
5787 break;
5788
5789 case OP_NOT_VSPACE:
5790 case OP_VSPACE:
5791 for (i = min; i < max; i++)
5792 {
5793 BOOL gotspace;
5794 int len = 1;
5795 if (eptr >= md->end_subject)
5796 {
5797 SCHECK_PARTIAL();
5798 break;
5799 }
5800 GETCHARLEN(c, eptr, len);
5801 switch(c)
5802 {
5803 default: gotspace = FALSE; break;
5804 case 0x0a: /* LF */
5805 case 0x0b: /* VT */
5806 case 0x0c: /* FF */
5807 case 0x0d: /* CR */
5808 case 0x85: /* NEL */
5809 case 0x2028: /* LINE SEPARATOR */
5810 case 0x2029: /* PARAGRAPH SEPARATOR */
5811 gotspace = TRUE;
5812 break;
5813 }
5814 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5815 eptr += len;
5816 }
5817 break;
5818
5819 case OP_NOT_DIGIT:
5820 for (i = min; i < max; i++)
5821 {
5822 int len = 1;
5823 if (eptr >= md->end_subject)
5824 {
5825 SCHECK_PARTIAL();
5826 break;
5827 }
5828 GETCHARLEN(c, eptr, len);
5829 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5830 eptr+= len;
5831 }
5832 break;
5833
5834 case OP_DIGIT:
5835 for (i = min; i < max; i++)
5836 {
5837 int len = 1;
5838 if (eptr >= md->end_subject)
5839 {
5840 SCHECK_PARTIAL();
5841 break;
5842 }
5843 GETCHARLEN(c, eptr, len);
5844 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5845 eptr+= len;
5846 }
5847 break;
5848
5849 case OP_NOT_WHITESPACE:
5850 for (i = min; i < max; i++)
5851 {
5852 int len = 1;
5853 if (eptr >= md->end_subject)
5854 {
5855 SCHECK_PARTIAL();
5856 break;
5857 }
5858 GETCHARLEN(c, eptr, len);
5859 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5860 eptr+= len;
5861 }
5862 break;
5863
5864 case OP_WHITESPACE:
5865 for (i = min; i < max; i++)
5866 {
5867 int len = 1;
5868 if (eptr >= md->end_subject)
5869 {
5870 SCHECK_PARTIAL();
5871 break;
5872 }
5873 GETCHARLEN(c, eptr, len);
5874 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5875 eptr+= len;
5876 }
5877 break;
5878
5879 case OP_NOT_WORDCHAR:
5880 for (i = min; i < max; i++)
5881 {
5882 int len = 1;
5883 if (eptr >= md->end_subject)
5884 {
5885 SCHECK_PARTIAL();
5886 break;
5887 }
5888 GETCHARLEN(c, eptr, len);
5889 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5890 eptr+= len;
5891 }
5892 break;
5893
5894 case OP_WORDCHAR:
5895 for (i = min; i < max; i++)
5896 {
5897 int len = 1;
5898 if (eptr >= md->end_subject)
5899 {
5900 SCHECK_PARTIAL();
5901 break;
5902 }
5903 GETCHARLEN(c, eptr, len);
5904 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5905 eptr+= len;
5906 }
5907 break;
5908
5909 default:
5910 RRETURN(PCRE_ERROR_INTERNAL);
5911 }
5912
5913 /* eptr is now past the end of the maximum run. If possessive, we are
5914 done (no backing up). Otherwise, match at this position; anything other
5915 than no match is immediately returned. For nomatch, back up one
5916 character, unless we are matching \R and the last thing matched was
5917 \r\n, in which case, back up two bytes. */
5918
5919 if (possessive) continue;
5920 for(;;)
5921 {
5922 RMATCH(eptr, ecode, offset_top, md, eptrb, RM46);
5923 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5924 if (eptr-- == pp) break; /* Stop if tried at original pos */
5925 BACKCHAR(eptr);
5926 if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
5927 eptr[-1] == '\r') eptr--;
5928 }
5929 }
5930 else
5931 #endif /* SUPPORT_UTF */
5932 /* Not UTF mode */
5933 {
5934 switch(ctype)
5935 {
5936 case OP_ANY:
5937 for (i = min; i < max; i++)
5938 {
5939 if (eptr >= md->end_subject)
5940 {
5941 SCHECK_PARTIAL();
5942 break;
5943 }
5944 if (IS_NEWLINE(eptr)) break;
5945 if (md->partial != 0 && /* Take care with CRLF partial */
5946 eptr + 1 >= md->end_subject &&
5947 NLBLOCK->nltype == NLTYPE_FIXED &&
5948 NLBLOCK->nllen == 2 &&
5949 *eptr == NLBLOCK->nl[0])
5950 {
5951 md->hitend = TRUE;
5952 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5953 }
5954 eptr++;
5955 }
5956 break;
5957
5958 case OP_ALLANY:
5959 case OP_ANYBYTE:
5960 c = max - min;
5961 if (c > (unsigned int)(md->end_subject - eptr))
5962 {
5963 eptr = md->end_subject;
5964 SCHECK_PARTIAL();
5965 }
5966 else eptr += c;
5967 break;
5968
5969 case OP_ANYNL:
5970 for (i = min; i < max; i++)
5971 {
5972 if (eptr >= md->end_subject)
5973 {
5974 SCHECK_PARTIAL();
5975 break;
5976 }
5977 c = *eptr;
5978 if (c == 0x000d)
5979 {
5980 if (++eptr >= md->end_subject) break;
5981 if (*eptr == 0x000a) eptr++;
5982 }
5983 else
5984 {
5985 if (c != 0x000a && (md->bsr_anycrlf ||
5986 (c != 0x000b && c != 0x000c && c != 0x0085
5987 #ifdef COMPILE_PCRE16
5988 && c != 0x2028 && c != 0x2029
5989 #endif
5990 ))) break;
5991 eptr++;
5992 }
5993 }
5994 break;
5995
5996 case OP_NOT_HSPACE:
5997 for (i = min; i < max; i++)
5998 {
5999 if (eptr >= md->end_subject)
6000 {
6001 SCHECK_PARTIAL();
6002 break;
6003 }
6004 c = *eptr;
6005 if (c == 0x09 || c == 0x20 || c == 0xa0
6006 #ifdef COMPILE_PCRE16
6007 || c == 0x1680 || c == 0x180e || (c >= 0x2000 && c <= 0x200A)
6008 || c == 0x202f || c == 0x205f || c == 0x3000
6009 #endif
6010 ) break;
6011 eptr++;
6012 }
6013 break;
6014
6015 case OP_HSPACE:
6016 for (i = min; i < max; i++)
6017 {
6018 if (eptr >= md->end_subject)
6019 {
6020 SCHECK_PARTIAL();
6021 break;
6022 }
6023 c = *eptr;
6024 if (c != 0x09 && c != 0x20 && c != 0xa0
6025 #ifdef COMPILE_PCRE16
6026 && c != 0x1680 && c != 0x180e && (c < 0x2000 || c > 0x200A)
6027 && c != 0x202f && c != 0x205f && c != 0x3000
6028 #endif
6029 ) break;
6030 eptr++;
6031 }
6032 break;
6033
6034 case OP_NOT_VSPACE:
6035 for (i = min; i < max; i++)
6036 {
6037 if (eptr >= md->end_subject)
6038 {
6039 SCHECK_PARTIAL();
6040 break;
6041 }
6042 c = *eptr;
6043 if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85
6044 #ifdef COMPILE_PCRE16
6045 || c == 0x2028 || c == 0x2029
6046 #endif
6047 ) break;
6048 eptr++;
6049 }
6050 break;
6051
6052 case OP_VSPACE:
6053 for (i = min; i < max; i++)
6054 {
6055 if (eptr >= md->end_subject)
6056 {
6057 SCHECK_PARTIAL();
6058 break;
6059 }
6060 c = *eptr;
6061 if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85
6062 #ifdef COMPILE_PCRE16
6063 && c != 0x2028 && c != 0x2029
6064 #endif
6065 ) break;
6066 eptr++;
6067 }
6068 break;
6069
6070 case OP_NOT_DIGIT:
6071 for (i = min; i < max; i++)
6072 {
6073 if (eptr >= md->end_subject)
6074 {
6075 SCHECK_PARTIAL();
6076 break;
6077 }
6078 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0) break;
6079 eptr++;
6080 }
6081 break;
6082
6083 case OP_DIGIT:
6084 for (i = min; i < max; i++)
6085 {
6086 if (eptr >= md->end_subject)
6087 {
6088 SCHECK_PARTIAL();
6089 break;
6090 }
6091 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0) break;
6092 eptr++;
6093 }
6094 break;
6095
6096 case OP_NOT_WHITESPACE:
6097 for (i = min; i < max; i++)
6098 {
6099 if (eptr >= md->end_subject)
6100 {
6101 SCHECK_PARTIAL();
6102 break;
6103 }
6104 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0) break;
6105 eptr++;
6106 }
6107 break;
6108
6109 case OP_WHITESPACE:
6110 for (i = min; i < max; i++)
6111 {
6112 if (eptr >= md->end_subject)
6113 {
6114 SCHECK_PARTIAL();
6115 break;
6116 }
6117 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0) break;
6118 eptr++;
6119 }
6120 break;
6121
6122 case OP_NOT_WORDCHAR:
6123 for (i = min; i < max; i++)
6124 {
6125 if (eptr >= md->end_subject)
6126 {
6127 SCHECK_PARTIAL();
6128 break;
6129 }
6130 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0) break;
6131 eptr++;
6132 }
6133 break;
6134
6135 case OP_WORDCHAR:
6136 for (i = min; i < max; i++)
6137 {
6138 if (eptr >= md->end_subject)
6139 {
6140 SCHECK_PARTIAL();
6141 break;
6142 }
6143 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0) break;
6144 eptr++;
6145 }
6146 break;
6147
6148 default:
6149 RRETURN(PCRE_ERROR_INTERNAL);
6150 }
6151
6152 /* eptr is now past the end of the maximum run. If possessive, we are
6153 done (no backing up). Otherwise, match at this position; anything other
6154 than no match is immediately returned. For nomatch, back up one
6155 character (byte), unless we are matching \R and the last thing matched
6156 was \r\n, in which case, back up two bytes. */
6157
6158 if (possessive) continue;
6159 while (eptr >= pp)
6160 {
6161 RMATCH(eptr, ecode, offset_top, md, eptrb, RM47);
6162 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6163 eptr--;
6164 if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
6165 eptr[-1] == '\r') eptr--;
6166 }
6167 }
6168
6169 /* Get here if we can't make it match with any permitted repetitions */
6170
6171 RRETURN(MATCH_NOMATCH);
6172 }
6173 /* Control never gets here */
6174
6175 /* There's been some horrible disaster. Arrival here can only mean there is
6176 something seriously wrong in the code above or the OP_xxx definitions. */
6177
6178 default:
6179 DPRINTF(("Unknown opcode %d\n", *ecode));
6180 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
6181 }
6182
6183 /* Do not stick any code in here without much thought; it is assumed
6184 that "continue" in the code above comes out to here to repeat the main
6185 loop. */
6186
6187 } /* End of main loop */
6188 /* Control never reaches here */
6189
6190
6191 /* When compiling to use the heap rather than the stack for recursive calls to
6192 match(), the RRETURN() macro jumps here. The number that is saved in
6193 frame->Xwhere indicates which label we actually want to return to. */
6194
6195 #ifdef NO_RECURSE
6196 #define LBL(val) case val: goto L_RM##val;
6197 HEAP_RETURN:
6198 switch (frame->Xwhere)
6199 {
6200 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
6201 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
6202 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
6203 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
6204 LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64)
6205 LBL(65) LBL(66)
6206 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
6207 LBL(21)
6208 #endif
6209 #ifdef SUPPORT_UTF
6210 LBL(16) LBL(18) LBL(20)
6211 LBL(22) LBL(23) LBL(28) LBL(30)
6212 LBL(32) LBL(34) LBL(42) LBL(46)
6213 #ifdef SUPPORT_UCP
6214 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
6215 LBL(59) LBL(60) LBL(61) LBL(62)
6216 #endif /* SUPPORT_UCP */
6217 #endif /* SUPPORT_UTF */
6218 default:
6219 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
6220
6221 printf("+++jump error in pcre match: label %d non-existent\n", frame->Xwhere);
6222
6223 return PCRE_ERROR_INTERNAL;
6224 }
6225 #undef LBL
6226 #endif /* NO_RECURSE */
6227 }
6228
6229
6230 /***************************************************************************
6231 ****************************************************************************
6232 RECURSION IN THE match() FUNCTION
6233
6234 Undefine all the macros that were defined above to handle this. */
6235
#ifdef NO_RECURSE

/* With NO_RECURSE set, every name listed here was turned into a macro earlier
in this file for the benefit of match(). (Presumably each one redirects to
storage in the current heap frame - see the definitions to confirm.) They are
all removed again at this point so that the code which follows can use the
same spellings as ordinary identifiers; the exec function below, for example,
has its own "length" argument and "flags" local.

The directives are independent of one another, so they are simply kept in
alphabetical order to make it easy to check whether a given name is present. */

#undef callpat
#undef charptr
#undef condition
#undef ctype
#undef cur_is_word
#undef data
#undef ecode
#undef eptr
#undef eptrb
#undef flags
#undef length
#undef max
#undef min
#undef mstart
#undef new_recursive
#undef newptrb
#undef next
#undef number
#undef offset
#undef offset_top
#undef op
#undef pp
#undef prev
#undef prev_is_word
#undef save_capture_last
#undef save_offset1
#undef save_offset2
#undef save_offset3
#undef saved_eptr
#undef stacksave

#endif  /* NO_RECURSE */

/* The remaining two names are macros whether or not NO_RECURSE is defined, so
they are removed unconditionally. */

#undef fc
#undef fi
6279
6280 /***************************************************************************
6281 ***************************************************************************/
6282
6283
6284 #ifdef NO_RECURSE
6285 /*************************************************
6286 * Release allocated heap frames *
6287 *************************************************/
6288
6289 /* This function releases all the allocated frames. The base frame is on the
6290 machine stack, and so must not be freed.
6291
6292 Argument: the address of the base frame
6293 Returns: nothing
6294 */
6295
6296 static void
release_match_heapframes(heapframe * frame_base)6297 release_match_heapframes (heapframe *frame_base)
6298 {
6299 heapframe *nextframe = frame_base->Xnextframe;
6300 while (nextframe != NULL)
6301 {
6302 heapframe *oldframe = nextframe;
6303 nextframe = nextframe->Xnextframe;
6304 (PUBL(stack_free))(oldframe);
6305 }
6306 }
6307 #endif
6308
6309
6310 /*************************************************
6311 * Execute a Regular Expression *
6312 *************************************************/
6313
6314 /* This function applies a compiled re to a subject string and picks out
6315 portions of the string if it matches. Two elements in the vector are set for
6316 each substring: the offsets to the start and end of the substring.
6317
6318 Arguments:
6319 argument_re points to the compiled expression
6320 extra_data points to extra data or is NULL
6321 subject points to the subject string
6322 length length of subject string (may contain binary zeros)
6323 start_offset where to start in the subject string
6324 options option bits
6325 offsets points to a vector of ints to be filled in with offsets
6326 offsetcount the number of elements in the vector
6327
6328 Returns: > 0 => success; value is the number of elements filled in
6329 = 0 => success, but offsets is not big enough
6330 -1 => failed to match
6331 < -1 => some kind of unexpected problem
6332 */
6333
6334 #ifdef COMPILE_PCRE8
6335 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
pcre_exec(const pcre * argument_re,const pcre_extra * extra_data,PCRE_SPTR subject,int length,int start_offset,int options,int * offsets,int offsetcount)6336 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
6337 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
6338 int offsetcount)
6339 #else
6340 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6341 pcre16_exec(const pcre16 *argument_re, const pcre16_extra *extra_data,
6342 PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
6343 int offsetcount)
6344 #endif
6345 {
6346 int rc, ocount, arg_offset_max;
6347 int newline;
6348 BOOL using_temporary_offsets = FALSE;
6349 BOOL anchored;
6350 BOOL startline;
6351 BOOL firstline;
6352 BOOL utf;
6353 BOOL has_first_char = FALSE;
6354 BOOL has_req_char = FALSE;
6355 pcre_uchar first_char = 0;
6356 pcre_uchar first_char2 = 0;
6357 pcre_uchar req_char = 0;
6358 pcre_uchar req_char2 = 0;
6359 match_data match_block;
6360 match_data *md = &match_block;
6361 const pcre_uint8 *tables;
6362 const pcre_uint8 *start_bits = NULL;
6363 PCRE_PUCHAR start_match = (PCRE_PUCHAR)subject + start_offset;
6364 PCRE_PUCHAR end_subject;
6365 PCRE_PUCHAR start_partial = NULL;
6366 PCRE_PUCHAR req_char_ptr = start_match - 1;
6367
6368 const pcre_study_data *study;
6369 const REAL_PCRE *re = (const REAL_PCRE *)argument_re;
6370
6371 #ifdef NO_RECURSE
6372 heapframe frame_zero;
6373 frame_zero.Xprevframe = NULL; /* Marks the top level */
6374 frame_zero.Xnextframe = NULL; /* None are allocated yet */
6375 md->match_frames_base = &frame_zero;
6376 #endif
6377
6378 /* Check for the special magic call that measures the size of the stack used
6379 per recursive call of match(). Without the funny casting for sizeof, a Windows
6380 compiler gave this error: "unary minus operator applied to unsigned type,
6381 result still unsigned". Hopefully the cast fixes that. */
6382
6383 if (re == NULL && extra_data == NULL && subject == NULL && length == -999 &&
6384 start_offset == -999)
6385 #ifdef NO_RECURSE
6386 return -((int)sizeof(heapframe));
6387 #else
6388 return match(NULL, NULL, NULL, 0, NULL, NULL, 0);
6389 #endif
6390
6391 /* Plausibility checks */
6392
6393 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
6394 if (re == NULL || subject == NULL || (offsets == NULL && offsetcount > 0))
6395 return PCRE_ERROR_NULL;
6396 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
6397 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
6398
6399 /* Check that the first field in the block is the magic number. If it is not,
6400 return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
6401 REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
6402 means that the pattern is likely compiled with different endianness. */
6403
6404 if (re->magic_number != MAGIC_NUMBER)
6405 return re->magic_number == REVERSED_MAGIC_NUMBER?
6406 PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
6407 if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
6408
6409 /* These two settings are used in the code for checking a UTF-8 string that
6410 follows immediately afterwards. Other values in the md block are used only
6411 during "normal" pcre_exec() processing, not when the JIT support is in use,
6412 so they are set up later. */
6413
6414 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
6415 utf = md->utf = (re->options & PCRE_UTF8) != 0;
6416 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
6417 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
6418
6419 /* Check a UTF-8 string if required. Pass back the character offset and error
6420 code for an invalid string if a results vector is available. */
6421
6422 #ifdef SUPPORT_UTF
6423 if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
6424 {
6425 int erroroffset;
6426 int errorcode = PRIV(valid_utf)((PCRE_PUCHAR)subject, length, &erroroffset);
6427 if (errorcode != 0)
6428 {
6429 if (offsetcount >= 2)
6430 {
6431 offsets[0] = erroroffset;
6432 offsets[1] = errorcode;
6433 }
6434 #ifdef COMPILE_PCRE16
6435 return (errorcode <= PCRE_UTF16_ERR1 && md->partial > 1)?
6436 PCRE_ERROR_SHORTUTF16 : PCRE_ERROR_BADUTF16;
6437 #else
6438 return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?
6439 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
6440 #endif
6441 }
6442
6443 /* Check that a start_offset points to the start of a UTF character. */
6444 if (start_offset > 0 && start_offset < length &&
6445 NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
6446 return PCRE_ERROR_BADUTF8_OFFSET;
6447 }
6448 #endif
6449
6450 /* If the pattern was successfully studied with JIT support, run the JIT
6451 executable instead of the rest of this function. Most options must be set at
compile time for the JIT code to be usable. Fall back to the normal code path if
an unsupported flag is set. */
6454
6455 #ifdef SUPPORT_JIT
6456 if (extra_data != NULL
6457 && (extra_data->flags & (PCRE_EXTRA_EXECUTABLE_JIT |
6458 PCRE_EXTRA_TABLES)) == PCRE_EXTRA_EXECUTABLE_JIT
6459 && extra_data->executable_jit != NULL
6460 && (options & ~(PCRE_NO_UTF8_CHECK | PCRE_NOTBOL | PCRE_NOTEOL |
6461 PCRE_NOTEMPTY | PCRE_NOTEMPTY_ATSTART |
6462 PCRE_PARTIAL_SOFT | PCRE_PARTIAL_HARD)) == 0)
6463 {
6464 rc = PRIV(jit_exec)(re, extra_data, (const pcre_uchar *)subject, length,
6465 start_offset, options, offsets, offsetcount);
6466
6467 /* PCRE_ERROR_NULL means that the selected normal or partial matching
6468 mode is not compiled. In this case we simply fallback to interpreter. */
6469
6470 if (rc != PCRE_ERROR_NULL) return rc;
6471 }
6472 #endif
6473
6474 /* Carry on with non-JIT matching. This information is for finding all the
6475 numbers associated with a given name, for condition testing. */
6476
6477 md->name_table = (pcre_uchar *)re + re->name_table_offset;
6478 md->name_count = re->name_count;
6479 md->name_entry_size = re->name_entry_size;
6480
6481 /* Fish out the optional data from the extra_data structure, first setting
6482 the default values. */
6483
6484 study = NULL;
6485 md->match_limit = MATCH_LIMIT;
6486 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
6487 md->callout_data = NULL;
6488
6489 /* The table pointer is always in native byte order. */
6490
6491 tables = re->tables;
6492
6493 if (extra_data != NULL)
6494 {
6495 unsigned int flags = extra_data->flags;
6496 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
6497 study = (const pcre_study_data *)extra_data->study_data;
6498 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
6499 md->match_limit = extra_data->match_limit;
6500 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
6501 md->match_limit_recursion = extra_data->match_limit_recursion;
6502 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
6503 md->callout_data = extra_data->callout_data;
6504 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
6505 }
6506
6507 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
6508 is a feature that makes it possible to save compiled regex and re-use them
6509 in other programs later. */
6510
6511 if (tables == NULL) tables = PRIV(default_tables);
6512
6513 /* Set up other data */
6514
6515 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
6516 startline = (re->flags & PCRE_STARTLINE) != 0;
6517 firstline = (re->options & PCRE_FIRSTLINE) != 0;
6518
6519 /* The code starts after the real_pcre block and the capture name table. */
6520
6521 md->start_code = (const pcre_uchar *)re + re->name_table_offset +
6522 re->name_count * re->name_entry_size;
6523
6524 md->start_subject = (PCRE_PUCHAR)subject;
6525 md->start_offset = start_offset;
6526 md->end_subject = md->start_subject + length;
6527 end_subject = md->end_subject;
6528
6529 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
6530 md->use_ucp = (re->options & PCRE_UCP) != 0;
6531 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
6532 md->ignore_skip_arg = FALSE;
6533
6534 /* Some options are unpacked into BOOL variables in the hope that testing
6535 them will be faster than individual option bits. */
6536
6537 md->notbol = (options & PCRE_NOTBOL) != 0;
6538 md->noteol = (options & PCRE_NOTEOL) != 0;
6539 md->notempty = (options & PCRE_NOTEMPTY) != 0;
6540 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
6541
6542 md->hitend = FALSE;
6543 md->mark = md->nomatch_mark = NULL; /* In case never set */
6544
6545 md->recursive = NULL; /* No recursion at top level */
6546 md->hasthen = (re->flags & PCRE_HASTHEN) != 0;
6547
6548 md->lcc = tables + lcc_offset;
6549 md->fcc = tables + fcc_offset;
6550 md->ctypes = tables + ctypes_offset;
6551
6552 /* Handle different \R options. */
6553
6554 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
6555 {
6556 case 0:
6557 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
6558 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
6559 else
6560 #ifdef BSR_ANYCRLF
6561 md->bsr_anycrlf = TRUE;
6562 #else
6563 md->bsr_anycrlf = FALSE;
6564 #endif
6565 break;
6566
6567 case PCRE_BSR_ANYCRLF:
6568 md->bsr_anycrlf = TRUE;
6569 break;
6570
6571 case PCRE_BSR_UNICODE:
6572 md->bsr_anycrlf = FALSE;
6573 break;
6574
6575 default: return PCRE_ERROR_BADNEWLINE;
6576 }
6577
6578 /* Handle different types of newline. The three bits give eight cases. If
6579 nothing is set at run time, whatever was used at compile time applies. */
6580
6581 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
6582 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
6583 {
6584 case 0: newline = NEWLINE; break; /* Compile-time default */
6585 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
6586 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
6587 case PCRE_NEWLINE_CR+
6588 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
6589 case PCRE_NEWLINE_ANY: newline = -1; break;
6590 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6591 default: return PCRE_ERROR_BADNEWLINE;
6592 }
6593
6594 if (newline == -2)
6595 {
6596 md->nltype = NLTYPE_ANYCRLF;
6597 }
6598 else if (newline < 0)
6599 {
6600 md->nltype = NLTYPE_ANY;
6601 }
6602 else
6603 {
6604 md->nltype = NLTYPE_FIXED;
6605 if (newline > 255)
6606 {
6607 md->nllen = 2;
6608 md->nl[0] = (newline >> 8) & 255;
6609 md->nl[1] = newline & 255;
6610 }
6611 else
6612 {
6613 md->nllen = 1;
6614 md->nl[0] = newline;
6615 }
6616 }
6617
6618 /* Partial matching was originally supported only for a restricted set of
6619 regexes; from release 8.00 there are no restrictions, but the bits are still
6620 defined (though never set). So there's no harm in leaving this code. */
6621
6622 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
6623 return PCRE_ERROR_BADPARTIAL;
6624
6625 /* If the expression has got more back references than the offsets supplied can
6626 hold, we get a temporary chunk of working store to use during the matching.
6627 Otherwise, we can use the vector supplied, rounding down its size to a multiple
6628 of 3. */
6629
6630 ocount = offsetcount - (offsetcount % 3);
6631 arg_offset_max = (2*ocount)/3;
6632
6633 if (re->top_backref > 0 && re->top_backref >= ocount/3)
6634 {
6635 ocount = re->top_backref * 3 + 3;
6636 md->offset_vector = (int *)(PUBL(malloc))(ocount * sizeof(int));
6637 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
6638 using_temporary_offsets = TRUE;
6639 DPRINTF(("Got memory to hold back references\n"));
6640 }
6641 else md->offset_vector = offsets;
6642
6643 md->offset_end = ocount;
6644 md->offset_max = (2*ocount)/3;
6645 md->offset_overflow = FALSE;
6646 md->capture_last = -1;
6647
6648 /* Reset the working variable associated with each extraction. These should
6649 never be used unless previously set, but they get saved and restored, and so we
6650 initialize them to avoid reading uninitialized locations. Also, unset the
6651 offsets for the matched string. This is really just for tidiness with callouts,
6652 in case they inspect these fields. */
6653
6654 if (md->offset_vector != NULL)
6655 {
6656 int *iptr = md->offset_vector + ocount;
6657 int *iend = iptr - re->top_bracket;
6658 if (iend < md->offset_vector + 2) iend = md->offset_vector + 2;
6659 while (--iptr >= iend) *iptr = -1;
6660 md->offset_vector[0] = md->offset_vector[1] = -1;
6661 }
6662
6663 /* Set up the first character to match, if available. The first_char value is
6664 never set for an anchored regular expression, but the anchoring may be forced
6665 at run time, so we have to test for anchoring. The first char may be unset for
6666 an unanchored pattern, of course. If there's no first char and the pattern was
6667 studied, there may be a bitmap of possible first characters. */
6668
6669 if (!anchored)
6670 {
6671 if ((re->flags & PCRE_FIRSTSET) != 0)
6672 {
6673 has_first_char = TRUE;
6674 first_char = first_char2 = (pcre_uchar)(re->first_char);
6675 if ((re->flags & PCRE_FCH_CASELESS) != 0)
6676 {
6677 first_char2 = TABLE_GET(first_char, md->fcc, first_char);
6678 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6679 if (utf && first_char > 127)
6680 first_char2 = UCD_OTHERCASE(first_char);
6681 #endif
6682 }
6683 }
6684 else
6685 if (!startline && study != NULL &&
6686 (study->flags & PCRE_STUDY_MAPPED) != 0)
6687 start_bits = study->start_bits;
6688 }
6689
6690 /* For anchored or unanchored matches, there may be a "last known required
6691 character" set. */
6692
6693 if ((re->flags & PCRE_REQCHSET) != 0)
6694 {
6695 has_req_char = TRUE;
6696 req_char = req_char2 = (pcre_uchar)(re->req_char);
6697 if ((re->flags & PCRE_RCH_CASELESS) != 0)
6698 {
6699 req_char2 = TABLE_GET(req_char, md->fcc, req_char);
6700 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6701 if (utf && req_char > 127)
6702 req_char2 = UCD_OTHERCASE(req_char);
6703 #endif
6704 }
6705 }
6706
6707
6708 /* ==========================================================================*/
6709
/* Loop for handling unanchored repeated matching attempts; for anchored regexes
6711 the loop runs just once. */
6712
6713 for(;;)
6714 {
6715 PCRE_PUCHAR save_end_subject = end_subject;
6716 PCRE_PUCHAR new_start_match;
6717
6718 /* If firstline is TRUE, the start of the match is constrained to the first
6719 line of a multiline string. That is, the match must be before or at the first
6720 newline. Implement this by temporarily adjusting end_subject so that we stop
6721 scanning at a newline. If the match fails at the newline, later code breaks
6722 this loop. */
6723
6724 if (firstline)
6725 {
6726 PCRE_PUCHAR t = start_match;
6727 #ifdef SUPPORT_UTF
6728 if (utf)
6729 {
6730 while (t < md->end_subject && !IS_NEWLINE(t))
6731 {
6732 t++;
6733 ACROSSCHAR(t < end_subject, *t, t++);
6734 }
6735 }
6736 else
6737 #endif
6738 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
6739 end_subject = t;
6740 }
6741
6742 /* There are some optimizations that avoid running the match if a known
6743 starting point is not found, or if a known later character is not present.
6744 However, there is an option that disables these, for testing and for ensuring
6745 that all callouts do actually occur. The option can be set in the regex by
6746 (*NO_START_OPT) or passed in match-time options. */
6747
6748 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
6749 {
6750 /* Advance to a unique first char if there is one. */
6751
6752 if (has_first_char)
6753 {
6754 if (first_char != first_char2)
6755 while (start_match < end_subject &&
6756 *start_match != first_char && *start_match != first_char2)
6757 start_match++;
6758 else
6759 while (start_match < end_subject && *start_match != first_char)
6760 start_match++;
6761 }
6762
6763 /* Or to just after a linebreak for a multiline match */
6764
6765 else if (startline)
6766 {
6767 if (start_match > md->start_subject + start_offset)
6768 {
6769 #ifdef SUPPORT_UTF
6770 if (utf)
6771 {
6772 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6773 {
6774 start_match++;
6775 ACROSSCHAR(start_match < end_subject, *start_match,
6776 start_match++);
6777 }
6778 }
6779 else
6780 #endif
6781 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6782 start_match++;
6783
6784 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
6785 and we are now at a LF, advance the match position by one more character.
6786 */
6787
6788 if (start_match[-1] == CHAR_CR &&
6789 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
6790 start_match < end_subject &&
6791 *start_match == CHAR_NL)
6792 start_match++;
6793 }
6794 }
6795
6796 /* Or to a non-unique first byte after study */
6797
6798 else if (start_bits != NULL)
6799 {
6800 while (start_match < end_subject)
6801 {
6802 unsigned int c = *start_match;
6803 #ifndef COMPILE_PCRE8
6804 if (c > 255) c = 255;
6805 #endif
6806 if ((start_bits[c/8] & (1 << (c&7))) == 0)
6807 {
6808 start_match++;
6809 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
6810 /* In non 8-bit mode, the iteration will stop for
6811 characters > 255 at the beginning or not stop at all. */
6812 if (utf)
6813 ACROSSCHAR(start_match < end_subject, *start_match,
6814 start_match++);
6815 #endif
6816 }
6817 else break;
6818 }
6819 }
6820 } /* Starting optimizations */
6821
6822 /* Restore fudged end_subject */
6823
6824 end_subject = save_end_subject;
6825
6826 /* The following two optimizations are disabled for partial matching or if
6827 disabling is explicitly requested. */
6828
6829 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial)
6830 {
6831 /* If the pattern was studied, a minimum subject length may be set. This is
6832 a lower bound; there need not be any string of exactly that length that matches
6833 the pattern. Although the value is, strictly, in characters, we treat it as
6834 bytes to avoid spending too much time in this optimization. */
6835
6836 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
6837 (pcre_uint32)(end_subject - start_match) < study->minlength)
6838 {
6839 rc = MATCH_NOMATCH;
6840 break;
6841 }
6842
6843 /* If req_char is set, we know that that character must appear in the
6844 subject for the match to succeed. If the first character is set, req_char
6845 must be later in the subject; otherwise the test starts at the match point.
6846 This optimization can save a huge amount of backtracking in patterns with
6847 nested unlimited repeats that aren't going to match. Writing separate code
6848 for cased/caseless versions makes it go faster, as does using an
6849 autoincrement and backing off on a match.
6850
6851 HOWEVER: when the subject string is very, very long, searching to its end
6852 can take a long time, and give bad performance on quite ordinary patterns.
6853 This showed up when somebody was matching something like /^\d+C/ on a
6854 32-megabyte string... so we don't do this when the string is sufficiently
6855 long. */
6856
6857 if (has_req_char && end_subject - start_match < REQ_BYTE_MAX)
6858 {
6859 PCRE_PUCHAR p = start_match + (has_first_char? 1:0);
6860
6861 /* We don't need to repeat the search if we haven't yet reached the
6862 place we found it at last time. */
6863
6864 if (p > req_char_ptr)
6865 {
6866 if (req_char != req_char2)
6867 {
6868 while (p < end_subject)
6869 {
6870 int pp = *p++;
6871 if (pp == req_char || pp == req_char2) { p--; break; }
6872 }
6873 }
6874 else
6875 {
6876 while (p < end_subject)
6877 {
6878 if (*p++ == req_char) { p--; break; }
6879 }
6880 }
6881
6882 /* If we can't find the required character, break the matching loop,
6883 forcing a match failure. */
6884
6885 if (p >= end_subject)
6886 {
6887 rc = MATCH_NOMATCH;
6888 break;
6889 }
6890
6891 /* If we have found the required character, save the point where we
6892 found it, so that we don't search again next time round the loop if
6893 the start hasn't passed this character yet. */
6894
6895 req_char_ptr = p;
6896 }
6897 }
6898 }
6899
6900 #ifdef PCRE_DEBUG /* Sigh. Some compilers never learn. */
6901 printf(">>>> Match against: ");
6902 pchars(start_match, end_subject - start_match, TRUE, md);
6903 printf("\n");
6904 #endif
6905
6906 /* OK, we can now run the match. If "hitend" is set afterwards, remember the
6907 first starting point for which a partial match was found. */
6908
6909 md->start_match_ptr = start_match;
6910 md->start_used_ptr = start_match;
6911 md->match_call_count = 0;
6912 md->match_function_type = 0;
6913 md->end_offset_top = 0;
6914 rc = match(start_match, md->start_code, start_match, 2, md, NULL, 0);
6915 if (md->hitend && start_partial == NULL) start_partial = md->start_used_ptr;
6916
6917 switch(rc)
6918 {
6919 /* If MATCH_SKIP_ARG reaches this level it means that a MARK that matched
6920 the SKIP's arg was not found. In this circumstance, Perl ignores the SKIP
6921 entirely. The only way we can do that is to re-do the match at the same
6922 point, with a flag to force SKIP with an argument to be ignored. Just
6923 treating this case as NOMATCH does not work because it does not check other
6924 alternatives in patterns such as A(*SKIP:A)B|AC when the subject is AC. */
6925
6926 case MATCH_SKIP_ARG:
6927 new_start_match = start_match;
6928 md->ignore_skip_arg = TRUE;
6929 break;
6930
6931 /* SKIP passes back the next starting point explicitly, but if it is the
6932 same as the match we have just done, treat it as NOMATCH. */
6933
6934 case MATCH_SKIP:
6935 if (md->start_match_ptr != start_match)
6936 {
6937 new_start_match = md->start_match_ptr;
6938 break;
6939 }
6940 /* Fall through */
6941
6942 /* NOMATCH and PRUNE advance by one character. THEN at this level acts
6943 exactly like PRUNE. Unset the ignore SKIP-with-argument flag. */
6944
6945 case MATCH_NOMATCH:
6946 case MATCH_PRUNE:
6947 case MATCH_THEN:
6948 md->ignore_skip_arg = FALSE;
6949 new_start_match = start_match + 1;
6950 #ifdef SUPPORT_UTF
6951 if (utf)
6952 ACROSSCHAR(new_start_match < end_subject, *new_start_match,
6953 new_start_match++);
6954 #endif
6955 break;
6956
6957 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
6958
6959 case MATCH_COMMIT:
6960 rc = MATCH_NOMATCH;
6961 goto ENDLOOP;
6962
6963 /* Any other return is either a match, or some kind of error. */
6964
6965 default:
6966 goto ENDLOOP;
6967 }
6968
6969 /* Control reaches here for the various types of "no match at this point"
6970 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
6971
6972 rc = MATCH_NOMATCH;
6973
6974 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
6975 newline in the subject (though it may continue over the newline). Therefore,
6976 if we have just failed to match, starting at a newline, do not continue. */
6977
6978 if (firstline && IS_NEWLINE(start_match)) break;
6979
6980 /* Advance to new matching position */
6981
6982 start_match = new_start_match;
6983
6984 /* Break the loop if the pattern is anchored or if we have passed the end of
6985 the subject. */
6986
6987 if (anchored || start_match > end_subject) break;
6988
6989 /* If we have just passed a CR and we are now at a LF, and the pattern does
6990 not contain any explicit matches for \r or \n, and the newline option is CRLF
6991 or ANY or ANYCRLF, advance the match position by one more character. In
6992 normal matching start_match will always be greater than the first position at
6993 this stage, but a failed *SKIP can cause a return at the same point, which is
6994 why the first test exists. */
6995
6996 if (start_match > (PCRE_PUCHAR)subject + start_offset &&
6997 start_match[-1] == CHAR_CR &&
6998 start_match < end_subject &&
6999 *start_match == CHAR_NL &&
7000 (re->flags & PCRE_HASCRORLF) == 0 &&
7001 (md->nltype == NLTYPE_ANY ||
7002 md->nltype == NLTYPE_ANYCRLF ||
7003 md->nllen == 2))
7004 start_match++;
7005
7006 md->mark = NULL; /* Reset for start of next match attempt */
7007 } /* End of for(;;) "bumpalong" loop */
7008
7009 /* ==========================================================================*/
7010
7011 /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
7012 conditions is true:
7013
7014 (1) The pattern is anchored or the match was failed by (*COMMIT);
7015
7016 (2) We are past the end of the subject;
7017
7018 (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
7019 this option requests that a match occur at or before the first newline in
7020 the subject.
7021
7022 When we have a match and the offset vector is big enough to deal with any
7023 backreferences, captured substring offsets will already be set up. In the case
7024 where we had to get some local store to hold offsets for backreference
7025 processing, copy those that we can. In this case there need not be overflow if
7026 certain parts of the pattern were not used, even though there are more
7027 capturing parentheses than vector slots. */
7028
7029 ENDLOOP:
7030
7031 if (rc == MATCH_MATCH || rc == MATCH_ACCEPT)
7032 {
7033 if (using_temporary_offsets)
7034 {
7035 if (arg_offset_max >= 4)
7036 {
7037 memcpy(offsets + 2, md->offset_vector + 2,
7038 (arg_offset_max - 2) * sizeof(int));
7039 DPRINTF(("Copied offsets from temporary memory\n"));
7040 }
7041 if (md->end_offset_top > arg_offset_max) md->offset_overflow = TRUE;
7042 DPRINTF(("Freeing temporary memory\n"));
7043 (PUBL(free))(md->offset_vector);
7044 }
7045
7046 /* Set the return code to the number of captured strings, or 0 if there were
7047 too many to fit into the vector. */
7048
7049 rc = (md->offset_overflow && md->end_offset_top >= arg_offset_max)?
7050 0 : md->end_offset_top/2;
7051
7052 /* If there is space in the offset vector, set any unused pairs at the end of
7053 the pattern to -1 for backwards compatibility. It is documented that this
7054 happens. In earlier versions, the whole set of potential capturing offsets
7055 was set to -1 each time round the loop, but this is handled differently now.
7056 "Gaps" are set to -1 dynamically instead (this fixes a bug). Thus, it is only
7057 those at the end that need unsetting here. We can't just unset them all at
7058 the start of the whole thing because they may get set in one branch that is
7059 not the final matching branch. */
7060
7061 if (md->end_offset_top/2 <= re->top_bracket && offsets != NULL)
7062 {
7063 int *iptr, *iend;
7064 int resetcount = 2 + re->top_bracket * 2;
7065 if (resetcount > offsetcount) resetcount = offsetcount;
7066 iptr = offsets + md->end_offset_top;
7067 iend = offsets + resetcount;
7068 while (iptr < iend) *iptr++ = -1;
7069 }
7070
7071 /* If there is space, set up the whole thing as substring 0. The value of
7072 md->start_match_ptr might be modified if \K was encountered on the successful
7073 matching path. */
7074
7075 if (offsetcount < 2) rc = 0; else
7076 {
7077 offsets[0] = (int)(md->start_match_ptr - md->start_subject);
7078 offsets[1] = (int)(md->end_match_ptr - md->start_subject);
7079 }
7080
7081 /* Return MARK data if requested */
7082
7083 if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0)
7084 *(extra_data->mark) = (pcre_uchar *)md->mark;
7085 DPRINTF((">>>> returning %d\n", rc));
7086 #ifdef NO_RECURSE
7087 release_match_heapframes(&frame_zero);
7088 #endif
7089 return rc;
7090 }
7091
7092 /* Control gets here if there has been an error, or if the overall match
7093 attempt has failed at all permitted starting positions. */
7094
7095 if (using_temporary_offsets)
7096 {
7097 DPRINTF(("Freeing temporary memory\n"));
7098 (PUBL(free))(md->offset_vector);
7099 }
7100
7101 /* For anything other than nomatch or partial match, just return the code. */
7102
7103 if (rc != MATCH_NOMATCH && rc != PCRE_ERROR_PARTIAL)
7104 {
7105 DPRINTF((">>>> error: returning %d\n", rc));
7106 #ifdef NO_RECURSE
7107 release_match_heapframes(&frame_zero);
7108 #endif
7109 return rc;
7110 }
7111
7112 /* Handle partial matches - disable any mark data */
7113
7114 if (start_partial != NULL)
7115 {
7116 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
7117 md->mark = NULL;
7118 if (offsetcount > 1)
7119 {
7120 offsets[0] = (int)(start_partial - (PCRE_PUCHAR)subject);
7121 offsets[1] = (int)(end_subject - (PCRE_PUCHAR)subject);
7122 }
7123 rc = PCRE_ERROR_PARTIAL;
7124 }
7125
7126 /* This is the classic nomatch case */
7127
7128 else
7129 {
7130 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
7131 rc = PCRE_ERROR_NOMATCH;
7132 }
7133
7134 /* Return the MARK data if it has been requested. */
7135
7136 if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0)
7137 *(extra_data->mark) = (pcre_uchar *)md->nomatch_mark;
7138 #ifdef NO_RECURSE
7139 release_match_heapframes(&frame_zero);
7140 #endif
7141 return rc;
7142 }
7143
7144 /* End of pcre_exec.c */
7145