1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Original API code Copyright (c) 1997-2012 University of Cambridge
10 New API code Copyright (c) 2016 University of Cambridge
11
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
15
16 * Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
18
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
22
23 * Neither the name of the University of Cambridge nor the names of its
24 contributors may be used to endorse or promote products derived from
25 this software without specific prior written permission.
26
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
39 */
40
41
42 #ifdef HAVE_CONFIG_H
43 #include "config.h"
44 #endif
45
/* Names used by shared macros when they are expanded in this file. They are
plain textual aliases: NLBLOCK expands to the identifier "mb", and PSSTART and
PSEND expand to the field names "start_subject" and "end_subject". */

#define NLBLOCK mb             /* Block containing newline information */
#define PSSTART start_subject  /* Field containing processed string start */
#define PSEND   end_subject    /* Field containing processed string end */
49
50 #include "pcre2_internal.h"
51
/* Masks for identifying the public options that are permitted at match time.
PUBLIC_MATCH_OPTIONS is the set for the interpretive matcher;
PUBLIC_JIT_MATCH_OPTIONS is the same set without PCRE2_ANCHORED and
PCRE2_NO_JIT. */

#define PUBLIC_MATCH_OPTIONS \
  (PCRE2_ANCHORED|PCRE2_NOTBOL|PCRE2_NOTEOL|PCRE2_NOTEMPTY| \
   PCRE2_NOTEMPTY_ATSTART|PCRE2_NO_UTF_CHECK|PCRE2_PARTIAL_HARD| \
   PCRE2_PARTIAL_SOFT|PCRE2_NO_JIT)

#define PUBLIC_JIT_MATCH_OPTIONS \
   (PCRE2_NO_UTF_CHECK|PCRE2_NOTBOL|PCRE2_NOTEOL|PCRE2_NOTEMPTY|\
    PCRE2_NOTEMPTY_ATSTART|PCRE2_PARTIAL_SOFT|PCRE2_PARTIAL_HARD)
63
/* The mb->capture_last field uses the lower 16 bits for the last captured
substring (which can never be greater than 65535) and a bit in the top half
to mean "capture vector overflowed". This odd way of doing things was
implemented when it was realized that preserving and restoring the overflow bit
whenever the last capture number was saved/restored made for a neater
interface, and doing it this way saved on (a) another variable, which would
have increased the stack frame size (a big NO-NO in PCRE) and (b) another
separate set of save/restore instructions. The following defines are used in
implementing this. */

#define CAPLMASK    0x0000ffff    /* The bits used for last_capture */
#define OVFLMASK    0xffff0000    /* The bits used for the overflow flag */
#define OVFLBIT     0x00010000    /* The bit that is set for overflow */
77
/* Bits for setting in mb->match_function_type to indicate two special types
of call to match(). We do it this way to save on using another stack variable,
as stack usage is to be discouraged. The two values are distinct single bits,
so they can be set and cleared independently. */

#define MATCH_CONDASSERT     1  /* Called to check a condition assertion */
#define MATCH_CBEGROUP       2  /* Could-be-empty unlimited repeat group */
84
/* Non-error returns from the match() function. Error returns are externally
defined PCRE2_ERROR_xxx codes, which are all negative. */

#define MATCH_MATCH        1
#define MATCH_NOMATCH      0

/* Special internal returns from the match() function. Make them sufficiently
negative to avoid the external error codes. */

#define MATCH_ACCEPT       (-999)
#define MATCH_KETRPOS      (-998)
#define MATCH_ONCE         (-997)

/* The next 5 must be kept together and in sequence so that a test that checks
for any one of them can use a range. MATCH_BACKTRACK_MIN and
MATCH_BACKTRACK_MAX name the two ends of that range. */

#define MATCH_COMMIT       (-996)
#define MATCH_PRUNE        (-995)
#define MATCH_SKIP         (-994)
#define MATCH_SKIP_ARG     (-993)
#define MATCH_THEN         (-992)
#define MATCH_BACKTRACK_MAX MATCH_THEN
#define MATCH_BACKTRACK_MIN MATCH_COMMIT
106
/* Min and max values for the common repeats; for the maxima, 0 => infinity.
The two tables are parallel: entry i of rep_min and entry i of rep_max describe
the same repeat. */

static const char rep_min[] = { 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, };
static const char rep_max[] = { 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, };
111
/* Maximum number of ovector elements that can be saved on the system stack
when processing OP_RECURSE in non-HEAP_MATCH_RECURSE mode. If the ovector is
bigger, malloc() is used. This value should be a multiple of 3, because the
ovector length is always a multiple of 3. */

#define OP_RECURSE_STACK_SAVE_MAX 45
118
119
120
121 /*************************************************
122 * Match a back-reference *
123 *************************************************/
124
125 /* This function is called only when it is known that the offset lies within
126 the offsets that have so far been used in the match. Note that in caseless
127 UTF-8 mode, the number of subject bytes matched may be different to the number
128 of reference bytes. (In theory this could also happen in UTF-16 mode, but it
129 seems unlikely.)
130
131 Arguments:
132 offset index into the offset vector
133 offset_top top of the used offset vector
134 eptr pointer into the subject
135 mb points to match block
136 caseless TRUE if caseless
137 lengthptr pointer for returning the length matched
138
139 Returns: = 0 sucessful match; number of code units matched is set
140 < 0 no match
141 > 0 partial match
142 */
143
144 static int
match_ref(PCRE2_SIZE offset,PCRE2_SIZE offset_top,register PCRE2_SPTR eptr,match_block * mb,BOOL caseless,PCRE2_SIZE * lengthptr)145 match_ref(PCRE2_SIZE offset, PCRE2_SIZE offset_top, register PCRE2_SPTR eptr,
146 match_block *mb, BOOL caseless, PCRE2_SIZE *lengthptr)
147 {
148 #if defined SUPPORT_UNICODE
149 BOOL utf = (mb->poptions & PCRE2_UTF) != 0;
150 #endif
151
152 register PCRE2_SPTR p;
153 PCRE2_SIZE length;
154 PCRE2_SPTR eptr_start = eptr;
155
156 /* Deal with an unset group. The default is no match, but there is an option to
157 match an empty string. */
158
159 if (offset >= offset_top || mb->ovector[offset] == PCRE2_UNSET)
160 {
161 if ((mb->poptions & PCRE2_MATCH_UNSET_BACKREF) != 0)
162 {
163 *lengthptr = 0;
164 return 0; /* Match */
165 }
166 else return -1; /* No match */
167 }
168
169 /* Separate the caseless and UTF cases for speed. */
170
171 p = mb->start_subject + mb->ovector[offset];
172 length = mb->ovector[offset+1] - mb->ovector[offset];
173
174 if (caseless)
175 {
176 #if defined SUPPORT_UNICODE
177 if (utf)
178 {
179 /* Match characters up to the end of the reference. NOTE: the number of
180 code units matched may differ, because in UTF-8 there are some characters
181 whose upper and lower case versions code have different numbers of bytes.
182 For example, U+023A (2 bytes in UTF-8) is the upper case version of U+2C65
183 (3 bytes in UTF-8); a sequence of 3 of the former uses 6 bytes, as does a
184 sequence of two of the latter. It is important, therefore, to check the
185 length along the reference, not along the subject (earlier code did this
186 wrong). */
187
188 PCRE2_SPTR endptr = p + length;
189 while (p < endptr)
190 {
191 uint32_t c, d;
192 const ucd_record *ur;
193 if (eptr >= mb->end_subject) return 1; /* Partial match */
194 GETCHARINC(c, eptr);
195 GETCHARINC(d, p);
196 ur = GET_UCD(d);
197 if (c != d && c != (uint32_t)((int)d + ur->other_case))
198 {
199 const uint32_t *pp = PRIV(ucd_caseless_sets) + ur->caseset;
200 for (;;)
201 {
202 if (c < *pp) return -1; /* No match */
203 if (c == *pp++) break;
204 }
205 }
206 }
207 }
208 else
209 #endif
210
211 /* Not in UTF mode */
212
213 {
214 for (; length > 0; length--)
215 {
216 uint32_t cc, cp;
217 if (eptr >= mb->end_subject) return 1; /* Partial match */
218 cc = UCHAR21TEST(eptr);
219 cp = UCHAR21TEST(p);
220 if (TABLE_GET(cp, mb->lcc, cp) != TABLE_GET(cc, mb->lcc, cc))
221 return -1; /* No match */
222 p++;
223 eptr++;
224 }
225 }
226 }
227
228 /* In the caseful case, we can just compare the code units, whether or not we
229 are in UTF mode. */
230
231 else
232 {
233 for (; length > 0; length--)
234 {
235 if (eptr >= mb->end_subject) return 1; /* Partial match */
236 if (UCHAR21INCTEST(p) != UCHAR21INCTEST(eptr)) return -1; /*No match */
237 }
238 }
239
240 *lengthptr = eptr - eptr_start;
241 return 0; /* Match */
242 }
243
244
245
246 /***************************************************************************
247 ****************************************************************************
248 RECURSION IN THE match() FUNCTION
249
250 The match() function is highly recursive, though not every recursive call
251 increases the recursion depth. Nevertheless, some regular expressions can cause
252 it to recurse to a great depth. I was writing for Unix, so I just let it call
253 itself recursively. This uses the stack for saving everything that has to be
254 saved for a recursive call. On Unix, the stack can be large, and this works
255 fine.
256
257 It turns out that on some non-Unix-like systems there are problems with
258 programs that use a lot of stack. (This despite the fact that every last chip
259 has oodles of memory these days, and techniques for extending the stack have
260 been known for decades.) So....
261
There is a fudge, triggered by defining HEAP_MATCH_RECURSE, which avoids
recursive calls by keeping local variables that need to be preserved in blocks
of memory on the heap instead of on the stack. Macros are used to
achieve this so that the actual code doesn't look very different to what it
always used to.
267
268 The original heap-recursive code used longjmp(). However, it seems that this
269 can be very slow on some operating systems. Following a suggestion from Stan
270 Switzer, the use of longjmp() has been abolished, at the cost of having to
271 provide a unique number for each call to RMATCH. There is no way of generating
272 a sequence of numbers at compile time in C. I have given them names, to make
273 them stand out more clearly.
274
275 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
276 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
277 tests. Furthermore, not using longjmp() means that local dynamic variables
278 don't have indeterminate values; this has meant that the frame size can be
279 reduced because the result can be "passed back" by straight setting of the
280 variable instead of being passed in the frame.
281 ****************************************************************************
282 ***************************************************************************/
283
/* Numbers for RMATCH calls. Each RMATCH call site passes one of these as its
last argument; the values run consecutively from RM1 = 1 to RM68 = 68. When
this list is changed, the code at HEAP_RETURN below must be updated in sync. */

enum { RM1=1, RM2,  RM3,  RM4,  RM5,  RM6,  RM7,  RM8,  RM9,  RM10,
       RM11,  RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
       RM21,  RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
       RM31,  RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
       RM41,  RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
       RM51,  RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
       RM61,  RM62, RM63, RM64, RM65, RM66, RM67, RM68 };
294
/* These versions of the macros use the stack, as normal. RMATCH stores the
result of a genuine recursive call of match() in the local variable rrc, and
RRETURN is a plain return. Note that the "rw" argument of RMATCH isn't actually
used in this definition. */

#ifndef HEAP_MATCH_RECURSE
#define REGISTER register
#define RMATCH(ra,rb,rc,rd,re,rw) \
  rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1)
#define RRETURN(ra) return ra
#else

/* These versions of the macros manage a private stack on the heap. Note that
the "rd" argument of RMATCH isn't actually used in this definition. It's the mb
argument of match(), which never changes. */

#define REGISTER

/* "Call": reuse the frame already chained after the current one, or allocate
a new one through the match block's memory functions. Record in the current
frame where to resume (rw), fill in the new frame's arguments, make it current
and jump to HEAP_RECURSE. The label L_<rw> is the resume point. */

#define RMATCH(ra,rb,rc,rd,re,rw)\
  {\
  heapframe *newframe = frame->Xnextframe;\
  if (newframe == NULL)\
    {\
    newframe = (heapframe *)(mb->stack_memctl.malloc)\
      (sizeof(heapframe), mb->stack_memctl.memory_data);\
    if (newframe == NULL) RRETURN(PCRE2_ERROR_NOMEMORY);\
    newframe->Xnextframe = NULL;\
    frame->Xnextframe = newframe;\
    }\
  frame->Xwhere = rw;\
  newframe->Xeptr = ra;\
  newframe->Xecode = rb;\
  newframe->Xmstart = mstart;\
  newframe->Xoffset_top = rc;\
  newframe->Xeptrb = re;\
  newframe->Xrdepth = frame->Xrdepth + 1;\
  newframe->Xprevframe = frame;\
  frame = newframe;\
  goto HEAP_RECURSE;\
  L_##rw:;\
  }

/* "Return": step back to the previous frame. If there is one, pass the result
back in rrc and jump to HEAP_RETURN; if there is none, this was the outermost
frame, so really return from match(). */

#define RRETURN(ra)\
  {\
  heapframe *oldframe = frame;\
  frame = oldframe->Xprevframe;\
  if (frame != NULL)\
    {\
    rrc = ra;\
    goto HEAP_RETURN;\
    }\
  return ra;\
  }


/* Structure for remembering the local variables in a private frame. Arrange it
so as to minimize the number of holes. */

typedef struct heapframe {
  struct heapframe *Xprevframe;   /* Frame to go back to on RRETURN */
  struct heapframe *Xnextframe;   /* Already-allocated frame for the next RMATCH, or NULL */

  /* Function arguments that may change, and local variables */

#ifdef SUPPORT_UNICODE
  PCRE2_SPTR Xcharptr;
#endif
  PCRE2_SPTR Xeptr;
  PCRE2_SPTR Xecode;
  PCRE2_SPTR Xmstart;
  PCRE2_SPTR Xcallpat;
  PCRE2_SPTR Xdata;
  PCRE2_SPTR Xnext_ecode;
  PCRE2_SPTR Xpp;
  PCRE2_SPTR Xprev;
  PCRE2_SPTR Xsaved_eptr;

  eptrblock *Xeptrb;

  PCRE2_SIZE Xlength;
  PCRE2_SIZE Xoffset;
  PCRE2_SIZE Xoffset_top;
  PCRE2_SIZE Xsave_offset1, Xsave_offset2, Xsave_offset3;

  uint32_t Xfc;
  uint32_t Xnumber;
  uint32_t Xrdepth;
  uint32_t Xop;
  uint32_t Xsave_capture_last;

#ifdef SUPPORT_UNICODE
  uint32_t Xprop_value;
  int Xprop_type;
  int Xprop_fail_result;
  int Xoclength;
#endif

  int Xcodelink;
  int Xctype;
  int Xfi;
  int Xmax;
  int Xmin;
  int Xwhere;    /* Where to jump back to */

  BOOL Xcondition;
  BOOL Xcur_is_word;
  BOOL Xprev_is_word;

  eptrblock Xnewptrb;
  recursion_info Xnew_recursive;

#ifdef SUPPORT_UNICODE
  PCRE2_UCHAR Xocchars[6];
#endif
} heapframe;

#endif
408
409
410 /***************************************************************************
411 ***************************************************************************/
412
413
414 /* When HEAP_MATCH_RECURSE is not defined, the match() function implements
415 backtrack points by calling itself recursively in all but one case. The one
416 special case is when processing OP_RECURSE, which specifies recursion in the
417 pattern. The entire ovector must be saved and restored while processing
418 OP_RECURSE. If the ovector is small enough, instead of calling match()
419 directly, op_recurse_ovecsave() is called. This function uses the system stack
420 to save the ovector while calling match() to process the pattern recursion. */
421
422 #ifndef HEAP_MATCH_RECURSE
423
424 /* We need a prototype for match() because it is mutually recursive with
425 op_recurse_ovecsave(). */
426
427 static int
428 match(REGISTER PCRE2_SPTR eptr, REGISTER PCRE2_SPTR ecode, PCRE2_SPTR mstart,
429 PCRE2_SIZE offset_top, match_block *mb, eptrblock *eptrb, uint32_t rdepth);
430
431
432 /*************************************************
433 * Process OP_RECURSE, stacking ovector *
434 *************************************************/
435
436 /* When this function is called, mb->recursive has already been updated to
437 point to a new recursion data block, and all its fields other than ovec_save
438 have been set.
439
440 This function exists so that the local vector variable ovecsave is no longer
441 defined in the match() function, as it was in PCRE1. It is used only when there
442 is recursion in the pattern, so it wastes a lot of stack to have it defined for
443 every call of match(). We now use this function as an indirect way of calling
444 match() only in the case when ovecsave is needed. (David Wheeler used to say
445 "All problems in computer science can be solved by another level of
446 indirection.")
447
448 HOWEVER: when this file is compiled by gcc in an optimizing mode, because this
449 function is called only once, and only from within match(), gcc will "inline"
450 it - that is, move it inside match() - and this completely negates its reason
451 for existence. Therefore, we mark it as non-inline when gcc is in use.
452
453 Arguments:
454 eptr pointer to current character in subject
455 callpat the recursion point in the pattern
456 mstart pointer to the current match start position (can be modified
457 by encountering \K)
458 offset_top current top pointer (highest ovector offset used + 1)
459 mb pointer to "static" info block for the match
460 eptrb pointer to chain of blocks containing eptr at start of
461 brackets - for testing for empty matches
462 rdepth the recursion depth
463
464 Returns: a match() return code
465 */
466
467 static int
468 #if defined(__GNUC__) && !defined(__INTEL_COMPILER)
469 __attribute__ ((noinline))
470 #endif
op_recurse_ovecsave(REGISTER PCRE2_SPTR eptr,PCRE2_SPTR callpat,PCRE2_SPTR mstart,PCRE2_SIZE offset_top,match_block * mb,eptrblock * eptrb,uint32_t rdepth)471 op_recurse_ovecsave(REGISTER PCRE2_SPTR eptr, PCRE2_SPTR callpat,
472 PCRE2_SPTR mstart, PCRE2_SIZE offset_top, match_block *mb, eptrblock *eptrb,
473 uint32_t rdepth)
474 {
475 register int rrc;
476 BOOL cbegroup = *callpat >= OP_SBRA;
477 recursion_info *new_recursive = mb->recursive;
478 PCRE2_SIZE ovecsave[OP_RECURSE_STACK_SAVE_MAX];
479
480 /* Save the ovector */
481
482 new_recursive->ovec_save = ovecsave;
483 memcpy(ovecsave, mb->ovector, mb->offset_end * sizeof(PCRE2_SIZE));
484
485 /* Do the recursion. After processing each alternative, restore the ovector
486 data and the last captured value. */
487
488 do
489 {
490 if (cbegroup) mb->match_function_type |= MATCH_CBEGROUP;
491 rrc = match(eptr, callpat + PRIV(OP_lengths)[*callpat], mstart, offset_top,
492 mb, eptrb, rdepth + 1);
493 memcpy(mb->ovector, new_recursive->ovec_save,
494 mb->offset_end * sizeof(PCRE2_SIZE));
495 mb->capture_last = new_recursive->saved_capture_last;
496 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) return rrc;
497
498 /* PCRE does not allow THEN, SKIP, PRUNE or COMMIT to escape beyond a
499 recursion; they cause a NOMATCH for the entire recursion. These codes
500 are defined in a range that can be tested for. */
501
502 if (rrc >= MATCH_BACKTRACK_MIN && rrc <= MATCH_BACKTRACK_MAX)
503 return MATCH_NOMATCH;
504
505 /* Any return code other than NOMATCH is an error. Otherwise, advance to the
506 next alternative or to the end of the recursing subpattern. If there were
507 nested recursions, mb->recursive might be changed, so reset it before
508 looping. */
509
510 if (rrc != MATCH_NOMATCH) return rrc;
511 mb->recursive = new_recursive;
512 callpat += GET(callpat, 1);
513 }
514 while (*callpat == OP_ALT); /* Loop for the alternatives */
515
516 /* None of the alternatives matched. */
517
518 return MATCH_NOMATCH;
519 }
520 #endif /* HEAP_MATCH_RECURSE */
521
522
523
524 /*************************************************
525 * Match from current position *
526 *************************************************/
527
528 /* This function is called recursively in many circumstances. Whenever it
529 returns a negative (error) response, the outer incarnation must also return the
530 same response. */
531
/* These macros pack up tests that are used for partial matching, and which
appear several times in the code. We set the "hit end" flag if the pointer is
at the end of the subject and also past the earliest inspected character (i.e.
something has been matched, even if not part of the actual matched string). For
hard partial matching (mb->partial greater than 1), we then return
PCRE2_ERROR_PARTIAL immediately. The second one is used when we already know we
are past the end of the subject, so it omits the end-of-subject test.

Both macros expand to a bare "if" statement that refers to the local names mb
and eptr and uses RRETURN, so they can only be used inside match(). */

#define CHECK_PARTIAL()\
  if (mb->partial != 0 && eptr >= mb->end_subject && \
      eptr > mb->start_used_ptr) \
    { \
    mb->hitend = TRUE; \
    if (mb->partial > 1) RRETURN(PCRE2_ERROR_PARTIAL); \
    }

#define SCHECK_PARTIAL()\
  if (mb->partial != 0 && eptr > mb->start_used_ptr) \
    { \
    mb->hitend = TRUE; \
    if (mb->partial > 1) RRETURN(PCRE2_ERROR_PARTIAL); \
    }
553
554
555 /* Performance note: It might be tempting to extract commonly used fields from
556 the mb structure (e.g. utf, end_subject) into individual variables to improve
557 performance. Tests using gcc on a SPARC disproved this; in the first case, it
558 made performance worse.
559
560 Arguments:
561 eptr pointer to current character in subject
562 ecode pointer to current position in compiled code
563 mstart pointer to the current match start position (can be modified
564 by encountering \K)
565 offset_top current top pointer (highest ovector offset used + 1)
566 mb pointer to "static" info block for the match
567 eptrb pointer to chain of blocks containing eptr at start of
568 brackets - for testing for empty matches
569 rdepth the recursion depth
570
571 Returns: MATCH_MATCH if matched ) these values are >= 0
572 MATCH_NOMATCH if failed to match )
573 a negative MATCH_xxx value for PRUNE, SKIP, etc
574 a negative PCRE2_ERROR_xxx value if aborted by an error condition
575 (e.g. stopped by repeated call or recursion limit)
576 */
577
578 static int
match(REGISTER PCRE2_SPTR eptr,REGISTER PCRE2_SPTR ecode,PCRE2_SPTR mstart,PCRE2_SIZE offset_top,match_block * mb,eptrblock * eptrb,uint32_t rdepth)579 match(REGISTER PCRE2_SPTR eptr, REGISTER PCRE2_SPTR ecode, PCRE2_SPTR mstart,
580 PCRE2_SIZE offset_top, match_block *mb, eptrblock *eptrb, uint32_t rdepth)
581 {
582 /* These variables do not need to be preserved over recursion in this function,
583 so they can be ordinary variables in all cases. Mark some of them with
584 "register" because they are used a lot in loops. */
585
586 register int rrc; /* Returns from recursive calls */
587 register int i; /* Used for loops not involving calls to RMATCH() */
588 register uint32_t c; /* Character values not kept over RMATCH() calls */
589 register BOOL utf; /* Local copy of UTF flag for speed */
590
591 BOOL minimize, possessive; /* Quantifier options */
592 BOOL caseless;
593 int condcode;
594
595 /* When recursion is not being used, all "local" variables that have to be
596 preserved over calls to RMATCH() are part of a "frame". We set up the top-level
597 frame on the stack here; subsequent instantiations are obtained from the heap
598 whenever RMATCH() does a "recursion". See the macro definitions above. Putting
599 the top-level on the stack rather than malloc-ing them all gives a performance
600 boost in many cases where there is not much "recursion". */
601
602 #ifdef HEAP_MATCH_RECURSE
603 heapframe *frame = (heapframe *)mb->match_frames_base;
604
605 /* Copy in the original argument variables */
606
607 frame->Xeptr = eptr;
608 frame->Xecode = ecode;
609 frame->Xmstart = mstart;
610 frame->Xoffset_top = offset_top;
611 frame->Xeptrb = eptrb;
612 frame->Xrdepth = rdepth;
613
/* This is where control jumps back to, in order to effect "recursion" */
615
616 HEAP_RECURSE:
617
618 /* Macros make the argument variables come from the current frame */
619
620 #define eptr frame->Xeptr
621 #define ecode frame->Xecode
622 #define mstart frame->Xmstart
623 #define offset_top frame->Xoffset_top
624 #define eptrb frame->Xeptrb
625 #define rdepth frame->Xrdepth
626
627 /* Ditto for the local variables */
628
629 #ifdef SUPPORT_UNICODE
630 #define charptr frame->Xcharptr
631 #define prop_value frame->Xprop_value
632 #define prop_type frame->Xprop_type
633 #define prop_fail_result frame->Xprop_fail_result
634 #define oclength frame->Xoclength
635 #define occhars frame->Xocchars
636 #endif
637
638
639 #define callpat frame->Xcallpat
640 #define codelink frame->Xcodelink
641 #define data frame->Xdata
642 #define next_ecode frame->Xnext_ecode
643 #define pp frame->Xpp
644 #define prev frame->Xprev
645 #define saved_eptr frame->Xsaved_eptr
646
647 #define new_recursive frame->Xnew_recursive
648
649 #define ctype frame->Xctype
650 #define fc frame->Xfc
651 #define fi frame->Xfi
652 #define length frame->Xlength
653 #define max frame->Xmax
654 #define min frame->Xmin
655 #define number frame->Xnumber
656 #define offset frame->Xoffset
657 #define op frame->Xop
658 #define save_capture_last frame->Xsave_capture_last
659 #define save_offset1 frame->Xsave_offset1
660 #define save_offset2 frame->Xsave_offset2
661 #define save_offset3 frame->Xsave_offset3
662
663 #define condition frame->Xcondition
664 #define cur_is_word frame->Xcur_is_word
665 #define prev_is_word frame->Xprev_is_word
666
667 #define newptrb frame->Xnewptrb
668
669 /* When normal stack-based recursion is being used for match(), local variables
670 are allocated on the stack and get preserved during recursion in the usual way.
671 In this environment, fi and i, and fc and c, can be the same variables. */
672
673 #else /* HEAP_MATCH_RECURSE not defined */
674 #define fi i
675 #define fc c
676
677 /* Many of the following variables are used only in small blocks of the code.
678 My normal style of coding would have declared them within each of those blocks.
679 However, in order to accommodate the version of this code that uses an external
680 "stack" implemented on the heap, it is easier to declare them all here, so the
681 declarations can be cut out in a block. The only declarations within blocks
682 below are for variables that do not have to be preserved over a recursive call
683 to RMATCH(). */
684
685 #ifdef SUPPORT_UNICODE
686 PCRE2_SPTR charptr;
687 #endif
688 PCRE2_SPTR callpat;
689 PCRE2_SPTR data;
690 PCRE2_SPTR next_ecode;
691 PCRE2_SPTR pp;
692 PCRE2_SPTR prev;
693 PCRE2_SPTR saved_eptr;
694
695 PCRE2_SIZE length;
696 PCRE2_SIZE offset;
697 PCRE2_SIZE save_offset1, save_offset2, save_offset3;
698
699 uint32_t number;
700 uint32_t op;
701 uint32_t save_capture_last;
702
703 #ifdef SUPPORT_UNICODE
704 uint32_t prop_value;
705 int prop_type;
706 int prop_fail_result;
707 int oclength;
708 PCRE2_UCHAR occhars[6];
709 #endif
710
711 int codelink;
712 int ctype;
713 int max;
714 int min;
715
716 BOOL condition;
717 BOOL cur_is_word;
718 BOOL prev_is_word;
719
720 eptrblock newptrb;
721 recursion_info new_recursive;
722 #endif /* HEAP_MATCH_RECURSE not defined */
723
724 /* To save space on the stack and in the heap frame, I have doubled up on some
725 of the local variables that are used only in localised parts of the code, but
726 still need to be preserved over recursive calls of match(). These macros define
727 the alternative names that are used. */
728
729 #define allow_zero cur_is_word
730 #define cbegroup condition
731 #define code_offset codelink
732 #define condassert condition
733 #define foc number
734 #define matched_once prev_is_word
735 #define save_mark data
736
/* These statements are here to stop the compiler complaining about
uninitialized variables. */
739
740 #ifdef SUPPORT_UNICODE
741 prop_value = 0;
742 prop_fail_result = 0;
743 #endif
744
745
746 /* This label is used for tail recursion, which is used in a few cases even
747 when HEAP_MATCH_RECURSE is not defined, in order to reduce the amount of stack
748 that is used. Thanks to Ian Taylor for noticing this possibility and sending
749 the original patch. */
750
751 TAIL_RECURSE:
752
753 /* OK, now we can get on with the real code of the function. Recursive calls
754 are specified by the macro RMATCH and RRETURN is used to return. When
755 HEAP_MATCH_RECURSE is *not* defined, these just turn into a recursive call to
756 match() and a "return", respectively. However, RMATCH isn't like a function
757 call because it's quite a complicated macro. It has to be used in one
758 particular way. This shouldn't, however, impact performance when true recursion
759 is being used. */
760
761 #ifdef SUPPORT_UNICODE
762 utf = (mb->poptions & PCRE2_UTF) != 0;
763 #else
764 utf = FALSE;
765 #endif
766
767 /* First check that we haven't called match() too many times, or that we
768 haven't exceeded the recursive call limit. */
769
770 if (mb->match_call_count++ >= mb->match_limit) RRETURN(PCRE2_ERROR_MATCHLIMIT);
771 if (rdepth >= mb->match_limit_recursion) RRETURN(PCRE2_ERROR_RECURSIONLIMIT);
772
773 /* At the start of a group with an unlimited repeat that may match an empty
774 string, the variable mb->match_function_type contains the MATCH_CBEGROUP bit.
775 It is done this way to save having to use another function argument, which
776 would take up space on the stack. See also MATCH_CONDASSERT below.
777
778 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
779 such remembered pointers, to be checked when we hit the closing ket, in order
780 to break infinite loops that match no characters. When match() is called in
781 other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
782 NOT be used with tail recursion, because the memory block that is used is on
783 the stack, so a new one may be required for each match(). */
784
785 if ((mb->match_function_type & MATCH_CBEGROUP) != 0)
786 {
787 newptrb.epb_saved_eptr = eptr;
788 newptrb.epb_prev = eptrb;
789 eptrb = &newptrb;
790 mb->match_function_type &= ~MATCH_CBEGROUP;
791 }
792
793 /* Now, at last, we can start processing the opcodes. */
794
795 for (;;)
796 {
797 minimize = possessive = FALSE;
798 op = *ecode;
799
800 switch(op)
801 {
802 case OP_MARK:
803 mb->nomatch_mark = ecode + 2;
804 mb->mark = NULL; /* In case previously set by assertion */
805 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, mb,
806 eptrb, RM55);
807 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
808 mb->mark == NULL) mb->mark = ecode + 2;
809
810 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
811 argument, and we must check whether that argument matches this MARK's
812 argument. It is passed back in mb->start_match_ptr (an overloading of that
813 variable). If it does match, we reset that variable to the current subject
814 position and return MATCH_SKIP. Otherwise, pass back the return code
815 unaltered. */
816
817 else if (rrc == MATCH_SKIP_ARG &&
818 PRIV(strcmp)(ecode + 2, mb->start_match_ptr) == 0)
819 {
820 mb->start_match_ptr = eptr;
821 RRETURN(MATCH_SKIP);
822 }
823 RRETURN(rrc);
824
825 case OP_FAIL:
826 RRETURN(MATCH_NOMATCH);
827
828 case OP_COMMIT:
829 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, mb,
830 eptrb, RM52);
831 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
832 RRETURN(MATCH_COMMIT);
833
834 case OP_PRUNE:
835 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, mb,
836 eptrb, RM51);
837 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
838 RRETURN(MATCH_PRUNE);
839
840 case OP_PRUNE_ARG:
841 mb->nomatch_mark = ecode + 2;
842 mb->mark = NULL; /* In case previously set by assertion */
843 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, mb,
844 eptrb, RM56);
845 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
846 mb->mark == NULL) mb->mark = ecode + 2;
847 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
848 RRETURN(MATCH_PRUNE);
849
850 case OP_SKIP:
851 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, mb,
852 eptrb, RM53);
853 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
854 mb->start_match_ptr = eptr; /* Pass back current position */
855 RRETURN(MATCH_SKIP);
856
857 /* Note that, for Perl compatibility, SKIP with an argument does NOT set
858 nomatch_mark. When a pattern match ends with a SKIP_ARG for which there was
859 not a matching mark, we have to re-run the match, ignoring the SKIP_ARG
860 that failed and any that precede it (either they also failed, or were not
861 triggered). To do this, we maintain a count of executed SKIP_ARGs. If a
862 SKIP_ARG gets to top level, the match is re-run with mb->ignore_skip_arg
863 set to the count of the one that failed. */
864
865 case OP_SKIP_ARG:
866 mb->skip_arg_count++;
867 if (mb->skip_arg_count <= mb->ignore_skip_arg)
868 {
869 ecode += PRIV(OP_lengths)[*ecode] + ecode[1];
870 break;
871 }
872 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, mb,
873 eptrb, RM57);
874 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
875
876 /* Pass back the current skip name by overloading mb->start_match_ptr and
877 returning the special MATCH_SKIP_ARG return code. This will either be
878 caught by a matching MARK, or get to the top, where it causes a rematch
879 with mb->ignore_skip_arg set to the value of mb->skip_arg_count. */
880
881 mb->start_match_ptr = ecode + 2;
882 RRETURN(MATCH_SKIP_ARG);
883
884 /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that
885 the branch in which it occurs can be determined. Overload the start of
886 match pointer to do this. */
887
888 case OP_THEN:
889 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, mb,
890 eptrb, RM54);
891 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
892 mb->start_match_ptr = ecode;
893 RRETURN(MATCH_THEN);
894
895 case OP_THEN_ARG:
896 mb->nomatch_mark = ecode + 2;
897 mb->mark = NULL; /* In case previously set by assertion */
898 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top,
899 mb, eptrb, RM58);
900 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
901 mb->mark == NULL) mb->mark = ecode + 2;
902 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
903 mb->start_match_ptr = ecode;
904 RRETURN(MATCH_THEN);
905
906 /* Handle an atomic group that does not contain any capturing parentheses.
907 This can be handled like an assertion. Prior to 8.13, all atomic groups
908 were handled this way. In 8.13, the code was changed as below for ONCE, so
909 that backups pass through the group and thereby reset captured values.
910 However, this uses a lot more stack, so in 8.20, atomic groups that do not
911 contain any captures generate OP_ONCE_NC, which can be handled in the old,
912 less stack intensive way.
913
914 Check the alternative branches in turn - the matching won't pass the KET
915 for this kind of subpattern. If any one branch matches, we carry on as at
916 the end of a normal bracket, leaving the subject pointer, but resetting
917 the start-of-match value in case it was changed by \K. */
918
919 case OP_ONCE_NC:
920 prev = ecode;
921 saved_eptr = eptr;
922 save_mark = mb->mark;
923 do
924 {
925 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, mb, eptrb, RM64);
926 if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
927 {
928 mstart = mb->start_match_ptr;
929 break;
930 }
931 if (rrc == MATCH_THEN)
932 {
933 next_ecode = ecode + GET(ecode,1);
934 if (mb->start_match_ptr < next_ecode &&
935 (*ecode == OP_ALT || *next_ecode == OP_ALT))
936 rrc = MATCH_NOMATCH;
937 }
938
939 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
940 ecode += GET(ecode,1);
941 mb->mark = save_mark;
942 }
943 while (*ecode == OP_ALT);
944
945 /* If we hit the end of the group (which could be repeated), fail */
946
947 if (*ecode != OP_ONCE_NC && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
948
949 /* Continue as from after the group, updating the offsets high water
950 mark, since extracts may have been taken. */
951
952 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
953
954 offset_top = mb->end_offset_top;
955 eptr = mb->end_match_ptr;
956
957 /* For a non-repeating ket, just continue at this level. This also
958 happens for a repeating ket if no characters were matched in the group.
959 This is the forcible breaking of infinite loops as implemented in Perl
960 5.005. */
961
962 if (*ecode == OP_KET || eptr == saved_eptr)
963 {
964 ecode += 1+LINK_SIZE;
965 break;
966 }
967
968 /* The repeating kets try the rest of the pattern or restart from the
969 preceding bracket, in the appropriate order. The second "call" of match()
970 uses tail recursion, to avoid using another stack frame. */
971
972 if (*ecode == OP_KETRMIN)
973 {
974 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, mb, eptrb, RM65);
975 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
976 ecode = prev;
977 goto TAIL_RECURSE;
978 }
979 else /* OP_KETRMAX */
980 {
981 RMATCH(eptr, prev, offset_top, mb, eptrb, RM66);
982 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
983 ecode += 1 + LINK_SIZE;
984 goto TAIL_RECURSE;
985 }
986 /* Control never gets here */
987
988 /* Handle a capturing bracket, other than those that are possessive with an
989 unlimited repeat. If there is space in the offset vector, save the current
990 subject position in the working slot at the top of the vector. We mustn't
991 change the current values of the data slot, because they may be set from a
992 previous iteration of this group, and be referred to by a reference inside
993 the group. A failure to match might occur after the group has succeeded,
994 if something later on doesn't match. For this reason, we need to restore
995 the working value and also the values of the final offsets, in case they
996 were set by a previous iteration of the same bracket.
997
998 If there isn't enough space in the offset vector, treat this as if it were
999 a non-capturing bracket. Don't worry about setting the flag for the error
1000 case here; that is handled in the code for KET. */
1001
1002 case OP_CBRA:
1003 case OP_SCBRA:
1004 number = GET2(ecode, 1+LINK_SIZE);
1005 offset = number << 1;
1006
1007 if (offset < mb->offset_max)
1008 {
1009 save_offset1 = mb->ovector[offset];
1010 save_offset2 = mb->ovector[offset+1];
1011 save_offset3 = mb->ovector[mb->offset_end - number];
1012 save_capture_last = mb->capture_last;
1013 save_mark = mb->mark;
1014
1015 mb->ovector[mb->offset_end - number] = eptr - mb->start_subject;
1016
1017 for (;;)
1018 {
1019 if (op >= OP_SBRA) mb->match_function_type |= MATCH_CBEGROUP;
1020 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, mb,
1021 eptrb, RM1);
1022 if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */
1023
1024 /* If we backed up to a THEN, check whether it is within the current
1025 branch by comparing the address of the THEN that is passed back with
1026 the end of the branch. If it is within the current branch, and the
1027 branch is one of two or more alternatives (it either starts or ends
1028 with OP_ALT), we have reached the limit of THEN's action, so convert
1029 the return code to NOMATCH, which will cause normal backtracking to
1030 happen from now on. Otherwise, THEN is passed back to an outer
1031 alternative. This implements Perl's treatment of parenthesized groups,
1032 where a group not containing | does not affect the current alternative,
1033 that is, (X) is NOT the same as (X|(*F)). */
1034
1035 if (rrc == MATCH_THEN)
1036 {
1037 next_ecode = ecode + GET(ecode,1);
1038 if (mb->start_match_ptr < next_ecode &&
1039 (*ecode == OP_ALT || *next_ecode == OP_ALT))
1040 rrc = MATCH_NOMATCH;
1041 }
1042
1043 /* Anything other than NOMATCH is passed back. */
1044
1045 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1046 mb->capture_last = save_capture_last;
1047 ecode += GET(ecode, 1);
1048 mb->mark = save_mark;
1049 if (*ecode != OP_ALT) break;
1050 }
1051
1052 mb->ovector[offset] = save_offset1;
1053 mb->ovector[offset+1] = save_offset2;
1054 mb->ovector[mb->offset_end - number] = save_offset3;
1055
1056 /* At this point, rrc will be one of MATCH_ONCE or MATCH_NOMATCH. */
1057
1058 RRETURN(rrc);
1059 }
1060
1061 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1062 as a non-capturing bracket. */
1063
1064 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1065 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1066
1067 /* Non-capturing or atomic group, except for possessive with unlimited
1068 repeat and ONCE group with no captures. Loop for all the alternatives.
1069
1070 When we get to the final alternative within the brackets, we used to return
1071 the result of a recursive call to match() whatever happened so it was
1072 possible to reduce stack usage by turning this into a tail recursion,
1073 except in the case of a possibly empty group. However, now that there is
1074 the possibility of (*THEN) occurring in the final alternative, this
1075 optimization is no longer always possible.
1076
1077 We can optimize if we know there are no (*THEN)s in the pattern; at present
1078 this is the best that can be done.
1079
1080 MATCH_ONCE is returned when the end of an atomic group is successfully
1081 reached, but subsequent matching fails. It passes back up the tree (causing
1082 captured values to be reset) until the original atomic group level is
1083 reached. This is tested by comparing mb->once_target with the start of the
1084 group. At this point, the return is converted into MATCH_NOMATCH so that
1085 previous backup points can be taken. */
1086
1087 case OP_ONCE:
1088 case OP_BRA:
1089 case OP_SBRA:
1090
1091 for (;;)
1092 {
1093 if (op >= OP_SBRA || op == OP_ONCE)
1094 mb->match_function_type |= MATCH_CBEGROUP;
1095
1096 /* If this is not a possibly empty group, and there are no (*THEN)s in
1097 the pattern, and this is the final alternative, optimize as described
1098 above. */
1099
1100 else if (!mb->hasthen && ecode[GET(ecode, 1)] != OP_ALT)
1101 {
1102 ecode += PRIV(OP_lengths)[*ecode];
1103 goto TAIL_RECURSE;
1104 }
1105
1106 /* In all other cases, we have to make another call to match(). */
1107
1108 save_mark = mb->mark;
1109 save_capture_last = mb->capture_last;
1110 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, mb, eptrb,
1111 RM2);
1112
1113 /* See comment in the code for capturing groups above about handling
1114 THEN. */
1115
1116 if (rrc == MATCH_THEN)
1117 {
1118 next_ecode = ecode + GET(ecode,1);
1119 if (mb->start_match_ptr < next_ecode &&
1120 (*ecode == OP_ALT || *next_ecode == OP_ALT))
1121 rrc = MATCH_NOMATCH;
1122 }
1123
1124 if (rrc != MATCH_NOMATCH)
1125 {
1126 if (rrc == MATCH_ONCE)
1127 {
1128 PCRE2_SPTR scode = ecode;
1129 if (*scode != OP_ONCE) /* If not at start, find it */
1130 {
1131 while (*scode == OP_ALT) scode += GET(scode, 1);
1132 scode -= GET(scode, 1);
1133 }
1134 if (mb->once_target == scode) rrc = MATCH_NOMATCH;
1135 }
1136 RRETURN(rrc);
1137 }
1138 ecode += GET(ecode, 1);
1139 mb->mark = save_mark;
1140 if (*ecode != OP_ALT) break;
1141 mb->capture_last = save_capture_last;
1142 }
1143
1144 RRETURN(MATCH_NOMATCH);
1145
1146 /* Handle possessive capturing brackets with an unlimited repeat. We come
1147 here from BRAZERO with allow_zero set TRUE. The ovector values are
1148 handled similarly to the normal case above. However, the matching is
1149 different. The end of these brackets will always be OP_KETRPOS, which
1150 returns MATCH_KETRPOS without going further in the pattern. By this means
1151 we can handle the group by iteration rather than recursion, thereby
1152 reducing the amount of stack needed. If the ovector is too small for
1153 capturing, treat as non-capturing. */
1154
1155 case OP_CBRAPOS:
1156 case OP_SCBRAPOS:
1157 allow_zero = FALSE;
1158
1159 POSSESSIVE_CAPTURE:
1160 number = GET2(ecode, 1+LINK_SIZE);
1161 offset = number << 1;
1162 if (offset >= mb->offset_max) goto POSSESSIVE_NON_CAPTURE;
1163
1164 matched_once = FALSE;
1165 code_offset = (int)(ecode - mb->start_code);
1166
1167 save_offset1 = mb->ovector[offset];
1168 save_offset2 = mb->ovector[offset+1];
1169 save_offset3 = mb->ovector[mb->offset_end - number];
1170 save_capture_last = mb->capture_last;
1171
1172 /* Each time round the loop, save the current subject position for use
1173 when the group matches. For MATCH_MATCH, the group has matched, so we
1174 restart it with a new subject starting position, remembering that we had
1175 at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
1176 usual. If we haven't matched any alternatives in any iteration, check to
1177 see if a previous iteration matched. If so, the group has matched;
1178 continue from afterwards. Otherwise it has failed; restore the previous
1179 capture values before returning NOMATCH. */
1180
1181 for (;;)
1182 {
1183 mb->ovector[mb->offset_end - number] = eptr - mb->start_subject;
1184 if (op >= OP_SBRA) mb->match_function_type |= MATCH_CBEGROUP;
1185 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, mb,
1186 eptrb, RM63);
1187 if (rrc == MATCH_KETRPOS)
1188 {
1189 offset_top = mb->end_offset_top;
1190 ecode = mb->start_code + code_offset;
1191 save_capture_last = mb->capture_last;
1192 matched_once = TRUE;
1193 mstart = mb->start_match_ptr; /* In case \K changed it */
1194 if (eptr == mb->end_match_ptr) /* Matched an empty string */
1195 {
1196 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1197 break;
1198 }
1199 eptr = mb->end_match_ptr;
1200 continue;
1201 }
1202
1203 /* See comment in the code for capturing groups above about handling
1204 THEN. */
1205
1206 if (rrc == MATCH_THEN)
1207 {
1208 next_ecode = ecode + GET(ecode,1);
1209 if (mb->start_match_ptr < next_ecode &&
1210 (*ecode == OP_ALT || *next_ecode == OP_ALT))
1211 rrc = MATCH_NOMATCH;
1212 }
1213
1214 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1215 mb->capture_last = save_capture_last;
1216 ecode += GET(ecode, 1);
1217 if (*ecode != OP_ALT) break;
1218 }
1219
1220 if (!matched_once)
1221 {
1222 mb->ovector[offset] = save_offset1;
1223 mb->ovector[offset+1] = save_offset2;
1224 mb->ovector[mb->offset_end - number] = save_offset3;
1225 }
1226
1227 if (allow_zero || matched_once)
1228 {
1229 ecode += 1 + LINK_SIZE;
1230 break;
1231 }
1232 RRETURN(MATCH_NOMATCH);
1233
1234 /* Non-capturing possessive bracket with unlimited repeat. We come here
1235 from BRAZERO with allow_zero = TRUE. The code is similar to the above,
1236 without the capturing complication. It is written out separately for speed
1237 and cleanliness. */
1238
1239 case OP_BRAPOS:
1240 case OP_SBRAPOS:
1241 allow_zero = FALSE;
1242
1243 POSSESSIVE_NON_CAPTURE:
1244 matched_once = FALSE;
1245 code_offset = (int)(ecode - mb->start_code);
1246 save_capture_last = mb->capture_last;
1247
1248 for (;;)
1249 {
1250 if (op >= OP_SBRA) mb->match_function_type |= MATCH_CBEGROUP;
1251 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, mb,
1252 eptrb, RM48);
1253 if (rrc == MATCH_KETRPOS)
1254 {
1255 offset_top = mb->end_offset_top;
1256 ecode = mb->start_code + code_offset;
1257 matched_once = TRUE;
1258 mstart = mb->start_match_ptr; /* In case \K reset it */
1259 if (eptr == mb->end_match_ptr) /* Matched an empty string */
1260 {
1261 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1262 break;
1263 }
1264 eptr = mb->end_match_ptr;
1265 continue;
1266 }
1267
1268 /* See comment in the code for capturing groups above about handling
1269 THEN. */
1270
1271 if (rrc == MATCH_THEN)
1272 {
1273 next_ecode = ecode + GET(ecode,1);
1274 if (mb->start_match_ptr < next_ecode &&
1275 (*ecode == OP_ALT || *next_ecode == OP_ALT))
1276 rrc = MATCH_NOMATCH;
1277 }
1278
1279 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1280 ecode += GET(ecode, 1);
1281 if (*ecode != OP_ALT) break;
1282 mb->capture_last = save_capture_last;
1283 }
1284
1285 if (matched_once || allow_zero)
1286 {
1287 ecode += 1 + LINK_SIZE;
1288 break;
1289 }
1290 RRETURN(MATCH_NOMATCH);
1291
1292 /* Control never reaches here. */
1293
1294 /* Conditional group: compilation checked that there are no more than two
1295 branches. If the condition is false, skipping the first branch takes us
1296 past the end of the item if there is only one branch, but that's exactly
1297 what we want. */
1298
1299 case OP_COND:
1300 case OP_SCOND:
1301
1302 /* The variable codelink will be added to ecode when the condition is
1303 false, to get to the second branch. Setting it to the offset to the ALT
1304 or KET, then incrementing ecode achieves this effect. We now have ecode
1305 pointing to the condition or callout. */
1306
1307 codelink = GET(ecode, 1); /* Offset to the second branch */
1308 ecode += 1 + LINK_SIZE; /* From this opcode */
1309
1310 /* Because of the way auto-callout works during compile, a callout item is
1311 inserted between OP_COND and an assertion condition. */
1312
1313 if (*ecode == OP_CALLOUT || *ecode == OP_CALLOUT_STR)
1314 {
1315 unsigned int callout_length = (*ecode == OP_CALLOUT)
1316 ? PRIV(OP_lengths)[OP_CALLOUT] : GET(ecode, 1 + 2*LINK_SIZE);
1317
1318 if (mb->callout != NULL)
1319 {
1320 pcre2_callout_block cb;
1321 cb.version = 1;
1322 cb.capture_top = offset_top/2;
1323 cb.capture_last = mb->capture_last & CAPLMASK;
1324 cb.offset_vector = mb->ovector;
1325 cb.mark = mb->nomatch_mark;
1326 cb.subject = mb->start_subject;
1327 cb.subject_length = (PCRE2_SIZE)(mb->end_subject - mb->start_subject);
1328 cb.start_match = (PCRE2_SIZE)(mstart - mb->start_subject);
1329 cb.current_position = (PCRE2_SIZE)(eptr - mb->start_subject);
1330 cb.pattern_position = GET(ecode, 1);
1331 cb.next_item_length = GET(ecode, 1 + LINK_SIZE);
1332
1333 if (*ecode == OP_CALLOUT)
1334 {
1335 cb.callout_number = ecode[1 + 2*LINK_SIZE];
1336 cb.callout_string_offset = 0;
1337 cb.callout_string = NULL;
1338 cb.callout_string_length = 0;
1339 }
1340 else
1341 {
1342 cb.callout_number = 0;
1343 cb.callout_string_offset = GET(ecode, 1 + 3*LINK_SIZE);
1344 cb.callout_string = ecode + (1 + 4*LINK_SIZE) + 1;
1345 cb.callout_string_length =
1346 callout_length - (1 + 4*LINK_SIZE) - 2;
1347 }
1348
1349 if ((rrc = mb->callout(&cb, mb->callout_data)) > 0)
1350 RRETURN(MATCH_NOMATCH);
1351 if (rrc < 0) RRETURN(rrc);
1352 }
1353
1354 /* Advance ecode past the callout, so it now points to the condition. We
1355 must adjust codelink so that the value of ecode+codelink is unchanged. */
1356
1357 ecode += callout_length;
1358 codelink -= callout_length;
1359 }
1360
1361 /* Test the various possible conditions */
1362
1363 condition = FALSE;
1364 switch(condcode = *ecode)
1365 {
1366 case OP_RREF: /* Numbered group recursion test */
1367 if (mb->recursive != NULL) /* Not recursing => FALSE */
1368 {
1369 uint32_t recno = GET2(ecode, 1); /* Recursion group number*/
1370 condition = (recno == RREF_ANY || recno == mb->recursive->group_num);
1371 }
1372 break;
1373
1374 case OP_DNRREF: /* Duplicate named group recursion test */
1375 if (mb->recursive != NULL)
1376 {
1377 int count = GET2(ecode, 1 + IMM2_SIZE);
1378 PCRE2_SPTR slot = mb->name_table + GET2(ecode, 1) * mb->name_entry_size;
1379 while (count-- > 0)
1380 {
1381 uint32_t recno = GET2(slot, 0);
1382 condition = recno == mb->recursive->group_num;
1383 if (condition) break;
1384 slot += mb->name_entry_size;
1385 }
1386 }
1387 break;
1388
1389 case OP_CREF: /* Numbered group used test */
1390 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
1391 condition = offset < offset_top &&
1392 mb->ovector[offset] != PCRE2_UNSET;
1393 break;
1394
1395 case OP_DNCREF: /* Duplicate named group used test */
1396 {
1397 int count = GET2(ecode, 1 + IMM2_SIZE);
1398 PCRE2_SPTR slot = mb->name_table + GET2(ecode, 1) * mb->name_entry_size;
1399 while (count-- > 0)
1400 {
1401 offset = GET2(slot, 0) << 1;
1402 condition = offset < offset_top &&
1403 mb->ovector[offset] != PCRE2_UNSET;
1404 if (condition) break;
1405 slot += mb->name_entry_size;
1406 }
1407 }
1408 break;
1409
1410 case OP_FALSE:
1411 case OP_FAIL: /* The assertion (?!) becomes OP_FAIL */
1412 break;
1413
1414 case OP_TRUE:
1415 condition = TRUE;
1416 break;
1417
1418 /* The condition is an assertion. Call match() to evaluate it - setting
1419 the MATCH_CONDASSERT bit in mb->match_function_type causes it to stop at
1420 the end of an assertion. */
1421
1422 default:
1423 mb->match_function_type |= MATCH_CONDASSERT;
1424 RMATCH(eptr, ecode, offset_top, mb, NULL, RM3);
1425 if (rrc == MATCH_MATCH)
1426 {
1427 if (mb->end_offset_top > offset_top)
1428 offset_top = mb->end_offset_top; /* Captures may have happened */
1429 condition = TRUE;
1430
1431 /* Advance ecode past the assertion to the start of the first branch,
1432 but adjust it so that the general choosing code below works. If the
1433 assertion has a quantifier that allows zero repeats we must skip over
1434 the BRAZERO. This is a lunatic thing to do, but somebody did! */
1435
1436 if (*ecode == OP_BRAZERO) ecode++;
1437 ecode += GET(ecode, 1);
1438 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1439 ecode += 1 + LINK_SIZE - PRIV(OP_lengths)[condcode];
1440 }
1441
1442 /* PCRE doesn't allow the effect of (*THEN) to escape beyond an
1443 assertion; it is therefore treated as NOMATCH. Any other return is an
1444 error. */
1445
1446 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1447 {
1448 RRETURN(rrc); /* Need braces because of following else */
1449 }
1450 break;
1451 }
1452
1453 /* Choose branch according to the condition */
1454
1455 ecode += condition? PRIV(OP_lengths)[condcode] : codelink;
1456
1457 /* We are now at the branch that is to be obeyed. As there is only one, we
1458 can use tail recursion to avoid using another stack frame, except when
1459 there is unlimited repeat of a possibly empty group. In the latter case, a
1460 recursive call to match() is always required, unless the second alternative
1461 doesn't exist, in which case we can just plough on. Note that, for
1462 compatibility with Perl, the | in a conditional group is NOT treated as
1463 creating two alternatives. If a THEN is encountered in the branch, it
1464 propagates out to the enclosing alternative (unless nested in a deeper set
1465 of alternatives, of course). */
1466
1467 if (condition || ecode[-(1+LINK_SIZE)] == OP_ALT)
1468 {
1469 if (op != OP_SCOND)
1470 {
1471 goto TAIL_RECURSE;
1472 }
1473
1474 mb->match_function_type |= MATCH_CBEGROUP;
1475 RMATCH(eptr, ecode, offset_top, mb, eptrb, RM49);
1476 RRETURN(rrc);
1477 }
1478
1479 /* Condition false & no alternative; continue after the group. */
1480
1481 else
1482 {
1483 }
1484 break;
1485
1486
1487 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1488 to close any currently open capturing brackets. */
1489
1490 case OP_CLOSE:
1491 number = GET2(ecode, 1); /* Must be less than 65536 */
1492 offset = number << 1;
1493 mb->capture_last = (mb->capture_last & OVFLMASK) | number;
1494 if (offset >= mb->offset_max) mb->capture_last |= OVFLBIT; else
1495 {
1496 mb->ovector[offset] =
1497 mb->ovector[mb->offset_end - number];
1498 mb->ovector[offset+1] = eptr - mb->start_subject;
1499
1500 /* If this group is at or above the current highwater mark, ensure that
1501 any groups between the current high water mark and this group are marked
1502 unset and then update the high water mark. */
1503
1504 if (offset >= offset_top)
1505 {
1506 register PCRE2_SIZE *iptr = mb->ovector + offset_top;
1507 register PCRE2_SIZE *iend = mb->ovector + offset;
1508 while (iptr < iend) *iptr++ = PCRE2_UNSET;
1509 offset_top = offset + 2;
1510 }
1511 }
1512 ecode += 1 + IMM2_SIZE;
1513 break;
1514
1515
1516 /* End of the pattern, either real or forced. In an assertion ACCEPT,
1517 update the last used pointer. */
1518
1519 case OP_ASSERT_ACCEPT:
1520 if (eptr > mb->last_used_ptr) mb->last_used_ptr = eptr;
1521
1522 case OP_ACCEPT:
1523 case OP_END:
1524
1525 /* If we have matched an empty string, fail if not in an assertion and not
1526 in a recursion if either PCRE2_NOTEMPTY is set, or if PCRE2_NOTEMPTY_ATSTART
1527 is set and we have matched at the start of the subject. In both cases,
1528 backtracking will then try other alternatives, if any. */
1529
1530 if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1531 mb->recursive == NULL &&
1532 ((mb->moptions & PCRE2_NOTEMPTY) != 0 ||
1533 ((mb->moptions & PCRE2_NOTEMPTY_ATSTART) != 0 &&
1534 mstart == mb->start_subject + mb->start_offset)))
1535 RRETURN(MATCH_NOMATCH);
1536
1537 /* Otherwise, we have a match. */
1538
1539 mb->end_match_ptr = eptr; /* Record where we ended */
1540 mb->end_offset_top = offset_top; /* and how many extracts were taken */
1541 mb->start_match_ptr = mstart; /* and the start (\K can modify) */
1542
1543 /* For some reason, the macros don't work properly if an expression is
1544 given as the argument to RRETURN when the heap is in use. */
1545
1546 rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1547 RRETURN(rrc);
1548
1549 /* Assertion brackets. Check the alternative branches in turn - the
1550 matching won't pass the KET for an assertion. If any one branch matches,
1551 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1552 start of each branch to move the current point backwards, so the code at
1553 this level is identical to the lookahead case. When the assertion is part
1554 of a condition, we want to return immediately afterwards. The caller of
1555 this incarnation of the match() function will have set MATCH_CONDASSERT in
1556 mb->match_function_type, and one of these opcodes will be the first opcode
1557 that is processed. We use a local variable that is preserved over calls to
1558 match() to remember this case. */
1559
1560 case OP_ASSERT:
1561 case OP_ASSERTBACK:
1562 save_mark = mb->mark;
1563 if ((mb->match_function_type & MATCH_CONDASSERT) != 0)
1564 {
1565 condassert = TRUE;
1566 mb->match_function_type &= ~MATCH_CONDASSERT;
1567 }
1568 else condassert = FALSE;
1569
1570 /* Loop for each branch */
1571
1572 do
1573 {
1574 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, mb, NULL, RM4);
1575
1576 /* A match means that the assertion is true; break out of the loop
1577 that matches its alternatives. */
1578
1579 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1580 {
1581 mstart = mb->start_match_ptr; /* In case \K reset it */
1582 break;
1583 }
1584
1585 /* If not matched, restore the previous mark setting. */
1586
1587 mb->mark = save_mark;
1588
1589 /* See comment in the code for capturing groups above about handling
1590 THEN. */
1591
1592 if (rrc == MATCH_THEN)
1593 {
1594 next_ecode = ecode + GET(ecode,1);
1595 if (mb->start_match_ptr < next_ecode &&
1596 (*ecode == OP_ALT || *next_ecode == OP_ALT))
1597 rrc = MATCH_NOMATCH;
1598 }
1599
1600 /* Anything other than NOMATCH causes the entire assertion to fail,
1601 passing back the return code. This includes COMMIT, SKIP, PRUNE and an
1602 uncaptured THEN, which means they take their normal effect. This
1603 consistent approach does not always have exactly the same effect as in
1604 Perl. */
1605
1606 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1607 ecode += GET(ecode, 1);
1608 }
1609 while (*ecode == OP_ALT); /* Continue for next alternative */
1610
1611 /* If we have tried all the alternative branches, the assertion has
1612 failed. If not, we broke out after a match. */
1613
1614 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
1615
1616 /* If checking an assertion for a condition, return MATCH_MATCH. */
1617
1618 if (condassert) RRETURN(MATCH_MATCH);
1619
1620 /* Continue from after a successful assertion, updating the offsets high
1621 water mark, since extracts may have been taken during the assertion. */
1622
1623 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1624 ecode += 1 + LINK_SIZE;
1625 offset_top = mb->end_offset_top;
1626 continue;
1627
1628 /* Negative assertion: all branches must fail to match for the assertion to
1629 succeed. */
1630
1631 case OP_ASSERT_NOT:
1632 case OP_ASSERTBACK_NOT:
1633 save_mark = mb->mark;
1634 if ((mb->match_function_type & MATCH_CONDASSERT) != 0)
1635 {
1636 condassert = TRUE;
1637 mb->match_function_type &= ~MATCH_CONDASSERT;
1638 }
1639 else condassert = FALSE;
1640
1641 /* Loop for each alternative branch. */
1642
1643 do
1644 {
1645 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, mb, NULL, RM5);
1646 mb->mark = save_mark; /* Always restore the mark setting */
1647
1648 switch(rrc)
1649 {
1650 case MATCH_MATCH: /* A successful match means */
1651 case MATCH_ACCEPT: /* the assertion has failed. */
1652 RRETURN(MATCH_NOMATCH);
1653
1654 case MATCH_NOMATCH: /* Carry on with next branch */
1655 break;
1656
1657 /* See comment in the code for capturing groups above about handling
1658 THEN. */
1659
1660 case MATCH_THEN:
1661 next_ecode = ecode + GET(ecode,1);
1662 if (mb->start_match_ptr < next_ecode &&
1663 (*ecode == OP_ALT || *next_ecode == OP_ALT))
1664 {
1665 rrc = MATCH_NOMATCH;
1666 break;
1667 }
1668 /* Otherwise fall through. */
1669
1670 /* COMMIT, SKIP, PRUNE, and an uncaptured THEN cause the whole
1671 assertion to fail to match, without considering any more alternatives.
1672 Failing to match means the assertion is true. This is a consistent
1673 approach, but does not always have the same effect as in Perl. */
1674
1675 case MATCH_COMMIT:
1676 case MATCH_SKIP:
1677 case MATCH_SKIP_ARG:
1678 case MATCH_PRUNE:
1679 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1680 goto NEG_ASSERT_TRUE; /* Break out of alternation loop */
1681
1682 /* Anything else is an error */
1683
1684 default:
1685 RRETURN(rrc);
1686 }
1687
1688 /* Continue with next branch */
1689
1690 ecode += GET(ecode,1);
1691 }
1692 while (*ecode == OP_ALT);
1693
1694 /* All branches in the assertion failed to match. */
1695
1696 NEG_ASSERT_TRUE:
1697 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1698 ecode += 1 + LINK_SIZE; /* Continue with current branch */
1699 continue;
1700
1701 /* Move the subject pointer back. This occurs only at the start of
1702 each branch of a lookbehind assertion. If we are too close to the start to
1703 move back, this match function fails. When working with UTF-8 we move
1704 back a number of characters, not bytes. */
1705
1706 case OP_REVERSE:
1707 i = GET(ecode, 1);
1708 #ifdef SUPPORT_UNICODE
1709 if (utf)
1710 {
1711 while (i-- > 0)
1712 {
1713 if (eptr <= mb->start_subject) RRETURN(MATCH_NOMATCH);
1714 eptr--;
1715 BACKCHAR(eptr);
1716 }
1717 }
1718 else
1719 #endif
1720
1721 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1722
1723 {
1724 if (i > eptr - mb->start_subject) RRETURN(MATCH_NOMATCH);
1725 eptr -= i;
1726 }
1727
1728 /* Save the earliest consulted character, then skip to next op code */
1729
1730 if (eptr < mb->start_used_ptr) mb->start_used_ptr = eptr;
1731 ecode += 1 + LINK_SIZE;
1732 break;
1733
1734 /* The callout item calls an external function, if one is provided, passing
1735 details of the match so far. This is mainly for debugging, though the
1736 function is able to force a failure. */
1737
1738 case OP_CALLOUT:
1739 case OP_CALLOUT_STR:
1740 {
1741 unsigned int callout_length = (*ecode == OP_CALLOUT)
1742 ? PRIV(OP_lengths)[OP_CALLOUT] : GET(ecode, 1 + 2*LINK_SIZE);
1743
1744 if (mb->callout != NULL)
1745 {
1746 pcre2_callout_block cb;
1747 cb.version = 1;
1748 cb.callout_number = ecode[LINK_SIZE + 1];
1749 cb.capture_top = offset_top/2;
1750 cb.capture_last = mb->capture_last & CAPLMASK;
1751 cb.offset_vector = mb->ovector;
1752 cb.mark = mb->nomatch_mark;
1753 cb.subject = mb->start_subject;
1754 cb.subject_length = (PCRE2_SIZE)(mb->end_subject - mb->start_subject);
1755 cb.start_match = (PCRE2_SIZE)(mstart - mb->start_subject);
1756 cb.current_position = (PCRE2_SIZE)(eptr - mb->start_subject);
1757 cb.pattern_position = GET(ecode, 1);
1758 cb.next_item_length = GET(ecode, 1 + LINK_SIZE);
1759
1760 if (*ecode == OP_CALLOUT)
1761 {
1762 cb.callout_number = ecode[1 + 2*LINK_SIZE];
1763 cb.callout_string_offset = 0;
1764 cb.callout_string = NULL;
1765 cb.callout_string_length = 0;
1766 }
1767 else
1768 {
1769 cb.callout_number = 0;
1770 cb.callout_string_offset = GET(ecode, 1 + 3*LINK_SIZE);
1771 cb.callout_string = ecode + (1 + 4*LINK_SIZE) + 1;
1772 cb.callout_string_length =
1773 callout_length - (1 + 4*LINK_SIZE) - 2;
1774 }
1775
1776 if ((rrc = mb->callout(&cb, mb->callout_data)) > 0)
1777 RRETURN(MATCH_NOMATCH);
1778 if (rrc < 0) RRETURN(rrc);
1779 }
1780 ecode += callout_length;
1781 }
1782 break;
1783
1784 /* Recursion either matches the current regex, or some subexpression. The
1785 offset data is the offset to the starting bracket from the start of the
1786 whole pattern. (This is so that it works from duplicated subpatterns.)
1787
1788 The state of the capturing groups is preserved over recursion, and
1789 re-instated afterwards. We don't know how many are started and not yet
1790 finished (offset_top records the completed total) so we just have to save
1791 all the potential data. There may be up to 65535 such values, which is too
1792 large to put on the stack, but using malloc for small numbers seems
1793 expensive. As a compromise, the stack is used when there are no more than
1794 OP_RECURSE_STACK_SAVE_MAX values to store; otherwise malloc is used.
1795
1796 There are also other values that have to be saved. We use a chained
1797 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1798 for the original version of this logic. It has, however, been hacked around
1799 a lot, so he is not to blame for the current way it works. */
1800
1801 case OP_RECURSE:
1802 {
1803 ovecsave_frame *fr;
1804 recursion_info *ri;
1805 uint32_t recno;
1806
1807 callpat = mb->start_code + GET(ecode, 1);
1808 recno = (callpat == mb->start_code)? 0 : GET2(callpat, 1 + LINK_SIZE);
1809
1810 /* Check for repeating a pattern recursion without advancing the subject
1811 pointer. This should catch convoluted mutual recursions. (Some simple
1812 cases are caught at compile time.) */
1813
1814 for (ri = mb->recursive; ri != NULL; ri = ri->prevrec)
1815 if (recno == ri->group_num && eptr == ri->subject_position)
1816 RRETURN(PCRE2_ERROR_RECURSELOOP);
1817
1818 /* Add to "recursing stack" */
1819
1820 new_recursive.group_num = recno;
1821 new_recursive.saved_capture_last = mb->capture_last;
1822 new_recursive.subject_position = eptr;
1823 new_recursive.prevrec = mb->recursive;
1824 mb->recursive = &new_recursive;
1825
1826 /* Where to continue from afterwards */
1827
1828 ecode += 1 + LINK_SIZE;
1829
1830 /* When we are using the system stack for match() recursion we can call a
1831 function that uses the system stack for preserving the ovector while
1832 processing the pattern recursion, but only if the ovector is small
1833 enough. */
1834
1835 #ifndef HEAP_MATCH_RECURSE
1836 if (mb->offset_end <= OP_RECURSE_STACK_SAVE_MAX)
1837 {
1838 rrc = op_recurse_ovecsave(eptr, callpat, mstart, offset_top, mb,
1839 eptrb, rdepth);
1840 mb->recursive = new_recursive.prevrec;
1841 if (rrc != MATCH_MATCH && rrc != MATCH_ACCEPT) RRETURN(rrc);
1842
1843 /* Set where we got to in the subject, and reset the start, in case
1844 it was changed by \K. This *is* propagated back out of a recursion,
1845 for Perl compatibility. */
1846
1847 eptr = mb->end_match_ptr;
1848 mstart = mb->start_match_ptr;
1849 break; /* End of processing OP_RECURSE */
1850 }
1851 #endif
1852 /* If the ovector is too big, or if we are using the heap for match()
1853 recursion, we have to use the heap for saving the ovector. Used ovecsave
1854 frames are kept on a chain and re-used. This makes a small improvement in
1855 execution time on Linux. */
1856
1857 if (mb->ovecsave_chain != NULL)
1858 {
1859 new_recursive.ovec_save = mb->ovecsave_chain->saved_ovec;
1860 mb->ovecsave_chain = mb->ovecsave_chain->next;
1861 }
1862 else
1863 {
1864 fr = (ovecsave_frame *)(mb->memctl.malloc(sizeof(ovecsave_frame *) +
1865 mb->offset_end * sizeof(PCRE2_SIZE), mb->memctl.memory_data));
1866 if (fr == NULL) RRETURN(PCRE2_ERROR_NOMEMORY);
1867 new_recursive.ovec_save = fr->saved_ovec;
1868 }
1869
1870 memcpy(new_recursive.ovec_save, mb->ovector,
1871 mb->offset_end * sizeof(PCRE2_SIZE));
1872
1873 /* Do the recursion. After processing each alternative, restore the
1874 ovector data and the last captured value. This code has the same overall
1875 logic as the code in the op_recurse_ovecsave() function, but is adapted
1876 to use RMATCH/RRETURN and to release the heap block containing the saved
1877 ovector. */
1878
1879 cbegroup = (*callpat >= OP_SBRA);
1880 do
1881 {
1882 if (cbegroup) mb->match_function_type |= MATCH_CBEGROUP;
1883 RMATCH(eptr, callpat + PRIV(OP_lengths)[*callpat], offset_top,
1884 mb, eptrb, RM6);
1885 memcpy(mb->ovector, new_recursive.ovec_save,
1886 mb->offset_end * sizeof(PCRE2_SIZE));
1887 mb->capture_last = new_recursive.saved_capture_last;
1888 mb->recursive = new_recursive.prevrec;
1889
1890 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1891 {
1892 fr = (ovecsave_frame *)
1893 ((uint8_t *)new_recursive.ovec_save - sizeof(ovecsave_frame *));
1894 fr->next = mb->ovecsave_chain;
1895 mb->ovecsave_chain = fr;
1896
1897 /* Set where we got to in the subject, and reset the start, in case
1898 it was changed by \K. This *is* propagated back out of a recursion,
1899 for Perl compatibility. */
1900
1901 eptr = mb->end_match_ptr;
1902 mstart = mb->start_match_ptr;
1903 goto RECURSION_MATCHED; /* Exit loop; end processing */
1904 }
1905
1906 /* PCRE does not allow THEN, SKIP, PRUNE or COMMIT to escape beyond a
1907 recursion; they cause a NOMATCH for the entire recursion. These codes
1908 are defined in a range that can be tested for. */
1909
1910 if (rrc >= MATCH_BACKTRACK_MIN && rrc <= MATCH_BACKTRACK_MAX)
1911 {
1912 rrc = MATCH_NOMATCH;
1913 goto RECURSION_RETURN;
1914 }
1915
1916 /* Any return code other than NOMATCH is an error. */
1917
1918 if (rrc != MATCH_NOMATCH) goto RECURSION_RETURN;
1919 mb->recursive = &new_recursive;
1920 callpat += GET(callpat, 1);
1921 }
1922 while (*callpat == OP_ALT);
1923
1924 RECURSION_RETURN:
1925 mb->recursive = new_recursive.prevrec;
1926 fr = (ovecsave_frame *)
1927 ((uint8_t *)new_recursive.ovec_save - sizeof(ovecsave_frame *));
1928 fr->next = mb->ovecsave_chain;
1929 mb->ovecsave_chain = fr;
1930 RRETURN(rrc);
1931 }
1932
1933 RECURSION_MATCHED:
1934 break;
1935
1936 /* An alternation is the end of a branch; scan along to find the end of the
1937 bracketed group and go to there. */
1938
1939 case OP_ALT:
1940 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1941 break;
1942
1943 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1944 indicating that it may occur zero times. It may repeat infinitely, or not
1945 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1946 with fixed upper repeat limits are compiled as a number of copies, with the
1947 optional ones preceded by BRAZERO or BRAMINZERO. */
1948
1949 case OP_BRAZERO:
1950 next_ecode = ecode + 1;
1951 RMATCH(eptr, next_ecode, offset_top, mb, eptrb, RM10);
1952 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1953 do next_ecode += GET(next_ecode, 1); while (*next_ecode == OP_ALT);
1954 ecode = next_ecode + 1 + LINK_SIZE;
1955 break;
1956
1957 case OP_BRAMINZERO:
1958 next_ecode = ecode + 1;
1959 do next_ecode += GET(next_ecode, 1); while (*next_ecode == OP_ALT);
1960 RMATCH(eptr, next_ecode + 1+LINK_SIZE, offset_top, mb, eptrb, RM11);
1961 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1962 ecode++;
1963 break;
1964
1965 case OP_SKIPZERO:
1966 next_ecode = ecode+1;
1967 do next_ecode += GET(next_ecode,1); while (*next_ecode == OP_ALT);
1968 ecode = next_ecode + 1 + LINK_SIZE;
1969 break;
1970
1971 /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1972 here; just jump to the group, with allow_zero set TRUE. */
1973
1974 case OP_BRAPOSZERO:
1975 op = *(++ecode);
1976 allow_zero = TRUE;
1977 if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1978 goto POSSESSIVE_NON_CAPTURE;
1979
1980 /* End of a group, repeated or non-repeating. */
1981
1982 case OP_KET:
1983 case OP_KETRMIN:
1984 case OP_KETRMAX:
1985 case OP_KETRPOS:
1986 prev = ecode - GET(ecode, 1);
1987
1988 /* If this was a group that remembered the subject start, in order to break
1989 infinite repeats of empty string matches, retrieve the subject start from
1990 the chain. Otherwise, set it NULL. */
1991
1992 if (*prev >= OP_SBRA || *prev == OP_ONCE)
1993 {
1994 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1995 eptrb = eptrb->epb_prev; /* Backup to previous group */
1996 }
1997 else saved_eptr = NULL;
1998
1999 /* If we are at the end of an assertion group or a non-capturing atomic
2000 group, stop matching and return MATCH_MATCH, but record the current high
2001 water mark for use by positive assertions. We also need to record the match
2002 start in case it was changed by \K. */
2003
2004 if ((*prev >= OP_ASSERT && *prev <= OP_ASSERTBACK_NOT) ||
2005 *prev == OP_ONCE_NC)
2006 {
2007 mb->end_match_ptr = eptr; /* For ONCE_NC */
2008 mb->end_offset_top = offset_top;
2009 mb->start_match_ptr = mstart;
2010 if (eptr > mb->last_used_ptr) mb->last_used_ptr = eptr;
2011 RRETURN(MATCH_MATCH); /* Sets mb->mark */
2012 }
2013
2014 /* For capturing groups we have to check the group number back at the start
2015 and if necessary complete handling an extraction by setting the offsets and
2016 bumping the high water mark. Whole-pattern recursion is coded as a recurse
2017 into group 0, so it won't be picked up here. Instead, we catch it when the
2018 OP_END is reached. Other recursion is handled here. We just have to record
2019 the current subject position and start match pointer and give a MATCH
2020 return. */
2021
2022 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
2023 *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
2024 {
2025 number = GET2(prev, 1+LINK_SIZE);
2026 offset = number << 1;
2027
2028 /* Handle a recursively called group. */
2029
2030 if (mb->recursive != NULL && mb->recursive->group_num == number)
2031 {
2032 mb->end_match_ptr = eptr;
2033 mb->start_match_ptr = mstart;
2034 if (eptr > mb->last_used_ptr) mb->last_used_ptr = eptr;
2035 RRETURN(MATCH_MATCH);
2036 }
2037
2038 /* Deal with capturing */
2039
2040 mb->capture_last = (mb->capture_last & OVFLMASK) | number;
2041 if (offset >= mb->offset_max) mb->capture_last |= OVFLBIT; else
2042 {
2043 /* If offset is greater than offset_top, it means that we are
2044 "skipping" a capturing group, and that group's offsets must be marked
2045 unset. In earlier versions of PCRE, all the offsets were unset at the
2046 start of matching, but this doesn't work because atomic groups and
2047 assertions can cause a value to be set that should later be unset.
2048 Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
2049 part of the atomic group, but this is not on the final matching path,
2050 so must be unset when 2 is set. (If there is no group 2, there is no
2051 problem, because offset_top will then be 2, indicating no capture.) */
2052
2053 if (offset > offset_top)
2054 {
2055 register PCRE2_SIZE *iptr = mb->ovector + offset_top;
2056 register PCRE2_SIZE *iend = mb->ovector + offset;
2057 while (iptr < iend) *iptr++ = PCRE2_UNSET;
2058 }
2059
2060 /* Now make the extraction */
2061
2062 mb->ovector[offset] = mb->ovector[mb->offset_end - number];
2063 mb->ovector[offset+1] = eptr - mb->start_subject;
2064 if (offset_top <= offset) offset_top = offset + 2;
2065 }
2066 }
2067
2068 /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
2069 and return the MATCH_KETRPOS. This makes it possible to do the repeats one
2070 at a time from the outer level, thus saving stack. This must precede the
2071 empty string test - in this case that test is done at the outer level. */
2072
2073 if (*ecode == OP_KETRPOS)
2074 {
2075 mb->start_match_ptr = mstart; /* In case \K reset it */
2076 mb->end_match_ptr = eptr;
2077 mb->end_offset_top = offset_top;
2078 if (eptr > mb->last_used_ptr) mb->last_used_ptr = eptr;
2079 RRETURN(MATCH_KETRPOS);
2080 }
2081
2082 /* For an ordinary non-repeating ket, just continue at this level. This
2083 also happens for a repeating ket if no characters were matched in the
2084 group. This is the forcible breaking of infinite loops as implemented in
2085 Perl 5.005. For a non-repeating atomic group that includes captures,
2086 establish a backup point by processing the rest of the pattern at a lower
2087 level. If this results in a NOMATCH return, pass MATCH_ONCE back to the
2088 original OP_ONCE level, thereby bypassing intermediate backup points, but
2089 resetting any captures that happened along the way. */
2090
2091 if (*ecode == OP_KET || eptr == saved_eptr)
2092 {
2093 if (*prev == OP_ONCE)
2094 {
2095 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, mb, eptrb, RM12);
2096 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2097 mb->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
2098 RRETURN(MATCH_ONCE);
2099 }
2100 ecode += 1 + LINK_SIZE; /* Carry on at this level */
2101 break;
2102 }
2103
2104 /* The normal repeating kets try the rest of the pattern or restart from
2105 the preceding bracket, in the appropriate order. In the second case, we can
2106 use tail recursion to avoid using another stack frame, unless we have an
2107 atomic group or an unlimited repeat of a group that can match an empty
2108 string. */
2109
2110 if (*ecode == OP_KETRMIN)
2111 {
2112 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, mb, eptrb, RM7);
2113 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2114 if (*prev == OP_ONCE)
2115 {
2116 RMATCH(eptr, prev, offset_top, mb, eptrb, RM8);
2117 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2118 mb->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
2119 RRETURN(MATCH_ONCE);
2120 }
2121 if (*prev >= OP_SBRA) /* Could match an empty string */
2122 {
2123 RMATCH(eptr, prev, offset_top, mb, eptrb, RM50);
2124 RRETURN(rrc);
2125 }
2126 ecode = prev;
2127 goto TAIL_RECURSE;
2128 }
2129 else /* OP_KETRMAX */
2130 {
2131 RMATCH(eptr, prev, offset_top, mb, eptrb, RM13);
2132 if (rrc == MATCH_ONCE && mb->once_target == prev) rrc = MATCH_NOMATCH;
2133 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2134 if (*prev == OP_ONCE)
2135 {
2136 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, mb, eptrb, RM9);
2137 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2138 mb->once_target = prev;
2139 RRETURN(MATCH_ONCE);
2140 }
2141 ecode += 1 + LINK_SIZE;
2142 goto TAIL_RECURSE;
2143 }
2144 /* Control never gets here */
2145
2146 /* Not multiline mode: start of subject assertion, unless notbol. */
2147
2148 case OP_CIRC:
2149 if ((mb->moptions & PCRE2_NOTBOL) != 0 && eptr == mb->start_subject)
2150 RRETURN(MATCH_NOMATCH);
2151
2152 /* Start of subject assertion */
2153
2154 case OP_SOD:
2155 if (eptr != mb->start_subject) RRETURN(MATCH_NOMATCH);
2156 ecode++;
2157 break;
2158
2159 /* Multiline mode: start of subject unless notbol, or after any newline
2160 except for one at the very end, unless PCRE2_ALT_CIRCUMFLEX is set. */
2161
2162 case OP_CIRCM:
2163 if ((mb->moptions & PCRE2_NOTBOL) != 0 && eptr == mb->start_subject)
2164 RRETURN(MATCH_NOMATCH);
2165 if (eptr != mb->start_subject &&
2166 ((eptr == mb->end_subject &&
2167 (mb->poptions & PCRE2_ALT_CIRCUMFLEX) == 0) ||
2168 !WAS_NEWLINE(eptr)))
2169 RRETURN(MATCH_NOMATCH);
2170 ecode++;
2171 break;
2172
2173 /* Start of match assertion */
2174
2175 case OP_SOM:
2176 if (eptr != mb->start_subject + mb->start_offset) RRETURN(MATCH_NOMATCH);
2177 ecode++;
2178 break;
2179
2180 /* Reset the start of match point */
2181
2182 case OP_SET_SOM:
2183 mstart = eptr;
2184 ecode++;
2185 break;
2186
2187 /* Multiline mode: assert before any newline, or before end of subject
2188 unless noteol is set. */
2189
2190 case OP_DOLLM:
2191 if (eptr < mb->end_subject)
2192 {
2193 if (!IS_NEWLINE(eptr))
2194 {
2195 if (mb->partial != 0 &&
2196 eptr + 1 >= mb->end_subject &&
2197 NLBLOCK->nltype == NLTYPE_FIXED &&
2198 NLBLOCK->nllen == 2 &&
2199 UCHAR21TEST(eptr) == NLBLOCK->nl[0])
2200 {
2201 mb->hitend = TRUE;
2202 if (mb->partial > 1) RRETURN(PCRE2_ERROR_PARTIAL);
2203 }
2204 RRETURN(MATCH_NOMATCH);
2205 }
2206 }
2207 else
2208 {
2209 if ((mb->moptions & PCRE2_NOTEOL) != 0) RRETURN(MATCH_NOMATCH);
2210 SCHECK_PARTIAL();
2211 }
2212 ecode++;
2213 break;
2214
2215 /* Not multiline mode: assert before a terminating newline or before end of
2216 subject unless noteol is set. */
2217
2218 case OP_DOLL:
2219 if ((mb->moptions & PCRE2_NOTEOL) != 0) RRETURN(MATCH_NOMATCH);
2220 if ((mb->poptions & PCRE2_DOLLAR_ENDONLY) == 0) goto ASSERT_NL_OR_EOS;
2221
2222 /* ... else fall through for endonly */
2223
2224 /* End of subject assertion (\z) */
2225
2226 case OP_EOD:
2227 if (eptr < mb->end_subject) RRETURN(MATCH_NOMATCH);
2228 SCHECK_PARTIAL();
2229 ecode++;
2230 break;
2231
2232 /* End of subject or ending \n assertion (\Z) */
2233
2234 case OP_EODN:
2235 ASSERT_NL_OR_EOS:
2236 if (eptr < mb->end_subject &&
2237 (!IS_NEWLINE(eptr) || eptr != mb->end_subject - mb->nllen))
2238 {
2239 if (mb->partial != 0 &&
2240 eptr + 1 >= mb->end_subject &&
2241 NLBLOCK->nltype == NLTYPE_FIXED &&
2242 NLBLOCK->nllen == 2 &&
2243 UCHAR21TEST(eptr) == NLBLOCK->nl[0])
2244 {
2245 mb->hitend = TRUE;
2246 if (mb->partial > 1) RRETURN(PCRE2_ERROR_PARTIAL);
2247 }
2248 RRETURN(MATCH_NOMATCH);
2249 }
2250
2251 /* Either at end of string or \n before end. */
2252
2253 SCHECK_PARTIAL();
2254 ecode++;
2255 break;
2256
2257 /* Word boundary assertions */
2258
2259 case OP_NOT_WORD_BOUNDARY:
2260 case OP_WORD_BOUNDARY:
2261 {
2262
2263 /* Find out if the previous and current characters are "word" characters.
2264 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
2265 be "non-word" characters. Remember the earliest consulted character for
2266 partial matching. */
2267
2268 #ifdef SUPPORT_UNICODE
2269 if (utf)
2270 {
2271 /* Get status of previous character */
2272
2273 if (eptr == mb->start_subject) prev_is_word = FALSE; else
2274 {
2275 PCRE2_SPTR lastptr = eptr - 1;
2276 BACKCHAR(lastptr);
2277 if (lastptr < mb->start_used_ptr) mb->start_used_ptr = lastptr;
2278 GETCHAR(c, lastptr);
2279 if ((mb->poptions & PCRE2_UCP) != 0)
2280 {
2281 if (c == '_') prev_is_word = TRUE; else
2282 {
2283 int cat = UCD_CATEGORY(c);
2284 prev_is_word = (cat == ucp_L || cat == ucp_N);
2285 }
2286 }
2287 else
2288 prev_is_word = c < 256 && (mb->ctypes[c] & ctype_word) != 0;
2289 }
2290
2291 /* Get status of next character */
2292
2293 if (eptr >= mb->end_subject)
2294 {
2295 SCHECK_PARTIAL();
2296 cur_is_word = FALSE;
2297 }
2298 else
2299 {
2300 PCRE2_SPTR nextptr = eptr + 1;
2301 FORWARDCHARTEST(nextptr, mb->end_subject);
2302 if (nextptr > mb->last_used_ptr) mb->last_used_ptr = nextptr;
2303 GETCHAR(c, eptr);
2304 if ((mb->poptions & PCRE2_UCP) != 0)
2305 {
2306 if (c == '_') cur_is_word = TRUE; else
2307 {
2308 int cat = UCD_CATEGORY(c);
2309 cur_is_word = (cat == ucp_L || cat == ucp_N);
2310 }
2311 }
2312 else
2313 cur_is_word = c < 256 && (mb->ctypes[c] & ctype_word) != 0;
2314 }
2315 }
2316 else
2317 #endif /* SUPPORT_UNICODE */
2318
2319 /* Not in UTF-8 mode, but we may still have PCRE2_UCP set, and for
2320 consistency with the behaviour of \w we do use it in this case. */
2321
2322 {
2323 /* Get status of previous character */
2324
2325 if (eptr == mb->start_subject) prev_is_word = FALSE; else
2326 {
2327 if (eptr <= mb->start_used_ptr) mb->start_used_ptr = eptr - 1;
2328 #ifdef SUPPORT_UNICODE
2329 if ((mb->poptions & PCRE2_UCP) != 0)
2330 {
2331 c = eptr[-1];
2332 if (c == '_') prev_is_word = TRUE; else
2333 {
2334 int cat = UCD_CATEGORY(c);
2335 prev_is_word = (cat == ucp_L || cat == ucp_N);
2336 }
2337 }
2338 else
2339 #endif
2340 prev_is_word = MAX_255(eptr[-1])
2341 && ((mb->ctypes[eptr[-1]] & ctype_word) != 0);
2342 }
2343
2344 /* Get status of next character */
2345
2346 if (eptr >= mb->end_subject)
2347 {
2348 SCHECK_PARTIAL();
2349 cur_is_word = FALSE;
2350 }
2351 else
2352 {
2353 if (eptr >= mb->last_used_ptr) mb->last_used_ptr = eptr + 1;
2354 #ifdef SUPPORT_UNICODE
2355 if ((mb->poptions & PCRE2_UCP) != 0)
2356 {
2357 c = *eptr;
2358 if (c == '_') cur_is_word = TRUE; else
2359 {
2360 int cat = UCD_CATEGORY(c);
2361 cur_is_word = (cat == ucp_L || cat == ucp_N);
2362 }
2363 }
2364 else
2365 #endif
2366 cur_is_word = MAX_255(*eptr)
2367 && ((mb->ctypes[*eptr] & ctype_word) != 0);
2368 }
2369 }
2370
2371 /* Now see if the situation is what we want */
2372
2373 if ((*ecode++ == OP_WORD_BOUNDARY)?
2374 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2375 RRETURN(MATCH_NOMATCH);
2376 }
2377 break;
2378
2379 /* Match any single character type except newline; have to take care with
2380 CRLF newlines and partial matching. */
2381
2382 case OP_ANY:
2383 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
2384 if (mb->partial != 0 &&
2385 eptr + 1 >= mb->end_subject &&
2386 NLBLOCK->nltype == NLTYPE_FIXED &&
2387 NLBLOCK->nllen == 2 &&
2388 UCHAR21TEST(eptr) == NLBLOCK->nl[0])
2389 {
2390 mb->hitend = TRUE;
2391 if (mb->partial > 1) RRETURN(PCRE2_ERROR_PARTIAL);
2392 }
2393
2394 /* Fall through */
2395
2396 /* Match any single character whatsoever. */
2397
2398 case OP_ALLANY:
2399 if (eptr >= mb->end_subject) /* DO NOT merge the eptr++ here; it must */
2400 { /* not be updated before SCHECK_PARTIAL. */
2401 SCHECK_PARTIAL();
2402 RRETURN(MATCH_NOMATCH);
2403 }
2404 eptr++;
2405 #ifdef SUPPORT_UNICODE
2406 if (utf) ACROSSCHAR(eptr < mb->end_subject, *eptr, eptr++);
2407 #endif
2408 ecode++;
2409 break;
2410
2411 /* Match a single code unit, even in UTF-8 mode. This opcode really does
2412 match any code unit, even newline. (It really should be called ANYCODEUNIT,
2413 of course - the byte name is from pre-16 bit days.) */
2414
2415 case OP_ANYBYTE:
2416 if (eptr >= mb->end_subject) /* DO NOT merge the eptr++ here; it must */
2417 { /* not be updated before SCHECK_PARTIAL. */
2418 SCHECK_PARTIAL();
2419 RRETURN(MATCH_NOMATCH);
2420 }
2421 eptr++;
2422 ecode++;
2423 break;
2424
2425 case OP_NOT_DIGIT:
2426 if (eptr >= mb->end_subject)
2427 {
2428 SCHECK_PARTIAL();
2429 RRETURN(MATCH_NOMATCH);
2430 }
2431 GETCHARINCTEST(c, eptr);
2432 if (
2433 #ifdef SUPPORT_WIDE_CHARS
2434 c < 256 &&
2435 #endif
2436 (mb->ctypes[c] & ctype_digit) != 0
2437 )
2438 RRETURN(MATCH_NOMATCH);
2439 ecode++;
2440 break;
2441
2442 case OP_DIGIT:
2443 if (eptr >= mb->end_subject)
2444 {
2445 SCHECK_PARTIAL();
2446 RRETURN(MATCH_NOMATCH);
2447 }
2448 GETCHARINCTEST(c, eptr);
2449 if (
2450 #ifdef SUPPORT_WIDE_CHARS
2451 c > 255 ||
2452 #endif
2453 (mb->ctypes[c] & ctype_digit) == 0
2454 )
2455 RRETURN(MATCH_NOMATCH);
2456 ecode++;
2457 break;
2458
2459 case OP_NOT_WHITESPACE:
2460 if (eptr >= mb->end_subject)
2461 {
2462 SCHECK_PARTIAL();
2463 RRETURN(MATCH_NOMATCH);
2464 }
2465 GETCHARINCTEST(c, eptr);
2466 if (
2467 #ifdef SUPPORT_WIDE_CHARS
2468 c < 256 &&
2469 #endif
2470 (mb->ctypes[c] & ctype_space) != 0
2471 )
2472 RRETURN(MATCH_NOMATCH);
2473 ecode++;
2474 break;
2475
2476 case OP_WHITESPACE:
2477 if (eptr >= mb->end_subject)
2478 {
2479 SCHECK_PARTIAL();
2480 RRETURN(MATCH_NOMATCH);
2481 }
2482 GETCHARINCTEST(c, eptr);
2483 if (
2484 #ifdef SUPPORT_WIDE_CHARS
2485 c > 255 ||
2486 #endif
2487 (mb->ctypes[c] & ctype_space) == 0
2488 )
2489 RRETURN(MATCH_NOMATCH);
2490 ecode++;
2491 break;
2492
2493 case OP_NOT_WORDCHAR:
2494 if (eptr >= mb->end_subject)
2495 {
2496 SCHECK_PARTIAL();
2497 RRETURN(MATCH_NOMATCH);
2498 }
2499 GETCHARINCTEST(c, eptr);
2500 if (
2501 #ifdef SUPPORT_WIDE_CHARS
2502 c < 256 &&
2503 #endif
2504 (mb->ctypes[c] & ctype_word) != 0
2505 )
2506 RRETURN(MATCH_NOMATCH);
2507 ecode++;
2508 break;
2509
2510 case OP_WORDCHAR:
2511 if (eptr >= mb->end_subject)
2512 {
2513 SCHECK_PARTIAL();
2514 RRETURN(MATCH_NOMATCH);
2515 }
2516 GETCHARINCTEST(c, eptr);
2517 if (
2518 #ifdef SUPPORT_WIDE_CHARS
2519 c > 255 ||
2520 #endif
2521 (mb->ctypes[c] & ctype_word) == 0
2522 )
2523 RRETURN(MATCH_NOMATCH);
2524 ecode++;
2525 break;
2526
2527 case OP_ANYNL:
2528 if (eptr >= mb->end_subject)
2529 {
2530 SCHECK_PARTIAL();
2531 RRETURN(MATCH_NOMATCH);
2532 }
2533 GETCHARINCTEST(c, eptr);
2534 switch(c)
2535 {
2536 default: RRETURN(MATCH_NOMATCH);
2537
2538 case CHAR_CR:
2539 if (eptr >= mb->end_subject)
2540 {
2541 SCHECK_PARTIAL();
2542 }
2543 else if (UCHAR21TEST(eptr) == CHAR_LF) eptr++;
2544 break;
2545
2546 case CHAR_LF:
2547 break;
2548
2549 case CHAR_VT:
2550 case CHAR_FF:
2551 case CHAR_NEL:
2552 #ifndef EBCDIC
2553 case 0x2028:
2554 case 0x2029:
2555 #endif /* Not EBCDIC */
2556 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) RRETURN(MATCH_NOMATCH);
2557 break;
2558 }
2559 ecode++;
2560 break;
2561
2562 case OP_NOT_HSPACE:
2563 if (eptr >= mb->end_subject)
2564 {
2565 SCHECK_PARTIAL();
2566 RRETURN(MATCH_NOMATCH);
2567 }
2568 GETCHARINCTEST(c, eptr);
2569 switch(c)
2570 {
2571 HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */
2572 default: break;
2573 }
2574 ecode++;
2575 break;
2576
2577 case OP_HSPACE:
2578 if (eptr >= mb->end_subject)
2579 {
2580 SCHECK_PARTIAL();
2581 RRETURN(MATCH_NOMATCH);
2582 }
2583 GETCHARINCTEST(c, eptr);
2584 switch(c)
2585 {
2586 HSPACE_CASES: break; /* Byte and multibyte cases */
2587 default: RRETURN(MATCH_NOMATCH);
2588 }
2589 ecode++;
2590 break;
2591
2592 case OP_NOT_VSPACE:
2593 if (eptr >= mb->end_subject)
2594 {
2595 SCHECK_PARTIAL();
2596 RRETURN(MATCH_NOMATCH);
2597 }
2598 GETCHARINCTEST(c, eptr);
2599 switch(c)
2600 {
2601 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
2602 default: break;
2603 }
2604 ecode++;
2605 break;
2606
2607 case OP_VSPACE:
2608 if (eptr >= mb->end_subject)
2609 {
2610 SCHECK_PARTIAL();
2611 RRETURN(MATCH_NOMATCH);
2612 }
2613 GETCHARINCTEST(c, eptr);
2614 switch(c)
2615 {
2616 VSPACE_CASES: break;
2617 default: RRETURN(MATCH_NOMATCH);
2618 }
2619 ecode++;
2620 break;
2621
2622 #ifdef SUPPORT_UNICODE
2623 /* Check the next character by Unicode property. We will get here only
2624 if the support is in the binary; otherwise a compile-time error occurs. */
2625
2626 case OP_PROP:
2627 case OP_NOTPROP:
2628 if (eptr >= mb->end_subject)
2629 {
2630 SCHECK_PARTIAL();
2631 RRETURN(MATCH_NOMATCH);
2632 }
2633 GETCHARINCTEST(c, eptr);
2634 {
2635 const uint32_t *cp;
2636 const ucd_record *prop = GET_UCD(c);
2637
2638 switch(ecode[1])
2639 {
2640 case PT_ANY:
2641 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2642 break;
2643
2644 case PT_LAMP:
2645 if ((prop->chartype == ucp_Lu ||
2646 prop->chartype == ucp_Ll ||
2647 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2648 RRETURN(MATCH_NOMATCH);
2649 break;
2650
2651 case PT_GC:
2652 if ((ecode[2] != PRIV(ucp_gentype)[prop->chartype]) == (op == OP_PROP))
2653 RRETURN(MATCH_NOMATCH);
2654 break;
2655
2656 case PT_PC:
2657 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2658 RRETURN(MATCH_NOMATCH);
2659 break;
2660
2661 case PT_SC:
2662 if ((ecode[2] != prop->script) == (op == OP_PROP))
2663 RRETURN(MATCH_NOMATCH);
2664 break;
2665
2666 /* These are specials */
2667
2668 case PT_ALNUM:
2669 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2670 PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2671 RRETURN(MATCH_NOMATCH);
2672 break;
2673
2674 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
2675 which means that Perl space and POSIX space are now identical. PCRE
2676 was changed at release 8.34. */
2677
2678 case PT_SPACE: /* Perl space */
2679 case PT_PXSPACE: /* POSIX space */
2680 switch(c)
2681 {
2682 HSPACE_CASES:
2683 VSPACE_CASES:
2684 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2685 break;
2686
2687 default:
2688 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z) ==
2689 (op == OP_NOTPROP)) RRETURN(MATCH_NOMATCH);
2690 break;
2691 }
2692 break;
2693
2694 case PT_WORD:
2695 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2696 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2697 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2698 RRETURN(MATCH_NOMATCH);
2699 break;
2700
2701 case PT_CLIST:
2702 cp = PRIV(ucd_caseless_sets) + ecode[2];
2703 for (;;)
2704 {
2705 if (c < *cp)
2706 { if (op == OP_PROP) { RRETURN(MATCH_NOMATCH); } else break; }
2707 if (c == *cp++)
2708 { if (op == OP_PROP) break; else { RRETURN(MATCH_NOMATCH); } }
2709 }
2710 break;
2711
2712 case PT_UCNC:
2713 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
2714 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
2715 c >= 0xe000) == (op == OP_NOTPROP))
2716 RRETURN(MATCH_NOMATCH);
2717 break;
2718
2719 /* This should never occur */
2720
2721 default:
2722 RRETURN(PCRE2_ERROR_INTERNAL);
2723 }
2724
2725 ecode += 3;
2726 }
2727 break;
2728
2729 /* Match an extended Unicode sequence. We will get here only if the support
2730 is in the binary; otherwise a compile-time error occurs. */
2731
2732 case OP_EXTUNI:
2733 if (eptr >= mb->end_subject)
2734 {
2735 SCHECK_PARTIAL();
2736 RRETURN(MATCH_NOMATCH);
2737 }
2738 else
2739 {
2740 int lgb, rgb;
2741 GETCHARINCTEST(c, eptr);
2742 lgb = UCD_GRAPHBREAK(c);
2743 while (eptr < mb->end_subject)
2744 {
2745 int len = 1;
2746 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
2747 rgb = UCD_GRAPHBREAK(c);
2748 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
2749 lgb = rgb;
2750 eptr += len;
2751 }
2752 }
2753 CHECK_PARTIAL();
2754 ecode++;
2755 break;
2756 #endif /* SUPPORT_UNICODE */
2757
2758
2759 /* Match a back reference, possibly repeatedly. Look past the end of the
2760 item to see if there is repeat information following.
2761
2762 The OP_REF and OP_REFI opcodes are used for a reference to a numbered group
2763 or to a non-duplicated named group. For a duplicated named group, OP_DNREF
2764 and OP_DNREFI are used. In this case we must scan the list of groups to
2765 which the name refers, and use the first one that is set. */
2766
2767 case OP_DNREF:
2768 case OP_DNREFI:
2769 caseless = op == OP_DNREFI;
2770 {
2771 int count = GET2(ecode, 1+IMM2_SIZE);
2772 PCRE2_SPTR slot = mb->name_table + GET2(ecode, 1) * mb->name_entry_size;
2773 ecode += 1 + 2*IMM2_SIZE;
2774
2775 /* Initializing 'offset' avoids a compiler warning in the REF_REPEAT
2776 code. */
2777
2778 offset = 0;
2779 while (count-- > 0)
2780 {
2781 offset = GET2(slot, 0) << 1;
2782 if (offset < offset_top && mb->ovector[offset] != PCRE2_UNSET) break;
2783 slot += mb->name_entry_size;
2784 }
2785 }
2786 goto REF_REPEAT;
2787
2788 case OP_REF:
2789 case OP_REFI:
2790 caseless = op == OP_REFI;
2791 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2792 ecode += 1 + IMM2_SIZE;
2793
2794 /* Set up for repetition, or handle the non-repeated case */
2795
2796 REF_REPEAT:
2797 switch (*ecode)
2798 {
2799 case OP_CRSTAR:
2800 case OP_CRMINSTAR:
2801 case OP_CRPLUS:
2802 case OP_CRMINPLUS:
2803 case OP_CRQUERY:
2804 case OP_CRMINQUERY:
2805 c = *ecode++ - OP_CRSTAR;
2806 minimize = (c & 1) != 0;
2807 min = rep_min[c]; /* Pick up values from tables; */
2808 max = rep_max[c]; /* zero for max => infinity */
2809 if (max == 0) max = INT_MAX;
2810 break;
2811
2812 case OP_CRRANGE:
2813 case OP_CRMINRANGE:
2814 minimize = (*ecode == OP_CRMINRANGE);
2815 min = GET2(ecode, 1);
2816 max = GET2(ecode, 1 + IMM2_SIZE);
2817 if (max == 0) max = INT_MAX;
2818 ecode += 1 + 2 * IMM2_SIZE;
2819 break;
2820
2821 default: /* No repeat follows */
2822 {
2823 int rc = match_ref(offset, offset_top, eptr, mb, caseless, &length);
2824 if (rc != 0)
2825 {
2826 if (rc > 0) eptr = mb->end_subject; /* Partial match */
2827 CHECK_PARTIAL();
2828 RRETURN(MATCH_NOMATCH);
2829 }
2830 }
2831 eptr += length;
2832 continue; /* With the main loop */
2833 }
2834
2835 /* Handle repeated back references. If a set group has length zero, just
2836 continue with the main loop, because it matches however many times. For an
2837 unset reference, if the minimum is zero, we can also just continue. We can
2838 also continue if PCRE2_MATCH_UNSET_BACKREF is set, because this makes an unset
2839 group behave as a zero-length group. For any other unset cases, carrying
2840 on will result in NOMATCH. */
2841
2842 if (offset < offset_top && mb->ovector[offset] != PCRE2_UNSET)
2843 {
2844 if (mb->ovector[offset] == mb->ovector[offset + 1]) continue;
2845 }
2846 else /* Group is not set */
2847 {
2848 if (min == 0 || (mb->poptions & PCRE2_MATCH_UNSET_BACKREF) != 0)
2849 continue;
2850 }
2851
2852 /* First, ensure the minimum number of matches are present. We get back
2853 the length of the reference string explicitly rather than passing the
2854 address of eptr, so that eptr can be a register variable. */
2855
2856 for (i = 1; i <= min; i++)
2857 {
2858 PCRE2_SIZE slength;
2859 int rc = match_ref(offset, offset_top, eptr, mb, caseless, &slength);
2860 if (rc != 0)
2861 {
2862 if (rc > 0) eptr = mb->end_subject; /* Partial match */
2863 CHECK_PARTIAL();
2864 RRETURN(MATCH_NOMATCH);
2865 }
2866 eptr += slength;
2867 }
2868
2869 /* If min = max, continue at the same level without recursion.
2870 They are not both allowed to be zero. */
2871
2872 if (min == max) continue;
2873
2874 /* If minimizing, keep trying and advancing the pointer */
2875
2876 if (minimize)
2877 {
2878 for (fi = min;; fi++)
2879 {
2880 int rc;
2881 PCRE2_SIZE slength;
2882 RMATCH(eptr, ecode, offset_top, mb, eptrb, RM14);
2883 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2884 if (fi >= max) RRETURN(MATCH_NOMATCH);
2885 rc = match_ref(offset, offset_top, eptr, mb, caseless, &slength);
2886 if (rc != 0)
2887 {
2888 if (rc > 0) eptr = mb->end_subject; /* Partial match */
2889 CHECK_PARTIAL();
2890 RRETURN(MATCH_NOMATCH);
2891 }
2892 eptr += slength;
2893 }
2894 /* Control never gets here */
2895 }
2896
2897 /* If maximizing, find the longest string and work backwards, as long as
2898 the matched lengths for each iteration are the same. */
2899
2900 else
2901 {
2902 BOOL samelengths = TRUE;
2903 pp = eptr;
2904 length = mb->ovector[offset+1] - mb->ovector[offset];
2905
2906 for (i = min; i < max; i++)
2907 {
2908 PCRE2_SIZE slength;
2909 int rc = match_ref(offset, offset_top, eptr, mb, caseless, &slength);
2910
2911 if (rc != 0)
2912 {
2913 /* Can't use CHECK_PARTIAL because we don't want to update eptr in
2914 the soft partial matching case. */
2915
2916 if (rc > 0 && mb->partial != 0 &&
2917 mb->end_subject > mb->start_used_ptr)
2918 {
2919 mb->hitend = TRUE;
2920 if (mb->partial > 1) RRETURN(PCRE2_ERROR_PARTIAL);
2921 }
2922 break;
2923 }
2924
2925 if (slength != length) samelengths = FALSE;
2926 eptr += slength;
2927 }
2928
2929 /* If the length matched for each repetition is the same as the length of
2930 the captured group, we can easily work backwards. This is the normal
2931 case. However, in caseless UTF-8 mode there are pairs of case-equivalent
2932 characters whose lengths (in terms of code units) differ. However, this
2933 is very rare, so we handle it by re-matching fewer and fewer times. */
2934
2935 if (samelengths)
2936 {
2937 while (eptr >= pp)
2938 {
2939 RMATCH(eptr, ecode, offset_top, mb, eptrb, RM15);
2940 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2941 eptr -= length;
2942 }
2943 }
2944
2945 /* The rare case of non-matching lengths. Re-scan the repetition for each
2946 iteration. We know that match_ref() will succeed every time. */
2947
2948 else
2949 {
2950 max = i;
2951 for (;;)
2952 {
2953 RMATCH(eptr, ecode, offset_top, mb, eptrb, RM68);
2954 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2955 if (eptr == pp) break; /* Failed after minimal repetition */
2956 eptr = pp;
2957 max--;
2958 for (i = min; i < max; i++)
2959 {
2960 PCRE2_SIZE slength;
2961 (void)match_ref(offset, offset_top, eptr, mb, caseless, &slength);
2962 eptr += slength;
2963 }
2964 }
2965 }
2966
2967 RRETURN(MATCH_NOMATCH);
2968 }
2969 /* Control never gets here */
2970
2971 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2972 used when all the characters in the class have values in the range 0-255,
2973 and either the matching is caseful, or the characters are in the range
2974 0-127 when UTF-8 processing is enabled. The only difference between
2975 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2976 encountered.
2977
2978 First, look past the end of the item to see if there is repeat information
2979 following. Then obey similar code to character type repeats - written out
2980 again for speed. */
2981
2982 case OP_NCLASS:
2983 case OP_CLASS:
2984 {
2985 /* The data variable is saved across frames, so the byte map needs to
2986 be stored there. */
2987 #define BYTE_MAP ((uint8_t *)data)
2988 data = ecode + 1; /* Save for matching */
2989 ecode += 1 + (32 / sizeof(PCRE2_UCHAR)); /* Advance past the item */
2990
2991 switch (*ecode)
2992 {
2993 case OP_CRSTAR:
2994 case OP_CRMINSTAR:
2995 case OP_CRPLUS:
2996 case OP_CRMINPLUS:
2997 case OP_CRQUERY:
2998 case OP_CRMINQUERY:
2999 case OP_CRPOSSTAR:
3000 case OP_CRPOSPLUS:
3001 case OP_CRPOSQUERY:
3002 c = *ecode++ - OP_CRSTAR;
3003 if (c < OP_CRPOSSTAR - OP_CRSTAR) minimize = (c & 1) != 0;
3004 else possessive = TRUE;
3005 min = rep_min[c]; /* Pick up values from tables; */
3006 max = rep_max[c]; /* zero for max => infinity */
3007 if (max == 0) max = INT_MAX;
3008 break;
3009
3010 case OP_CRRANGE:
3011 case OP_CRMINRANGE:
3012 case OP_CRPOSRANGE:
3013 minimize = (*ecode == OP_CRMINRANGE);
3014 possessive = (*ecode == OP_CRPOSRANGE);
3015 min = GET2(ecode, 1);
3016 max = GET2(ecode, 1 + IMM2_SIZE);
3017 if (max == 0) max = INT_MAX;
3018 ecode += 1 + 2 * IMM2_SIZE;
3019 break;
3020
3021 default: /* No repeat follows */
3022 min = max = 1;
3023 break;
3024 }
3025
3026 /* First, ensure the minimum number of matches are present. */
3027
3028 #ifdef SUPPORT_UNICODE
3029 if (utf)
3030 {
3031 for (i = 1; i <= min; i++)
3032 {
3033 if (eptr >= mb->end_subject)
3034 {
3035 SCHECK_PARTIAL();
3036 RRETURN(MATCH_NOMATCH);
3037 }
3038 GETCHARINC(c, eptr);
3039 if (c > 255)
3040 {
3041 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
3042 }
3043 else
3044 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
3045 }
3046 }
3047 else
3048 #endif
3049 /* Not UTF mode */
3050 {
3051 for (i = 1; i <= min; i++)
3052 {
3053 if (eptr >= mb->end_subject)
3054 {
3055 SCHECK_PARTIAL();
3056 RRETURN(MATCH_NOMATCH);
3057 }
3058 c = *eptr++;
3059 #if PCRE2_CODE_UNIT_WIDTH != 8
3060 if (c > 255)
3061 {
3062 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
3063 }
3064 else
3065 #endif
3066 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
3067 }
3068 }
3069
3070 /* If max == min we can continue with the main loop without the
3071 need to recurse. */
3072
3073 if (min == max) continue;
3074
3075 /* If minimizing, keep testing the rest of the expression and advancing
3076 the pointer while it matches the class. */
3077
3078 if (minimize)
3079 {
3080 #ifdef SUPPORT_UNICODE
3081 if (utf)
3082 {
3083 for (fi = min;; fi++)
3084 {
3085 RMATCH(eptr, ecode, offset_top, mb, eptrb, RM16);
3086 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3087 if (fi >= max) RRETURN(MATCH_NOMATCH);
3088 if (eptr >= mb->end_subject)
3089 {
3090 SCHECK_PARTIAL();
3091 RRETURN(MATCH_NOMATCH);
3092 }
3093 GETCHARINC(c, eptr);
3094 if (c > 255)
3095 {
3096 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
3097 }
3098 else
3099 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
3100 }
3101 }
3102 else
3103 #endif
3104 /* Not UTF mode */
3105 {
3106 for (fi = min;; fi++)
3107 {
3108 RMATCH(eptr, ecode, offset_top, mb, eptrb, RM17);
3109 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3110 if (fi >= max) RRETURN(MATCH_NOMATCH);
3111 if (eptr >= mb->end_subject)
3112 {
3113 SCHECK_PARTIAL();
3114 RRETURN(MATCH_NOMATCH);
3115 }
3116 c = *eptr++;
3117 #if PCRE2_CODE_UNIT_WIDTH != 8
3118 if (c > 255)
3119 {
3120 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
3121 }
3122 else
3123 #endif
3124 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
3125 }
3126 }
3127 /* Control never gets here */
3128 }
3129
3130 /* If maximizing, find the longest possible run, then work backwards. */
3131
3132 else
3133 {
3134 pp = eptr;
3135
3136 #ifdef SUPPORT_UNICODE
3137 if (utf)
3138 {
3139 for (i = min; i < max; i++)
3140 {
3141 int len = 1;
3142 if (eptr >= mb->end_subject)
3143 {
3144 SCHECK_PARTIAL();
3145 break;
3146 }
3147 GETCHARLEN(c, eptr, len);
3148 if (c > 255)
3149 {
3150 if (op == OP_CLASS) break;
3151 }
3152 else
3153 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
3154 eptr += len;
3155 }
3156
3157 if (possessive) continue; /* No backtracking */
3158
3159 for (;;)
3160 {
3161 RMATCH(eptr, ecode, offset_top, mb, eptrb, RM18);
3162 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3163 if (eptr-- == pp) break; /* Stop if tried at original pos */
3164 BACKCHAR(eptr);
3165 }
3166 }
3167 else
3168 #endif
3169 /* Not UTF mode */
3170 {
3171 for (i = min; i < max; i++)
3172 {
3173 if (eptr >= mb->end_subject)
3174 {
3175 SCHECK_PARTIAL();
3176 break;
3177 }
3178 c = *eptr;
3179 #if PCRE2_CODE_UNIT_WIDTH != 8
3180 if (c > 255)
3181 {
3182 if (op == OP_CLASS) break;
3183 }
3184 else
3185 #endif
3186 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
3187 eptr++;
3188 }
3189
3190 if (possessive) continue; /* No backtracking */
3191
3192 while (eptr >= pp)
3193 {
3194 RMATCH(eptr, ecode, offset_top, mb, eptrb, RM19);
3195 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3196 eptr--;
3197 }
3198 }
3199
3200 RRETURN(MATCH_NOMATCH);
3201 }
3202 #undef BYTE_MAP
3203 }
3204 /* Control never gets here */
3205
3206
3207 /* Match an extended character class. In the 8-bit library, this opcode is
3208 encountered only when UTF-8 mode is supported. In the 16-bit and
3209 32-bit libraries, codepoints greater than 255 may be encountered even when
3210 UTF is not supported. */
3211
3212 #ifdef SUPPORT_WIDE_CHARS
3213 case OP_XCLASS:
3214 {
3215 data = ecode + 1 + LINK_SIZE; /* Save for matching */
3216 ecode += GET(ecode, 1); /* Advance past the item */
3217
3218 switch (*ecode)
3219 {
3220 case OP_CRSTAR:
3221 case OP_CRMINSTAR:
3222 case OP_CRPLUS:
3223 case OP_CRMINPLUS:
3224 case OP_CRQUERY:
3225 case OP_CRMINQUERY:
3226 case OP_CRPOSSTAR:
3227 case OP_CRPOSPLUS:
3228 case OP_CRPOSQUERY:
3229 c = *ecode++ - OP_CRSTAR;
3230 if (c < OP_CRPOSSTAR - OP_CRSTAR) minimize = (c & 1) != 0;
3231 else possessive = TRUE;
3232 min = rep_min[c]; /* Pick up values from tables; */
3233 max = rep_max[c]; /* zero for max => infinity */
3234 if (max == 0) max = INT_MAX;
3235 break;
3236
3237 case OP_CRRANGE:
3238 case OP_CRMINRANGE:
3239 case OP_CRPOSRANGE:
3240 minimize = (*ecode == OP_CRMINRANGE);
3241 possessive = (*ecode == OP_CRPOSRANGE);
3242 min = GET2(ecode, 1);
3243 max = GET2(ecode, 1 + IMM2_SIZE);
3244 if (max == 0) max = INT_MAX;
3245 ecode += 1 + 2 * IMM2_SIZE;
3246 break;
3247
3248 default: /* No repeat follows */
3249 min = max = 1;
3250 break;
3251 }
3252
3253 /* First, ensure the minimum number of matches are present. */
3254
3255 for (i = 1; i <= min; i++)
3256 {
3257 if (eptr >= mb->end_subject)
3258 {
3259 SCHECK_PARTIAL();
3260 RRETURN(MATCH_NOMATCH);
3261 }
3262 GETCHARINCTEST(c, eptr);
3263 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3264 }
3265
3266 /* If max == min we can continue with the main loop without the
3267 need to recurse. */
3268
3269 if (min == max) continue;
3270
3271 /* If minimizing, keep testing the rest of the expression and advancing
3272 the pointer while it matches the class. */
3273
3274 if (minimize)
3275 {
3276 for (fi = min;; fi++)
3277 {
3278 RMATCH(eptr, ecode, offset_top, mb, eptrb, RM20);
3279 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3280 if (fi >= max) RRETURN(MATCH_NOMATCH);
3281 if (eptr >= mb->end_subject)
3282 {
3283 SCHECK_PARTIAL();
3284 RRETURN(MATCH_NOMATCH);
3285 }
3286 GETCHARINCTEST(c, eptr);
3287 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3288 }
3289 /* Control never gets here */
3290 }
3291
3292 /* If maximizing, find the longest possible run, then work backwards. */
3293
3294 else
3295 {
3296 pp = eptr;
3297 for (i = min; i < max; i++)
3298 {
3299 int len = 1;
3300 if (eptr >= mb->end_subject)
3301 {
3302 SCHECK_PARTIAL();
3303 break;
3304 }
3305 #ifdef SUPPORT_UNICODE
3306 GETCHARLENTEST(c, eptr, len);
3307 #else
3308 c = *eptr;
3309 #endif
3310 if (!PRIV(xclass)(c, data, utf)) break;
3311 eptr += len;
3312 }
3313
3314 if (possessive) continue; /* No backtracking */
3315
3316 for(;;)
3317 {
3318 RMATCH(eptr, ecode, offset_top, mb, eptrb, RM21);
3319 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3320 if (eptr-- == pp) break; /* Stop if tried at original pos */
3321 #ifdef SUPPORT_UNICODE
3322 if (utf) BACKCHAR(eptr);
3323 #endif
3324 }
3325 RRETURN(MATCH_NOMATCH);
3326 }
3327
3328 /* Control never gets here */
3329 }
3330 #endif /* End of XCLASS */
3331
3332 /* Match a single character, casefully */
3333
3334 case OP_CHAR:
3335 #ifdef SUPPORT_UNICODE
3336 if (utf)
3337 {
3338 length = 1;
3339 ecode++;
3340 GETCHARLEN(fc, ecode, length);
3341 if (length > (PCRE2_SIZE)(mb->end_subject - eptr))
3342 {
3343 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
3344 RRETURN(MATCH_NOMATCH);
3345 }
3346 for (; length > 0; length--)
3347 {
3348 if (*ecode++ != UCHAR21INC(eptr)) RRETURN(MATCH_NOMATCH);
3349 }
3350 }
3351 else
3352 #endif
3353 /* Not UTF mode */
3354 {
3355 if (mb->end_subject - eptr < 1)
3356 {
3357 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
3358 RRETURN(MATCH_NOMATCH);
3359 }
3360 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
3361 ecode += 2;
3362 }
3363 break;
3364
3365 /* Match a single character, caselessly. If we are at the end of the
3366 subject, give up immediately. */
3367
3368 case OP_CHARI:
3369 if (eptr >= mb->end_subject)
3370 {
3371 SCHECK_PARTIAL();
3372 RRETURN(MATCH_NOMATCH);
3373 }
3374
3375 #ifdef SUPPORT_UNICODE
3376 if (utf)
3377 {
3378 length = 1;
3379 ecode++;
3380 GETCHARLEN(fc, ecode, length);
3381
3382 /* If the pattern character's value is < 128, we have only one byte, and
3383 we know that its other case must also be one byte long, so we can use the
3384 fast lookup table. We know that there is at least one byte left in the
3385 subject. */
3386
3387 if (fc < 128)
3388 {
3389 uint32_t cc = UCHAR21(eptr);
3390 if (mb->lcc[fc] != TABLE_GET(cc, mb->lcc, cc)) RRETURN(MATCH_NOMATCH);
3391 ecode++;
3392 eptr++;
3393 }
3394
3395 /* Otherwise we must pick up the subject character. Note that we cannot
3396 use the value of "length" to check for sufficient bytes left, because the
3397 other case of the character may have more or fewer bytes. */
3398
3399 else
3400 {
3401 uint32_t dc;
3402 GETCHARINC(dc, eptr);
3403 ecode += length;
3404
3405 /* If we have Unicode property support, we can use it to test the other
3406 case of the character, if there is one. */
3407
3408 if (fc != dc)
3409 {
3410 #ifdef SUPPORT_UNICODE
3411 if (dc != UCD_OTHERCASE(fc))
3412 #endif
3413 RRETURN(MATCH_NOMATCH);
3414 }
3415 }
3416 }
3417 else
3418 #endif /* SUPPORT_UNICODE */
3419
3420 /* Not UTF mode */
3421 {
3422 if (TABLE_GET(ecode[1], mb->lcc, ecode[1])
3423 != TABLE_GET(*eptr, mb->lcc, *eptr)) RRETURN(MATCH_NOMATCH);
3424 eptr++;
3425 ecode += 2;
3426 }
3427 break;
3428
3429 /* Match a single character repeatedly. */
3430
3431 case OP_EXACT:
3432 case OP_EXACTI:
3433 min = max = GET2(ecode, 1);
3434 ecode += 1 + IMM2_SIZE;
3435 goto REPEATCHAR;
3436
3437 case OP_POSUPTO:
3438 case OP_POSUPTOI:
3439 possessive = TRUE;
3440 /* Fall through */
3441
3442 case OP_UPTO:
3443 case OP_UPTOI:
3444 case OP_MINUPTO:
3445 case OP_MINUPTOI:
3446 min = 0;
3447 max = GET2(ecode, 1);
3448 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
3449 ecode += 1 + IMM2_SIZE;
3450 goto REPEATCHAR;
3451
3452 case OP_POSSTAR:
3453 case OP_POSSTARI:
3454 possessive = TRUE;
3455 min = 0;
3456 max = INT_MAX;
3457 ecode++;
3458 goto REPEATCHAR;
3459
3460 case OP_POSPLUS:
3461 case OP_POSPLUSI:
3462 possessive = TRUE;
3463 min = 1;
3464 max = INT_MAX;
3465 ecode++;
3466 goto REPEATCHAR;
3467
3468 case OP_POSQUERY:
3469 case OP_POSQUERYI:
3470 possessive = TRUE;
3471 min = 0;
3472 max = 1;
3473 ecode++;
3474 goto REPEATCHAR;
3475
3476 case OP_STAR:
3477 case OP_STARI:
3478 case OP_MINSTAR:
3479 case OP_MINSTARI:
3480 case OP_PLUS:
3481 case OP_PLUSI:
3482 case OP_MINPLUS:
3483 case OP_MINPLUSI:
3484 case OP_QUERY:
3485 case OP_QUERYI:
3486 case OP_MINQUERY:
3487 case OP_MINQUERYI:
3488 c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
3489 minimize = (c & 1) != 0;
3490 min = rep_min[c]; /* Pick up values from tables; */
3491 max = rep_max[c]; /* zero for max => infinity */
3492 if (max == 0) max = INT_MAX;
3493
3494 /* Common code for all repeated single-character matches. We first check
3495 for the minimum number of characters. If the minimum equals the maximum, we
3496 are done. Otherwise, if minimizing, check the rest of the pattern for a
3497 match; if there isn't one, advance up to the maximum, one character at a
3498 time.
3499
3500 If maximizing, advance up to the maximum number of matching characters,
3501 until eptr is past the end of the maximum run. If possessive, we are
3502 then done (no backing up). Otherwise, match at this position; anything
3503 other than no match is immediately returned. For nomatch, back up one
3504 character, unless we are matching \R and the last thing matched was
3505 \r\n, in which case, back up two bytes. When we reach the first optional
3506 character position, we can save stack by doing a tail recurse.
3507
3508 The various UTF/non-UTF and caseful/caseless cases are handled separately,
3509 for speed. */
3510
3511 REPEATCHAR:
3512 #ifdef SUPPORT_UNICODE
3513 if (utf)
3514 {
3515 length = 1;
3516 charptr = ecode;
3517 GETCHARLEN(fc, ecode, length);
3518 ecode += length;
3519
3520 /* Handle multibyte character matching specially here. There is
3521 support for caseless matching if UCP support is present. */
3522
3523 if (length > 1)
3524 {
3525 uint32_t othercase;
3526 if (op >= OP_STARI && /* Caseless */
3527 (othercase = UCD_OTHERCASE(fc)) != fc)
3528 oclength = PRIV(ord2utf)(othercase, occhars);
3529 else oclength = 0;
3530
3531 for (i = 1; i <= min; i++)
3532 {
3533 if (eptr <= mb->end_subject - length &&
3534 memcmp(eptr, charptr, CU2BYTES(length)) == 0) eptr += length;
3535 else if (oclength > 0 &&
3536 eptr <= mb->end_subject - oclength &&
3537 memcmp(eptr, occhars, CU2BYTES(oclength)) == 0) eptr += oclength;
3538 else
3539 {
3540 CHECK_PARTIAL();
3541 RRETURN(MATCH_NOMATCH);
3542 }
3543 }
3544
3545 if (min == max) continue;
3546
3547 if (minimize)
3548 {
3549 for (fi = min;; fi++)
3550 {
3551 RMATCH(eptr, ecode, offset_top, mb, eptrb, RM22);
3552 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3553 if (fi >= max) RRETURN(MATCH_NOMATCH);
3554 if (eptr <= mb->end_subject - length &&
3555 memcmp(eptr, charptr, CU2BYTES(length)) == 0) eptr += length;
3556 else if (oclength > 0 &&
3557 eptr <= mb->end_subject - oclength &&
3558 memcmp(eptr, occhars, CU2BYTES(oclength)) == 0) eptr += oclength;
3559 else
3560 {
3561 CHECK_PARTIAL();
3562 RRETURN(MATCH_NOMATCH);
3563 }
3564 }
3565 /* Control never gets here */
3566 }
3567
3568 else /* Maximize */
3569 {
3570 pp = eptr;
3571 for (i = min; i < max; i++)
3572 {
3573 if (eptr <= mb->end_subject - length &&
3574 memcmp(eptr, charptr, CU2BYTES(length)) == 0) eptr += length;
3575 else if (oclength > 0 &&
3576 eptr <= mb->end_subject - oclength &&
3577 memcmp(eptr, occhars, CU2BYTES(oclength)) == 0) eptr += oclength;
3578 else
3579 {
3580 CHECK_PARTIAL();
3581 break;
3582 }
3583 }
3584
3585 if (possessive) continue; /* No backtracking */
3586
3587 /* After \C in UTF mode, pp might be in the middle of a Unicode
3588 character. Use <= pp to ensure backtracking doesn't go too far. */
3589
3590 for(;;)
3591 {
3592 if (eptr <= pp) goto TAIL_RECURSE;
3593 RMATCH(eptr, ecode, offset_top, mb, eptrb, RM23);
3594 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3595 eptr--;
3596 BACKCHAR(eptr);
3597 }
3598 }
3599 /* Control never gets here */
3600 }
3601
3602 /* If the length of a UTF-8 character is 1, we fall through here, and
3603 obey the code as for non-UTF-8 characters below, though in this case the
3604 value of fc will always be < 128. */
3605 }
3606 else
3607 #endif /* SUPPORT_UNICODE */
3608
3609 /* When not in UTF-8 mode, load a single-byte character. */
3610 fc = *ecode++;
3611
3612 /* The value of fc at this point is always one character, though we may
3613 or may not be in UTF mode. The code is duplicated for the caseless and
3614 caseful cases, for speed, since matching characters is likely to be quite
3615 common. First, ensure the minimum number of matches are present. If min =
3616 max, continue at the same level without recursing. Otherwise, if
3617 minimizing, keep trying the rest of the expression and advancing one
3618 matching character if failing, up to the maximum. Alternatively, if
3619 maximizing, find the maximum number of characters and work backwards. */
3620
3621 if (op >= OP_STARI) /* Caseless */
3622 {
3623 #if PCRE2_CODE_UNIT_WIDTH == 8
3624 /* fc must be < 128 if UTF is enabled. */
3625 foc = mb->fcc[fc];
3626 #else
3627 #ifdef SUPPORT_UNICODE
3628 if (utf && fc > 127)
3629 foc = UCD_OTHERCASE(fc);
3630 else
3631 #endif /* SUPPORT_UNICODE */
3632 foc = TABLE_GET(fc, mb->fcc, fc);
3633 #endif /* PCRE2_CODE_UNIT_WIDTH == 8 */
3634
3635 for (i = 1; i <= min; i++)
3636 {
3637 uint32_t cc; /* Faster than PCRE2_UCHAR */
3638 if (eptr >= mb->end_subject)
3639 {
3640 SCHECK_PARTIAL();
3641 RRETURN(MATCH_NOMATCH);
3642 }
3643 cc = UCHAR21TEST(eptr);
3644 if (fc != cc && foc != cc) RRETURN(MATCH_NOMATCH);
3645 eptr++;
3646 }
3647 if (min == max) continue;
3648 if (minimize)
3649 {
3650 for (fi = min;; fi++)
3651 {
3652 uint32_t cc; /* Faster than PCRE2_UCHAR */
3653 RMATCH(eptr, ecode, offset_top, mb, eptrb, RM24);
3654 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3655 if (fi >= max) RRETURN(MATCH_NOMATCH);
3656 if (eptr >= mb->end_subject)
3657 {
3658 SCHECK_PARTIAL();
3659 RRETURN(MATCH_NOMATCH);
3660 }
3661 cc = UCHAR21TEST(eptr);
3662 if (fc != cc && foc != cc) RRETURN(MATCH_NOMATCH);
3663 eptr++;
3664 }
3665 /* Control never gets here */
3666 }
3667 else /* Maximize */
3668 {
3669 pp = eptr;
3670 for (i = min; i < max; i++)
3671 {
3672 uint32_t cc; /* Faster than PCRE2_UCHAR */
3673 if (eptr >= mb->end_subject)
3674 {
3675 SCHECK_PARTIAL();
3676 break;
3677 }
3678 cc = UCHAR21TEST(eptr);
3679 if (fc != cc && foc != cc) break;
3680 eptr++;
3681 }
3682 if (possessive) continue; /* No backtracking */
3683 for (;;)
3684 {
3685 if (eptr == pp) goto TAIL_RECURSE;
3686 RMATCH(eptr, ecode, offset_top, mb, eptrb, RM25);
3687 eptr--;
3688 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3689 }
3690 /* Control never gets here */
3691 }
3692 }
3693
3694 /* Caseful comparisons (includes all multi-byte characters) */
3695
3696 else
3697 {
3698 for (i = 1; i <= min; i++)
3699 {
3700 if (eptr >= mb->end_subject)
3701 {
3702 SCHECK_PARTIAL();
3703 RRETURN(MATCH_NOMATCH);
3704 }
3705 if (fc != UCHAR21INCTEST(eptr)) RRETURN(MATCH_NOMATCH);
3706 }
3707
3708 if (min == max) continue;
3709
3710 if (minimize)
3711 {
3712 for (fi = min;; fi++)
3713 {
3714 RMATCH(eptr, ecode, offset_top, mb, eptrb, RM26);
3715 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3716 if (fi >= max) RRETURN(MATCH_NOMATCH);
3717 if (eptr >= mb->end_subject)
3718 {
3719 SCHECK_PARTIAL();
3720 RRETURN(MATCH_NOMATCH);
3721 }
3722 if (fc != UCHAR21INCTEST(eptr)) RRETURN(MATCH_NOMATCH);
3723 }
3724 /* Control never gets here */
3725 }
3726 else /* Maximize */
3727 {
3728 pp = eptr;
3729 for (i = min; i < max; i++)
3730 {
3731 if (eptr >= mb->end_subject)
3732 {
3733 SCHECK_PARTIAL();
3734 break;
3735 }
3736 if (fc != UCHAR21TEST(eptr)) break;
3737 eptr++;
3738 }
3739 if (possessive) continue; /* No backtracking */
3740 for (;;)
3741 {
3742 if (eptr == pp) goto TAIL_RECURSE;
3743 RMATCH(eptr, ecode, offset_top, mb, eptrb, RM27);
3744 eptr--;
3745 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3746 }
3747 /* Control never gets here */
3748 }
3749 }
3750 /* Control never gets here */
3751
3752 /* Match a negated single one-byte character. The character we are
3753 checking can be multibyte. */
3754
3755 case OP_NOT:
3756 case OP_NOTI:
3757 if (eptr >= mb->end_subject)
3758 {
3759 SCHECK_PARTIAL();
3760 RRETURN(MATCH_NOMATCH);
3761 }
3762 #ifdef SUPPORT_UNICODE
3763 if (utf)
3764 {
3765 register uint32_t ch, och;
3766
3767 ecode++;
3768 GETCHARINC(ch, ecode);
3769 GETCHARINC(c, eptr);
3770
3771 if (op == OP_NOT)
3772 {
3773 if (ch == c) RRETURN(MATCH_NOMATCH);
3774 }
3775 else
3776 {
3777 if (ch > 127)
3778 och = UCD_OTHERCASE(ch);
3779 else
3780 och = TABLE_GET(ch, mb->fcc, ch);
3781 if (ch == c || och == c) RRETURN(MATCH_NOMATCH);
3782 }
3783 }
3784 else
3785 #endif /* SUPPORT_UNICODE */
3786 {
3787 register uint32_t ch = ecode[1];
3788 c = *eptr++;
3789 if (ch == c || (op == OP_NOTI && TABLE_GET(ch, mb->fcc, ch) == c))
3790 RRETURN(MATCH_NOMATCH);
3791 ecode += 2;
3792 }
3793 break;
3794
3795 /* Match a negated single one-byte character repeatedly. This is almost a
3796 repeat of the code for a repeated single character, but I haven't found a
3797 nice way of commoning these up that doesn't require a test of the
3798 positive/negative option for each character match. Maybe that wouldn't add
3799 very much to the time taken, but character matching *is* what this is all
3800 about... */
3801
3802 case OP_NOTEXACT:
3803 case OP_NOTEXACTI:
3804 min = max = GET2(ecode, 1);
3805 ecode += 1 + IMM2_SIZE;
3806 goto REPEATNOTCHAR;
3807
3808 case OP_NOTUPTO:
3809 case OP_NOTUPTOI:
3810 case OP_NOTMINUPTO:
3811 case OP_NOTMINUPTOI:
3812 min = 0;
3813 max = GET2(ecode, 1);
3814 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3815 ecode += 1 + IMM2_SIZE;
3816 goto REPEATNOTCHAR;
3817
3818 case OP_NOTPOSSTAR:
3819 case OP_NOTPOSSTARI:
3820 possessive = TRUE;
3821 min = 0;
3822 max = INT_MAX;
3823 ecode++;
3824 goto REPEATNOTCHAR;
3825
3826 case OP_NOTPOSPLUS:
3827 case OP_NOTPOSPLUSI:
3828 possessive = TRUE;
3829 min = 1;
3830 max = INT_MAX;
3831 ecode++;
3832 goto REPEATNOTCHAR;
3833
3834 case OP_NOTPOSQUERY:
3835 case OP_NOTPOSQUERYI:
3836 possessive = TRUE;
3837 min = 0;
3838 max = 1;
3839 ecode++;
3840 goto REPEATNOTCHAR;
3841
3842 case OP_NOTPOSUPTO:
3843 case OP_NOTPOSUPTOI:
3844 possessive = TRUE;
3845 min = 0;
3846 max = GET2(ecode, 1);
3847 ecode += 1 + IMM2_SIZE;
3848 goto REPEATNOTCHAR;
3849
3850 case OP_NOTSTAR:
3851 case OP_NOTSTARI:
3852 case OP_NOTMINSTAR:
3853 case OP_NOTMINSTARI:
3854 case OP_NOTPLUS:
3855 case OP_NOTPLUSI:
3856 case OP_NOTMINPLUS:
3857 case OP_NOTMINPLUSI:
3858 case OP_NOTQUERY:
3859 case OP_NOTQUERYI:
3860 case OP_NOTMINQUERY:
3861 case OP_NOTMINQUERYI:
3862 c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3863 minimize = (c & 1) != 0;
3864 min = rep_min[c]; /* Pick up values from tables; */
3865 max = rep_max[c]; /* zero for max => infinity */
3866 if (max == 0) max = INT_MAX;
3867
3868 /* Common code for all repeated single-byte matches. */
3869
3870 REPEATNOTCHAR:
3871 GETCHARINCTEST(fc, ecode);
3872
3873 /* The code is duplicated for the caseless and caseful cases, for speed,
3874 since matching characters is likely to be quite common. First, ensure the
3875 minimum number of matches are present. If min = max, continue at the same
3876 level without recursing. Otherwise, if minimizing, keep trying the rest of
3877 the expression and advancing one matching character if failing, up to the
3878 maximum. Alternatively, if maximizing, find the maximum number of
3879 characters and work backwards. */
3880
3881 if (op >= OP_NOTSTARI) /* Caseless */
3882 {
3883 #ifdef SUPPORT_UNICODE
3884 if (utf && fc > 127)
3885 foc = UCD_OTHERCASE(fc);
3886 else
3887 #endif /* SUPPORT_UNICODE */
3888 foc = TABLE_GET(fc, mb->fcc, fc);
3889
3890 #ifdef SUPPORT_UNICODE
3891 if (utf)
3892 {
3893 register uint32_t d;
3894 for (i = 1; i <= min; i++)
3895 {
3896 if (eptr >= mb->end_subject)
3897 {
3898 SCHECK_PARTIAL();
3899 RRETURN(MATCH_NOMATCH);
3900 }
3901 GETCHARINC(d, eptr);
3902 if (fc == d || (uint32_t)foc == d) RRETURN(MATCH_NOMATCH);
3903 }
3904 }
3905 else
3906 #endif /* SUPPORT_UNICODE */
3907 /* Not UTF mode */
3908 {
3909 for (i = 1; i <= min; i++)
3910 {
3911 if (eptr >= mb->end_subject)
3912 {
3913 SCHECK_PARTIAL();
3914 RRETURN(MATCH_NOMATCH);
3915 }
3916 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3917 eptr++;
3918 }
3919 }
3920
3921 if (min == max) continue;
3922
3923 if (minimize)
3924 {
3925 #ifdef SUPPORT_UNICODE
3926 if (utf)
3927 {
3928 register uint32_t d;
3929 for (fi = min;; fi++)
3930 {
3931 RMATCH(eptr, ecode, offset_top, mb, eptrb, RM28);
3932 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3933 if (fi >= max) RRETURN(MATCH_NOMATCH);
3934 if (eptr >= mb->end_subject)
3935 {
3936 SCHECK_PARTIAL();
3937 RRETURN(MATCH_NOMATCH);
3938 }
3939 GETCHARINC(d, eptr);
3940 if (fc == d || (uint32_t)foc == d) RRETURN(MATCH_NOMATCH);
3941 }
3942 }
3943 else
3944 #endif /*SUPPORT_UNICODE */
3945 /* Not UTF mode */
3946 {
3947 for (fi = min;; fi++)
3948 {
3949 RMATCH(eptr, ecode, offset_top, mb, eptrb, RM29);
3950 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3951 if (fi >= max) RRETURN(MATCH_NOMATCH);
3952 if (eptr >= mb->end_subject)
3953 {
3954 SCHECK_PARTIAL();
3955 RRETURN(MATCH_NOMATCH);
3956 }
3957 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3958 eptr++;
3959 }
3960 }
3961 /* Control never gets here */
3962 }
3963
3964 /* Maximize case */
3965
3966 else
3967 {
3968 pp = eptr;
3969
3970 #ifdef SUPPORT_UNICODE
3971 if (utf)
3972 {
3973 register uint32_t d;
3974 for (i = min; i < max; i++)
3975 {
3976 int len = 1;
3977 if (eptr >= mb->end_subject)
3978 {
3979 SCHECK_PARTIAL();
3980 break;
3981 }
3982 GETCHARLEN(d, eptr, len);
3983 if (fc == d || (uint32_t)foc == d) break;
3984 eptr += len;
3985 }
3986 if (possessive) continue; /* No backtracking */
3987
3988 /* After \C in UTF mode, pp might be in the middle of a Unicode
3989 character. Use <= pp to ensure backtracking doesn't go too far. */
3990
3991 for(;;)
3992 {
3993 if (eptr <= pp) goto TAIL_RECURSE;
3994 RMATCH(eptr, ecode, offset_top, mb, eptrb, RM30);
3995 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3996 eptr--;
3997 BACKCHAR(eptr);
3998 }
3999 }
4000 else
4001 #endif /* SUPPORT_UNICODE */
4002 /* Not UTF mode */
4003 {
4004 for (i = min; i < max; i++)
4005 {
4006 if (eptr >= mb->end_subject)
4007 {
4008 SCHECK_PARTIAL();
4009 break;
4010 }
4011 if (fc == *eptr || foc == *eptr) break;
4012 eptr++;
4013 }
4014 if (possessive) continue; /* No backtracking */
4015 for (;;)
4016 {
4017 if (eptr == pp) goto TAIL_RECURSE;
4018 RMATCH(eptr, ecode, offset_top, mb, eptrb, RM31);
4019 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4020 eptr--;
4021 }
4022 }
4023 /* Control never gets here */
4024 }
4025 }
4026
4027 /* Caseful comparisons */
4028
4029 else
4030 {
4031 #ifdef SUPPORT_UNICODE
4032 if (utf)
4033 {
4034 register uint32_t d;
4035 for (i = 1; i <= min; i++)
4036 {
4037 if (eptr >= mb->end_subject)
4038 {
4039 SCHECK_PARTIAL();
4040 RRETURN(MATCH_NOMATCH);
4041 }
4042 GETCHARINC(d, eptr);
4043 if (fc == d) RRETURN(MATCH_NOMATCH);
4044 }
4045 }
4046 else
4047 #endif
4048 /* Not UTF mode */
4049 {
4050 for (i = 1; i <= min; i++)
4051 {
4052 if (eptr >= mb->end_subject)
4053 {
4054 SCHECK_PARTIAL();
4055 RRETURN(MATCH_NOMATCH);
4056 }
4057 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
4058 }
4059 }
4060
4061 if (min == max) continue;
4062
4063 if (minimize)
4064 {
4065 #ifdef SUPPORT_UNICODE
4066 if (utf)
4067 {
4068 register uint32_t d;
4069 for (fi = min;; fi++)
4070 {
4071 RMATCH(eptr, ecode, offset_top, mb, eptrb, RM32);
4072 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4073 if (fi >= max) RRETURN(MATCH_NOMATCH);
4074 if (eptr >= mb->end_subject)
4075 {
4076 SCHECK_PARTIAL();
4077 RRETURN(MATCH_NOMATCH);
4078 }
4079 GETCHARINC(d, eptr);
4080 if (fc == d) RRETURN(MATCH_NOMATCH);
4081 }
4082 }
4083 else
4084 #endif
4085 /* Not UTF mode */
4086 {
4087 for (fi = min;; fi++)
4088 {
4089 RMATCH(eptr, ecode, offset_top, mb, eptrb, RM33);
4090 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4091 if (fi >= max) RRETURN(MATCH_NOMATCH);
4092 if (eptr >= mb->end_subject)
4093 {
4094 SCHECK_PARTIAL();
4095 RRETURN(MATCH_NOMATCH);
4096 }
4097 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
4098 }
4099 }
4100 /* Control never gets here */
4101 }
4102
4103 /* Maximize case */
4104
4105 else
4106 {
4107 pp = eptr;
4108
4109 #ifdef SUPPORT_UNICODE
4110 if (utf)
4111 {
4112 register uint32_t d;
4113 for (i = min; i < max; i++)
4114 {
4115 int len = 1;
4116 if (eptr >= mb->end_subject)
4117 {
4118 SCHECK_PARTIAL();
4119 break;
4120 }
4121 GETCHARLEN(d, eptr, len);
4122 if (fc == d) break;
4123 eptr += len;
4124 }
4125 if (possessive) continue; /* No backtracking */
4126
4127 /* After \C in UTF mode, pp might be in the middle of a Unicode
4128 character. Use <= pp to ensure backtracking doesn't go too far. */
4129
4130 for(;;)
4131 {
4132 if (eptr <= pp) goto TAIL_RECURSE;
4133 RMATCH(eptr, ecode, offset_top, mb, eptrb, RM34);
4134 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4135 eptr--;
4136 BACKCHAR(eptr);
4137 }
4138 }
4139 else
4140 #endif
4141 /* Not UTF mode */
4142 {
4143 for (i = min; i < max; i++)
4144 {
4145 if (eptr >= mb->end_subject)
4146 {
4147 SCHECK_PARTIAL();
4148 break;
4149 }
4150 if (fc == *eptr) break;
4151 eptr++;
4152 }
4153 if (possessive) continue; /* No backtracking */
4154 for (;;)
4155 {
4156 if (eptr == pp) goto TAIL_RECURSE;
4157 RMATCH(eptr, ecode, offset_top, mb, eptrb, RM35);
4158 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4159 eptr--;
4160 }
4161 }
4162 /* Control never gets here */
4163 }
4164 }
4165 /* Control never gets here */
4166
4167 /* Match a single character type repeatedly; several different opcodes
4168 share code. This is very similar to the code for single characters, but we
4169 repeat it in the interests of efficiency. */
4170
4171 case OP_TYPEEXACT: /* Character type repeated an exact number of times */
4172 min = max = GET2(ecode, 1); /* Count operand follows the opcode; min == max */
4173 minimize = TRUE; /* Looks like a don't-care: REPEATTYPE tests min == max before minimize */
4174 ecode += 1 + IMM2_SIZE; /* Step over the opcode and its count operand */
4175 goto REPEATTYPE;
4176
4177 case OP_TYPEUPTO: /* Character type repeated from zero up to a stored maximum */
4178 case OP_TYPEMINUPTO:
4179 min = 0;
4180 max = GET2(ecode, 1); /* Upper bound operand follows the opcode */
4181 minimize = *ecode == OP_TYPEMINUPTO; /* Must be read before ecode is advanced below */
4182 ecode += 1 + IMM2_SIZE; /* Step over the opcode and its count operand */
4183 goto REPEATTYPE;
4184
4185 case OP_TYPEPOSSTAR: /* Possessive repeat of a character type: zero or more */
4186 possessive = TRUE;
4187 min = 0;
4188 max = INT_MAX; /* INT_MAX stands for "no upper limit" */
4189 ecode++; /* Step over the opcode; REPEATTYPE reads the type code next */
4190 goto REPEATTYPE;
4191
4192 case OP_TYPEPOSPLUS: /* Possessive repeat of a character type: one or more */
4193 possessive = TRUE;
4194 min = 1;
4195 max = INT_MAX; /* INT_MAX stands for "no upper limit" */
4196 ecode++; /* Step over the opcode; REPEATTYPE reads the type code next */
4197 goto REPEATTYPE;
4198
4199 case OP_TYPEPOSQUERY: /* Possessive repeat of a character type: zero or one */
4200 possessive = TRUE;
4201 min = 0;
4202 max = 1;
4203 ecode++; /* Step over the opcode; REPEATTYPE reads the type code next */
4204 goto REPEATTYPE;
4205
4206 case OP_TYPEPOSUPTO: /* Possessive repeat of a character type: zero up to a stored maximum */
4207 possessive = TRUE;
4208 min = 0;
4209 max = GET2(ecode, 1); /* Upper bound operand follows the opcode */
4210 ecode += 1 + IMM2_SIZE; /* Step over the opcode and its count operand */
4211 goto REPEATTYPE;
4212
4213 case OP_TYPESTAR: /* Star, plus and query repeats of a character type, with their MIN variants */
4214 case OP_TYPEMINSTAR:
4215 case OP_TYPEPLUS:
4216 case OP_TYPEMINPLUS:
4217 case OP_TYPEQUERY:
4218 case OP_TYPEMINQUERY:
4219 c = *ecode++ - OP_TYPESTAR; /* Offset of this opcode from OP_TYPESTAR; also steps over the opcode */
4220 minimize = (c & 1) != 0; /* Odd offsets are taken to be the minimizing (MIN) opcodes */
4221 min = rep_min[c]; /* Pick up values from tables; */
4222 max = rep_max[c]; /* zero for max => infinity */
4223 if (max == 0) max = INT_MAX; /* No goto needed: control falls through into REPEATTYPE */
4224
4225 /* Common code for all repeated single character type matches. Note that
4226 in UTF mode, '.' matches a character of any length, but the minimum loops for
4227 OP_DIGIT, OP_WHITESPACE and OP_WORDCHAR accept only one-code-unit characters. */
4228
4229 REPEATTYPE:
4230 ctype = *ecode++; /* Code for the character type */
4231
4232 #ifdef SUPPORT_UNICODE
4233 if (ctype == OP_PROP || ctype == OP_NOTPROP)
4234 {
4235 prop_fail_result = ctype == OP_NOTPROP;
4236 prop_type = *ecode++;
4237 prop_value = *ecode++;
4238 }
4239 else prop_type = -1;
4240 #endif
4241
4242 /* First, ensure the minimum number of matches is present. Use inline
4243 code for maximizing the speed, and do the type test once at the start
4244 (i.e. keep it out of the loop). Separate the UTF code completely as that
4245 is tidier. Also separate the UCP code, which can be the same for both UTF
4246 and non-UTF modes. */
4247
4248 if (min > 0)
4249 {
4250 #ifdef SUPPORT_UNICODE
4251 if (prop_type >= 0)
4252 {
4253 switch(prop_type)
4254 {
4255 case PT_ANY:
4256 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4257 for (i = 1; i <= min; i++)
4258 {
4259 if (eptr >= mb->end_subject)
4260 {
4261 SCHECK_PARTIAL();
4262 RRETURN(MATCH_NOMATCH);
4263 }
4264 GETCHARINCTEST(c, eptr);
4265 }
4266 break;
4267
4268 case PT_LAMP:
4269 for (i = 1; i <= min; i++)
4270 {
4271 int chartype;
4272 if (eptr >= mb->end_subject)
4273 {
4274 SCHECK_PARTIAL();
4275 RRETURN(MATCH_NOMATCH);
4276 }
4277 GETCHARINCTEST(c, eptr);
4278 chartype = UCD_CHARTYPE(c);
4279 if ((chartype == ucp_Lu ||
4280 chartype == ucp_Ll ||
4281 chartype == ucp_Lt) == prop_fail_result)
4282 RRETURN(MATCH_NOMATCH);
4283 }
4284 break;
4285
4286 case PT_GC:
4287 for (i = 1; i <= min; i++)
4288 {
4289 if (eptr >= mb->end_subject)
4290 {
4291 SCHECK_PARTIAL();
4292 RRETURN(MATCH_NOMATCH);
4293 }
4294 GETCHARINCTEST(c, eptr);
4295 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4296 RRETURN(MATCH_NOMATCH);
4297 }
4298 break;
4299
4300 case PT_PC:
4301 for (i = 1; i <= min; i++)
4302 {
4303 if (eptr >= mb->end_subject)
4304 {
4305 SCHECK_PARTIAL();
4306 RRETURN(MATCH_NOMATCH);
4307 }
4308 GETCHARINCTEST(c, eptr);
4309 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4310 RRETURN(MATCH_NOMATCH);
4311 }
4312 break;
4313
4314 case PT_SC:
4315 for (i = 1; i <= min; i++)
4316 {
4317 if (eptr >= mb->end_subject)
4318 {
4319 SCHECK_PARTIAL();
4320 RRETURN(MATCH_NOMATCH);
4321 }
4322 GETCHARINCTEST(c, eptr);
4323 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4324 RRETURN(MATCH_NOMATCH);
4325 }
4326 break;
4327
4328 case PT_ALNUM:
4329 for (i = 1; i <= min; i++)
4330 {
4331 int category;
4332 if (eptr >= mb->end_subject)
4333 {
4334 SCHECK_PARTIAL();
4335 RRETURN(MATCH_NOMATCH);
4336 }
4337 GETCHARINCTEST(c, eptr);
4338 category = UCD_CATEGORY(c);
4339 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4340 RRETURN(MATCH_NOMATCH);
4341 }
4342 break;
4343
4344 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
4345 which means that Perl space and POSIX space are now identical. PCRE
4346 was changed at release 8.34. */
4347
4348 case PT_SPACE: /* Perl space */
4349 case PT_PXSPACE: /* POSIX space */
4350 for (i = 1; i <= min; i++)
4351 {
4352 if (eptr >= mb->end_subject)
4353 {
4354 SCHECK_PARTIAL();
4355 RRETURN(MATCH_NOMATCH);
4356 }
4357 GETCHARINCTEST(c, eptr);
4358 switch(c)
4359 {
4360 HSPACE_CASES:
4361 VSPACE_CASES:
4362 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4363 break;
4364
4365 default:
4366 if ((UCD_CATEGORY(c) == ucp_Z) == prop_fail_result)
4367 RRETURN(MATCH_NOMATCH);
4368 break;
4369 }
4370 }
4371 break;
4372
4373 case PT_WORD:
4374 for (i = 1; i <= min; i++)
4375 {
4376 int category;
4377 if (eptr >= mb->end_subject)
4378 {
4379 SCHECK_PARTIAL();
4380 RRETURN(MATCH_NOMATCH);
4381 }
4382 GETCHARINCTEST(c, eptr);
4383 category = UCD_CATEGORY(c);
4384 if ((category == ucp_L || category == ucp_N || c == CHAR_UNDERSCORE)
4385 == prop_fail_result)
4386 RRETURN(MATCH_NOMATCH);
4387 }
4388 break;
4389
4390 case PT_CLIST:
4391 for (i = 1; i <= min; i++)
4392 {
4393 const uint32_t *cp;
4394 if (eptr >= mb->end_subject)
4395 {
4396 SCHECK_PARTIAL();
4397 RRETURN(MATCH_NOMATCH);
4398 }
4399 GETCHARINCTEST(c, eptr);
4400 cp = PRIV(ucd_caseless_sets) + prop_value;
4401 for (;;)
4402 {
4403 if (c < *cp)
4404 { if (prop_fail_result) break; else { RRETURN(MATCH_NOMATCH); } }
4405 if (c == *cp++)
4406 { if (prop_fail_result) { RRETURN(MATCH_NOMATCH); } else break; }
4407 }
4408 }
4409 break;
4410
4411 case PT_UCNC:
4412 for (i = 1; i <= min; i++)
4413 {
4414 if (eptr >= mb->end_subject)
4415 {
4416 SCHECK_PARTIAL();
4417 RRETURN(MATCH_NOMATCH);
4418 }
4419 GETCHARINCTEST(c, eptr);
4420 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
4421 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
4422 c >= 0xe000) == prop_fail_result)
4423 RRETURN(MATCH_NOMATCH);
4424 }
4425 break;
4426
4427 /* This should not occur */
4428
4429 default:
4430 RRETURN(PCRE2_ERROR_INTERNAL);
4431 }
4432 }
4433
4434 /* Match extended Unicode sequences. We will get here only if the
4435 support is in the binary; otherwise a compile-time error occurs. */
4436
4437 else if (ctype == OP_EXTUNI)
4438 {
4439 for (i = 1; i <= min; i++)
4440 {
4441 if (eptr >= mb->end_subject)
4442 {
4443 SCHECK_PARTIAL();
4444 RRETURN(MATCH_NOMATCH);
4445 }
4446 else
4447 {
4448 int lgb, rgb;
4449 GETCHARINCTEST(c, eptr);
4450 lgb = UCD_GRAPHBREAK(c);
4451 while (eptr < mb->end_subject)
4452 {
4453 int len = 1;
4454 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4455 rgb = UCD_GRAPHBREAK(c);
4456 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
4457 lgb = rgb;
4458 eptr += len;
4459 }
4460 }
4461 CHECK_PARTIAL();
4462 }
4463 }
4464
4465 else
4466 #endif /* SUPPORT_UNICODE */
4467
4468 /* Handle all other cases in UTF mode */
4469
4470 #ifdef SUPPORT_UNICODE
4471 if (utf) switch(ctype)
4472 {
4473 case OP_ANY:
4474 for (i = 1; i <= min; i++)
4475 {
4476 if (eptr >= mb->end_subject)
4477 {
4478 SCHECK_PARTIAL();
4479 RRETURN(MATCH_NOMATCH);
4480 }
4481 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4482 if (mb->partial != 0 &&
4483 eptr + 1 >= mb->end_subject &&
4484 NLBLOCK->nltype == NLTYPE_FIXED &&
4485 NLBLOCK->nllen == 2 &&
4486 UCHAR21(eptr) == NLBLOCK->nl[0])
4487 {
4488 mb->hitend = TRUE;
4489 if (mb->partial > 1) RRETURN(PCRE2_ERROR_PARTIAL);
4490 }
4491 eptr++;
4492 ACROSSCHAR(eptr < mb->end_subject, *eptr, eptr++);
4493 }
4494 break;
4495
4496 case OP_ALLANY:
4497 for (i = 1; i <= min; i++)
4498 {
4499 if (eptr >= mb->end_subject)
4500 {
4501 SCHECK_PARTIAL();
4502 RRETURN(MATCH_NOMATCH);
4503 }
4504 eptr++;
4505 ACROSSCHAR(eptr < mb->end_subject, *eptr, eptr++);
4506 }
4507 break;
4508
4509 case OP_ANYBYTE:
4510 if (eptr > mb->end_subject - min) RRETURN(MATCH_NOMATCH);
4511 eptr += min;
4512 break;
4513
4514 case OP_ANYNL:
4515 for (i = 1; i <= min; i++)
4516 {
4517 if (eptr >= mb->end_subject)
4518 {
4519 SCHECK_PARTIAL();
4520 RRETURN(MATCH_NOMATCH);
4521 }
4522 GETCHARINC(c, eptr);
4523 switch(c)
4524 {
4525 default: RRETURN(MATCH_NOMATCH);
4526
4527 case CHAR_CR:
4528 if (eptr < mb->end_subject && UCHAR21(eptr) == CHAR_LF) eptr++;
4529 break;
4530
4531 case CHAR_LF:
4532 break;
4533
4534 case CHAR_VT:
4535 case CHAR_FF:
4536 case CHAR_NEL:
4537 #ifndef EBCDIC
4538 case 0x2028:
4539 case 0x2029:
4540 #endif /* Not EBCDIC */
4541 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) RRETURN(MATCH_NOMATCH);
4542 break;
4543 }
4544 }
4545 break;
4546
4547 case OP_NOT_HSPACE:
4548 for (i = 1; i <= min; i++)
4549 {
4550 if (eptr >= mb->end_subject)
4551 {
4552 SCHECK_PARTIAL();
4553 RRETURN(MATCH_NOMATCH);
4554 }
4555 GETCHARINC(c, eptr);
4556 switch(c)
4557 {
4558 HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */
4559 default: break;
4560 }
4561 }
4562 break;
4563
4564 case OP_HSPACE:
4565 for (i = 1; i <= min; i++)
4566 {
4567 if (eptr >= mb->end_subject)
4568 {
4569 SCHECK_PARTIAL();
4570 RRETURN(MATCH_NOMATCH);
4571 }
4572 GETCHARINC(c, eptr);
4573 switch(c)
4574 {
4575 HSPACE_CASES: break; /* Byte and multibyte cases */
4576 default: RRETURN(MATCH_NOMATCH);
4577 }
4578 }
4579 break;
4580
4581 case OP_NOT_VSPACE:
4582 for (i = 1; i <= min; i++)
4583 {
4584 if (eptr >= mb->end_subject)
4585 {
4586 SCHECK_PARTIAL();
4587 RRETURN(MATCH_NOMATCH);
4588 }
4589 GETCHARINC(c, eptr);
4590 switch(c)
4591 {
4592 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
4593 default: break;
4594 }
4595 }
4596 break;
4597
4598 case OP_VSPACE:
4599 for (i = 1; i <= min; i++)
4600 {
4601 if (eptr >= mb->end_subject)
4602 {
4603 SCHECK_PARTIAL();
4604 RRETURN(MATCH_NOMATCH);
4605 }
4606 GETCHARINC(c, eptr);
4607 switch(c)
4608 {
4609 VSPACE_CASES: break;
4610 default: RRETURN(MATCH_NOMATCH);
4611 }
4612 }
4613 break;
4614
4615 case OP_NOT_DIGIT:
4616 for (i = 1; i <= min; i++)
4617 {
4618 if (eptr >= mb->end_subject)
4619 {
4620 SCHECK_PARTIAL();
4621 RRETURN(MATCH_NOMATCH);
4622 }
4623 GETCHARINC(c, eptr);
4624 if (c < 128 && (mb->ctypes[c] & ctype_digit) != 0)
4625 RRETURN(MATCH_NOMATCH);
4626 }
4627 break;
4628
4629 case OP_DIGIT:
4630 for (i = 1; i <= min; i++)
4631 {
4632 uint32_t cc;
4633 if (eptr >= mb->end_subject)
4634 {
4635 SCHECK_PARTIAL();
4636 RRETURN(MATCH_NOMATCH);
4637 }
4638 cc = UCHAR21(eptr);
4639 if (cc >= 128 || (mb->ctypes[cc] & ctype_digit) == 0)
4640 RRETURN(MATCH_NOMATCH);
4641 eptr++;
4642 /* No need to skip more code units - we know it's a one-unit character */
4643 }
4644 break;
4645
4646 case OP_NOT_WHITESPACE:
4647 for (i = 1; i <= min; i++)
4648 {
4649 uint32_t cc;
4650 if (eptr >= mb->end_subject)
4651 {
4652 SCHECK_PARTIAL();
4653 RRETURN(MATCH_NOMATCH);
4654 }
4655 cc = UCHAR21(eptr);
4656 if (cc < 128 && (mb->ctypes[cc] & ctype_space) != 0)
4657 RRETURN(MATCH_NOMATCH);
4658 eptr++;
4659 ACROSSCHAR(eptr < mb->end_subject, *eptr, eptr++);
4660 }
4661 break;
4662
4663 case OP_WHITESPACE:
4664 for (i = 1; i <= min; i++)
4665 {
4666 uint32_t cc;
4667 if (eptr >= mb->end_subject)
4668 {
4669 SCHECK_PARTIAL();
4670 RRETURN(MATCH_NOMATCH);
4671 }
4672 cc = UCHAR21(eptr);
4673 if (cc >= 128 || (mb->ctypes[cc] & ctype_space) == 0)
4674 RRETURN(MATCH_NOMATCH);
4675 eptr++;
4676 /* No need to skip more code units - we know it's a one-unit character */
4677 }
4678 break;
4679
4680 case OP_NOT_WORDCHAR:
4681 for (i = 1; i <= min; i++)
4682 {
4683 uint32_t cc;
4684 if (eptr >= mb->end_subject)
4685 {
4686 SCHECK_PARTIAL();
4687 RRETURN(MATCH_NOMATCH);
4688 }
4689 cc = UCHAR21(eptr);
4690 if (cc < 128 && (mb->ctypes[cc] & ctype_word) != 0)
4691 RRETURN(MATCH_NOMATCH);
4692 eptr++;
4693 ACROSSCHAR(eptr < mb->end_subject, *eptr, eptr++);
4694 }
4695 break;
4696
4697 case OP_WORDCHAR:
4698 for (i = 1; i <= min; i++)
4699 {
4700 uint32_t cc;
4701 if (eptr >= mb->end_subject)
4702 {
4703 SCHECK_PARTIAL();
4704 RRETURN(MATCH_NOMATCH);
4705 }
4706 cc = UCHAR21(eptr);
4707 if (cc >= 128 || (mb->ctypes[cc] & ctype_word) == 0)
4708 RRETURN(MATCH_NOMATCH);
4709 eptr++;
4710 /* No need to skip more code units - we know it's a one-unit character */
4711 }
4712 break;
4713
4714 default:
4715 RRETURN(PCRE2_ERROR_INTERNAL);
4716 } /* End switch(ctype) */
4717
4718 else
4719 #endif /* SUPPORT_UNICODE */
4720
4721 /* Code for the non-UTF case for minimum matching of operators other
4722 than OP_PROP and OP_NOTPROP. */
4723
4724 switch(ctype)
4725 {
4726 case OP_ANY:
4727 for (i = 1; i <= min; i++)
4728 {
4729 if (eptr >= mb->end_subject)
4730 {
4731 SCHECK_PARTIAL();
4732 RRETURN(MATCH_NOMATCH);
4733 }
4734 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4735 if (mb->partial != 0 &&
4736 eptr + 1 >= mb->end_subject &&
4737 NLBLOCK->nltype == NLTYPE_FIXED &&
4738 NLBLOCK->nllen == 2 &&
4739 *eptr == NLBLOCK->nl[0])
4740 {
4741 mb->hitend = TRUE;
4742 if (mb->partial > 1) RRETURN(PCRE2_ERROR_PARTIAL);
4743 }
4744 eptr++;
4745 }
4746 break;
4747
4748 case OP_ALLANY:
4749 if (eptr > mb->end_subject - min)
4750 {
4751 SCHECK_PARTIAL();
4752 RRETURN(MATCH_NOMATCH);
4753 }
4754 eptr += min;
4755 break;
4756
4757 case OP_ANYBYTE:
4758 if (eptr > mb->end_subject - min)
4759 {
4760 SCHECK_PARTIAL();
4761 RRETURN(MATCH_NOMATCH);
4762 }
4763 eptr += min;
4764 break;
4765
4766 case OP_ANYNL:
4767 for (i = 1; i <= min; i++)
4768 {
4769 if (eptr >= mb->end_subject)
4770 {
4771 SCHECK_PARTIAL();
4772 RRETURN(MATCH_NOMATCH);
4773 }
4774 switch(*eptr++)
4775 {
4776 default: RRETURN(MATCH_NOMATCH);
4777
4778 case CHAR_CR:
4779 if (eptr < mb->end_subject && *eptr == CHAR_LF) eptr++;
4780 break;
4781
4782 case CHAR_LF:
4783 break;
4784
4785 case CHAR_VT:
4786 case CHAR_FF:
4787 case CHAR_NEL:
4788 #if PCRE2_CODE_UNIT_WIDTH != 8
4789 case 0x2028:
4790 case 0x2029:
4791 #endif
4792 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) RRETURN(MATCH_NOMATCH);
4793 break;
4794 }
4795 }
4796 break;
4797
4798 case OP_NOT_HSPACE:
4799 for (i = 1; i <= min; i++)
4800 {
4801 if (eptr >= mb->end_subject)
4802 {
4803 SCHECK_PARTIAL();
4804 RRETURN(MATCH_NOMATCH);
4805 }
4806 switch(*eptr++)
4807 {
4808 default: break;
4809 HSPACE_BYTE_CASES:
4810 #if PCRE2_CODE_UNIT_WIDTH != 8
4811 HSPACE_MULTIBYTE_CASES:
4812 #endif
4813 RRETURN(MATCH_NOMATCH);
4814 }
4815 }
4816 break;
4817
4818 case OP_HSPACE:
4819 for (i = 1; i <= min; i++)
4820 {
4821 if (eptr >= mb->end_subject)
4822 {
4823 SCHECK_PARTIAL();
4824 RRETURN(MATCH_NOMATCH);
4825 }
4826 switch(*eptr++)
4827 {
4828 default: RRETURN(MATCH_NOMATCH);
4829 HSPACE_BYTE_CASES:
4830 #if PCRE2_CODE_UNIT_WIDTH != 8
4831 HSPACE_MULTIBYTE_CASES:
4832 #endif
4833 break;
4834 }
4835 }
4836 break;
4837
4838 case OP_NOT_VSPACE:
4839 for (i = 1; i <= min; i++)
4840 {
4841 if (eptr >= mb->end_subject)
4842 {
4843 SCHECK_PARTIAL();
4844 RRETURN(MATCH_NOMATCH);
4845 }
4846 switch(*eptr++)
4847 {
4848 VSPACE_BYTE_CASES:
4849 #if PCRE2_CODE_UNIT_WIDTH != 8
4850 VSPACE_MULTIBYTE_CASES:
4851 #endif
4852 RRETURN(MATCH_NOMATCH);
4853 default: break;
4854 }
4855 }
4856 break;
4857
4858 case OP_VSPACE:
4859 for (i = 1; i <= min; i++)
4860 {
4861 if (eptr >= mb->end_subject)
4862 {
4863 SCHECK_PARTIAL();
4864 RRETURN(MATCH_NOMATCH);
4865 }
4866 switch(*eptr++)
4867 {
4868 default: RRETURN(MATCH_NOMATCH);
4869 VSPACE_BYTE_CASES:
4870 #if PCRE2_CODE_UNIT_WIDTH != 8
4871 VSPACE_MULTIBYTE_CASES:
4872 #endif
4873 break;
4874 }
4875 }
4876 break;
4877
4878 case OP_NOT_DIGIT:
4879 for (i = 1; i <= min; i++)
4880 {
4881 if (eptr >= mb->end_subject)
4882 {
4883 SCHECK_PARTIAL();
4884 RRETURN(MATCH_NOMATCH);
4885 }
4886 if (MAX_255(*eptr) && (mb->ctypes[*eptr] & ctype_digit) != 0)
4887 RRETURN(MATCH_NOMATCH);
4888 eptr++;
4889 }
4890 break;
4891
4892 case OP_DIGIT:
4893 for (i = 1; i <= min; i++)
4894 {
4895 if (eptr >= mb->end_subject)
4896 {
4897 SCHECK_PARTIAL();
4898 RRETURN(MATCH_NOMATCH);
4899 }
4900 if (!MAX_255(*eptr) || (mb->ctypes[*eptr] & ctype_digit) == 0)
4901 RRETURN(MATCH_NOMATCH);
4902 eptr++;
4903 }
4904 break;
4905
4906 case OP_NOT_WHITESPACE:
4907 for (i = 1; i <= min; i++)
4908 {
4909 if (eptr >= mb->end_subject)
4910 {
4911 SCHECK_PARTIAL();
4912 RRETURN(MATCH_NOMATCH);
4913 }
4914 if (MAX_255(*eptr) && (mb->ctypes[*eptr] & ctype_space) != 0)
4915 RRETURN(MATCH_NOMATCH);
4916 eptr++;
4917 }
4918 break;
4919
4920 case OP_WHITESPACE:
4921 for (i = 1; i <= min; i++)
4922 {
4923 if (eptr >= mb->end_subject)
4924 {
4925 SCHECK_PARTIAL();
4926 RRETURN(MATCH_NOMATCH);
4927 }
4928 if (!MAX_255(*eptr) || (mb->ctypes[*eptr] & ctype_space) == 0)
4929 RRETURN(MATCH_NOMATCH);
4930 eptr++;
4931 }
4932 break;
4933
4934 case OP_NOT_WORDCHAR:
4935 for (i = 1; i <= min; i++)
4936 {
4937 if (eptr >= mb->end_subject)
4938 {
4939 SCHECK_PARTIAL();
4940 RRETURN(MATCH_NOMATCH);
4941 }
4942 if (MAX_255(*eptr) && (mb->ctypes[*eptr] & ctype_word) != 0)
4943 RRETURN(MATCH_NOMATCH);
4944 eptr++;
4945 }
4946 break;
4947
4948 case OP_WORDCHAR:
4949 for (i = 1; i <= min; i++)
4950 {
4951 if (eptr >= mb->end_subject)
4952 {
4953 SCHECK_PARTIAL();
4954 RRETURN(MATCH_NOMATCH);
4955 }
4956 if (!MAX_255(*eptr) || (mb->ctypes[*eptr] & ctype_word) == 0)
4957 RRETURN(MATCH_NOMATCH);
4958 eptr++;
4959 }
4960 break;
4961
4962 default:
4963 RRETURN(PCRE2_ERROR_INTERNAL);
4964 }
4965 }
4966
4967 /* If min = max, continue at the same level without recursing */
4968
4969 if (min == max) continue;
4970
4971 /* If minimizing, we have to test the rest of the pattern before each
4972 subsequent match. Again, separate the UTF case for speed, and also
4973 separate the UCP cases. */
4974
4975 if (minimize)
4976 {
4977 #ifdef SUPPORT_UNICODE
4978 if (prop_type >= 0)
4979 {
4980 switch(prop_type)
4981 {
4982 case PT_ANY:
4983 for (fi = min;; fi++)
4984 {
4985 RMATCH(eptr, ecode, offset_top, mb, eptrb, RM36);
4986 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4987 if (fi >= max) RRETURN(MATCH_NOMATCH);
4988 if (eptr >= mb->end_subject)
4989 {
4990 SCHECK_PARTIAL();
4991 RRETURN(MATCH_NOMATCH);
4992 }
4993 GETCHARINCTEST(c, eptr);
4994 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4995 }
4996 /* Control never gets here */
4997
4998 case PT_LAMP:
4999 for (fi = min;; fi++)
5000 {
5001 int chartype;
5002 RMATCH(eptr, ecode, offset_top, mb, eptrb, RM37);
5003 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5004 if (fi >= max) RRETURN(MATCH_NOMATCH);
5005 if (eptr >= mb->end_subject)
5006 {
5007 SCHECK_PARTIAL();
5008 RRETURN(MATCH_NOMATCH);
5009 }
5010 GETCHARINCTEST(c, eptr);
5011 chartype = UCD_CHARTYPE(c);
5012 if ((chartype == ucp_Lu ||
5013 chartype == ucp_Ll ||
5014 chartype == ucp_Lt) == prop_fail_result)
5015 RRETURN(MATCH_NOMATCH);
5016 }
5017 /* Control never gets here */
5018
5019 case PT_GC:
5020 for (fi = min;; fi++)
5021 {
5022 RMATCH(eptr, ecode, offset_top, mb, eptrb, RM38);
5023 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5024 if (fi >= max) RRETURN(MATCH_NOMATCH);
5025 if (eptr >= mb->end_subject)
5026 {
5027 SCHECK_PARTIAL();
5028 RRETURN(MATCH_NOMATCH);
5029 }
5030 GETCHARINCTEST(c, eptr);
5031 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
5032 RRETURN(MATCH_NOMATCH);
5033 }
5034 /* Control never gets here */
5035
5036 case PT_PC:
5037 for (fi = min;; fi++)
5038 {
5039 RMATCH(eptr, ecode, offset_top, mb, eptrb, RM39);
5040 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5041 if (fi >= max) RRETURN(MATCH_NOMATCH);
5042 if (eptr >= mb->end_subject)
5043 {
5044 SCHECK_PARTIAL();
5045 RRETURN(MATCH_NOMATCH);
5046 }
5047 GETCHARINCTEST(c, eptr);
5048 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
5049 RRETURN(MATCH_NOMATCH);
5050 }
5051 /* Control never gets here */
5052
5053 case PT_SC:
5054 for (fi = min;; fi++)
5055 {
5056 RMATCH(eptr, ecode, offset_top, mb, eptrb, RM40);
5057 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5058 if (fi >= max) RRETURN(MATCH_NOMATCH);
5059 if (eptr >= mb->end_subject)
5060 {
5061 SCHECK_PARTIAL();
5062 RRETURN(MATCH_NOMATCH);
5063 }
5064 GETCHARINCTEST(c, eptr);
5065 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
5066 RRETURN(MATCH_NOMATCH);
5067 }
5068 /* Control never gets here */
5069
5070 case PT_ALNUM:
5071 for (fi = min;; fi++)
5072 {
5073 int category;
5074 RMATCH(eptr, ecode, offset_top, mb, eptrb, RM59);
5075 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5076 if (fi >= max) RRETURN(MATCH_NOMATCH);
5077 if (eptr >= mb->end_subject)
5078 {
5079 SCHECK_PARTIAL();
5080 RRETURN(MATCH_NOMATCH);
5081 }
5082 GETCHARINCTEST(c, eptr);
5083 category = UCD_CATEGORY(c);
5084 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
5085 RRETURN(MATCH_NOMATCH);
5086 }
5087 /* Control never gets here */
5088
5089 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
5090 which means that Perl space and POSIX space are now identical. PCRE
5091 was changed at release 8.34. */
5092
5093 case PT_SPACE: /* Perl space */
5094 case PT_PXSPACE: /* POSIX space */
5095 for (fi = min;; fi++)
5096 {
5097 RMATCH(eptr, ecode, offset_top, mb, eptrb, RM61);
5098 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5099 if (fi >= max) RRETURN(MATCH_NOMATCH);
5100 if (eptr >= mb->end_subject)
5101 {
5102 SCHECK_PARTIAL();
5103 RRETURN(MATCH_NOMATCH);
5104 }
5105 GETCHARINCTEST(c, eptr);
5106 switch(c)
5107 {
5108 HSPACE_CASES:
5109 VSPACE_CASES:
5110 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
5111 break;
5112
5113 default:
5114 if ((UCD_CATEGORY(c) == ucp_Z) == prop_fail_result)
5115 RRETURN(MATCH_NOMATCH);
5116 break;
5117 }
5118 }
5119 /* Control never gets here */
5120
5121 case PT_WORD:
5122 for (fi = min;; fi++)
5123 {
5124 int category;
5125 RMATCH(eptr, ecode, offset_top, mb, eptrb, RM62);
5126 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5127 if (fi >= max) RRETURN(MATCH_NOMATCH);
5128 if (eptr >= mb->end_subject)
5129 {
5130 SCHECK_PARTIAL();
5131 RRETURN(MATCH_NOMATCH);
5132 }
5133 GETCHARINCTEST(c, eptr);
5134 category = UCD_CATEGORY(c);
5135 if ((category == ucp_L ||
5136 category == ucp_N ||
5137 c == CHAR_UNDERSCORE)
5138 == prop_fail_result)
5139 RRETURN(MATCH_NOMATCH);
5140 }
5141 /* Control never gets here */
5142
5143 case PT_CLIST:
5144 for (fi = min;; fi++)
5145 {
5146 const uint32_t *cp;
5147 RMATCH(eptr, ecode, offset_top, mb, eptrb, RM67);
5148 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5149 if (fi >= max) RRETURN(MATCH_NOMATCH);
5150 if (eptr >= mb->end_subject)
5151 {
5152 SCHECK_PARTIAL();
5153 RRETURN(MATCH_NOMATCH);
5154 }
5155 GETCHARINCTEST(c, eptr);
5156 cp = PRIV(ucd_caseless_sets) + prop_value;
5157 for (;;)
5158 {
5159 if (c < *cp)
5160 { if (prop_fail_result) break; else { RRETURN(MATCH_NOMATCH); } }
5161 if (c == *cp++)
5162 { if (prop_fail_result) { RRETURN(MATCH_NOMATCH); } else break; }
5163 }
5164 }
5165 /* Control never gets here */
5166
5167 case PT_UCNC:
5168 for (fi = min;; fi++)
5169 {
5170 RMATCH(eptr, ecode, offset_top, mb, eptrb, RM60);
5171 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5172 if (fi >= max) RRETURN(MATCH_NOMATCH);
5173 if (eptr >= mb->end_subject)
5174 {
5175 SCHECK_PARTIAL();
5176 RRETURN(MATCH_NOMATCH);
5177 }
5178 GETCHARINCTEST(c, eptr);
5179 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
5180 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
5181 c >= 0xe000) == prop_fail_result)
5182 RRETURN(MATCH_NOMATCH);
5183 }
5184 /* Control never gets here */
5185
5186 /* This should never occur */
5187 default:
5188 RRETURN(PCRE2_ERROR_INTERNAL);
5189 }
5190 }
5191
5192 /* Match extended Unicode sequences. We will get here only if the
5193 support is in the binary; otherwise a compile-time error occurs. */
5194
5195 else if (ctype == OP_EXTUNI)
5196 {
5197 for (fi = min;; fi++)
5198 {
5199 RMATCH(eptr, ecode, offset_top, mb, eptrb, RM41);
5200 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5201 if (fi >= max) RRETURN(MATCH_NOMATCH);
5202 if (eptr >= mb->end_subject)
5203 {
5204 SCHECK_PARTIAL();
5205 RRETURN(MATCH_NOMATCH);
5206 }
5207 else
5208 {
5209 int lgb, rgb;
5210 GETCHARINCTEST(c, eptr);
5211 lgb = UCD_GRAPHBREAK(c);
5212 while (eptr < mb->end_subject)
5213 {
5214 int len = 1;
5215 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5216 rgb = UCD_GRAPHBREAK(c);
5217 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
5218 lgb = rgb;
5219 eptr += len;
5220 }
5221 }
5222 CHECK_PARTIAL();
5223 }
5224 }
5225 else
5226 #endif /* SUPPORT_UNICODE */
5227
5228 #ifdef SUPPORT_UNICODE
5229 if (utf)
5230 {
5231 for (fi = min;; fi++)
5232 {
5233 RMATCH(eptr, ecode, offset_top, mb, eptrb, RM42);
5234 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5235 if (fi >= max) RRETURN(MATCH_NOMATCH);
5236 if (eptr >= mb->end_subject)
5237 {
5238 SCHECK_PARTIAL();
5239 RRETURN(MATCH_NOMATCH);
5240 }
5241 if (ctype == OP_ANY && IS_NEWLINE(eptr))
5242 RRETURN(MATCH_NOMATCH);
5243 GETCHARINC(c, eptr);
5244 switch(ctype)
5245 {
5246 case OP_ANY: /* This is the non-NL case */
5247 if (mb->partial != 0 && /* Take care with CRLF partial */
5248 eptr >= mb->end_subject &&
5249 NLBLOCK->nltype == NLTYPE_FIXED &&
5250 NLBLOCK->nllen == 2 &&
5251 c == NLBLOCK->nl[0])
5252 {
5253 mb->hitend = TRUE;
5254 if (mb->partial > 1) RRETURN(PCRE2_ERROR_PARTIAL);
5255 }
5256 break;
5257
5258 case OP_ALLANY:
5259 case OP_ANYBYTE:
5260 break;
5261
5262 case OP_ANYNL:
5263 switch(c)
5264 {
5265 default: RRETURN(MATCH_NOMATCH);
5266 case CHAR_CR:
5267 if (eptr < mb->end_subject && UCHAR21(eptr) == CHAR_LF) eptr++;
5268 break;
5269
5270 case CHAR_LF:
5271 break;
5272
5273 case CHAR_VT:
5274 case CHAR_FF:
5275 case CHAR_NEL:
5276 #ifndef EBCDIC
5277 case 0x2028:
5278 case 0x2029:
5279 #endif /* Not EBCDIC */
5280 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) RRETURN(MATCH_NOMATCH);
5281 break;
5282 }
5283 break;
5284
5285 case OP_NOT_HSPACE:
5286 switch(c)
5287 {
5288 HSPACE_CASES: RRETURN(MATCH_NOMATCH);
5289 default: break;
5290 }
5291 break;
5292
5293 case OP_HSPACE:
5294 switch(c)
5295 {
5296 HSPACE_CASES: break;
5297 default: RRETURN(MATCH_NOMATCH);
5298 }
5299 break;
5300
5301 case OP_NOT_VSPACE:
5302 switch(c)
5303 {
5304 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
5305 default: break;
5306 }
5307 break;
5308
5309 case OP_VSPACE:
5310 switch(c)
5311 {
5312 VSPACE_CASES: break;
5313 default: RRETURN(MATCH_NOMATCH);
5314 }
5315 break;
5316
5317 case OP_NOT_DIGIT:
5318 if (c < 256 && (mb->ctypes[c] & ctype_digit) != 0)
5319 RRETURN(MATCH_NOMATCH);
5320 break;
5321
5322 case OP_DIGIT:
5323 if (c >= 256 || (mb->ctypes[c] & ctype_digit) == 0)
5324 RRETURN(MATCH_NOMATCH);
5325 break;
5326
5327 case OP_NOT_WHITESPACE:
5328 if (c < 256 && (mb->ctypes[c] & ctype_space) != 0)
5329 RRETURN(MATCH_NOMATCH);
5330 break;
5331
5332 case OP_WHITESPACE:
5333 if (c >= 256 || (mb->ctypes[c] & ctype_space) == 0)
5334 RRETURN(MATCH_NOMATCH);
5335 break;
5336
5337 case OP_NOT_WORDCHAR:
5338 if (c < 256 && (mb->ctypes[c] & ctype_word) != 0)
5339 RRETURN(MATCH_NOMATCH);
5340 break;
5341
5342 case OP_WORDCHAR:
5343 if (c >= 256 || (mb->ctypes[c] & ctype_word) == 0)
5344 RRETURN(MATCH_NOMATCH);
5345 break;
5346
5347 default:
5348 RRETURN(PCRE2_ERROR_INTERNAL);
5349 }
5350 }
5351 }
5352 else
5353 #endif
5354 /* Not UTF mode */
5355 {
5356 for (fi = min;; fi++)
5357 {
5358 RMATCH(eptr, ecode, offset_top, mb, eptrb, RM43);
5359 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5360 if (fi >= max) RRETURN(MATCH_NOMATCH);
5361 if (eptr >= mb->end_subject)
5362 {
5363 SCHECK_PARTIAL();
5364 RRETURN(MATCH_NOMATCH);
5365 }
5366 if (ctype == OP_ANY && IS_NEWLINE(eptr))
5367 RRETURN(MATCH_NOMATCH);
5368 c = *eptr++;
5369 switch(ctype)
5370 {
5371 case OP_ANY: /* This is the non-NL case */
5372 if (mb->partial != 0 && /* Take care with CRLF partial */
5373 eptr >= mb->end_subject &&
5374 NLBLOCK->nltype == NLTYPE_FIXED &&
5375 NLBLOCK->nllen == 2 &&
5376 c == NLBLOCK->nl[0])
5377 {
5378 mb->hitend = TRUE;
5379 if (mb->partial > 1) RRETURN(PCRE2_ERROR_PARTIAL);
5380 }
5381 break;
5382
5383 case OP_ALLANY:
5384 case OP_ANYBYTE:
5385 break;
5386
5387 case OP_ANYNL:
5388 switch(c)
5389 {
5390 default: RRETURN(MATCH_NOMATCH);
5391 case CHAR_CR:
5392 if (eptr < mb->end_subject && *eptr == CHAR_LF) eptr++;
5393 break;
5394
5395 case CHAR_LF:
5396 break;
5397
5398 case CHAR_VT:
5399 case CHAR_FF:
5400 case CHAR_NEL:
5401 #if PCRE2_CODE_UNIT_WIDTH != 8
5402 case 0x2028:
5403 case 0x2029:
5404 #endif
5405 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) RRETURN(MATCH_NOMATCH);
5406 break;
5407 }
5408 break;
5409
5410 case OP_NOT_HSPACE:
5411 switch(c)
5412 {
5413 default: break;
5414 HSPACE_BYTE_CASES:
5415 #if PCRE2_CODE_UNIT_WIDTH != 8
5416 HSPACE_MULTIBYTE_CASES:
5417 #endif
5418 RRETURN(MATCH_NOMATCH);
5419 }
5420 break;
5421
5422 case OP_HSPACE:
5423 switch(c)
5424 {
5425 default: RRETURN(MATCH_NOMATCH);
5426 HSPACE_BYTE_CASES:
5427 #if PCRE2_CODE_UNIT_WIDTH != 8
5428 HSPACE_MULTIBYTE_CASES:
5429 #endif
5430 break;
5431 }
5432 break;
5433
5434 case OP_NOT_VSPACE:
5435 switch(c)
5436 {
5437 default: break;
5438 VSPACE_BYTE_CASES:
5439 #if PCRE2_CODE_UNIT_WIDTH != 8
5440 VSPACE_MULTIBYTE_CASES:
5441 #endif
5442 RRETURN(MATCH_NOMATCH);
5443 }
5444 break;
5445
5446 case OP_VSPACE:
5447 switch(c)
5448 {
5449 default: RRETURN(MATCH_NOMATCH);
5450 VSPACE_BYTE_CASES:
5451 #if PCRE2_CODE_UNIT_WIDTH != 8
5452 VSPACE_MULTIBYTE_CASES:
5453 #endif
5454 break;
5455 }
5456 break;
5457
5458 case OP_NOT_DIGIT:
5459 if (MAX_255(c) && (mb->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
5460 break;
5461
5462 case OP_DIGIT:
5463 if (!MAX_255(c) || (mb->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
5464 break;
5465
5466 case OP_NOT_WHITESPACE:
5467 if (MAX_255(c) && (mb->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
5468 break;
5469
5470 case OP_WHITESPACE:
5471 if (!MAX_255(c) || (mb->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
5472 break;
5473
5474 case OP_NOT_WORDCHAR:
5475 if (MAX_255(c) && (mb->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
5476 break;
5477
5478 case OP_WORDCHAR:
5479 if (!MAX_255(c) || (mb->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
5480 break;
5481
5482 default:
5483 RRETURN(PCRE2_ERROR_INTERNAL);
5484 }
5485 }
5486 }
5487 /* Control never gets here */
5488 }
5489
5490 /* If maximizing, it is worth using inline code for speed, doing the type
5491 test once at the start (i.e. keep it out of the loop). Again, keep the
5492 UTF and UCP code separate. */
5493
5494 else
5495 {
5496 pp = eptr; /* Remember where we started */
5497
5498 #ifdef SUPPORT_UNICODE
5499 if (prop_type >= 0)
5500 {
5501 switch(prop_type)
5502 {
5503 case PT_ANY:
5504 for (i = min; i < max; i++)
5505 {
5506 int len = 1;
5507 if (eptr >= mb->end_subject)
5508 {
5509 SCHECK_PARTIAL();
5510 break;
5511 }
5512 GETCHARLENTEST(c, eptr, len);
5513 if (prop_fail_result) break;
5514 eptr+= len;
5515 }
5516 break;
5517
5518 case PT_LAMP:
5519 for (i = min; i < max; i++)
5520 {
5521 int chartype;
5522 int len = 1;
5523 if (eptr >= mb->end_subject)
5524 {
5525 SCHECK_PARTIAL();
5526 break;
5527 }
5528 GETCHARLENTEST(c, eptr, len);
5529 chartype = UCD_CHARTYPE(c);
5530 if ((chartype == ucp_Lu ||
5531 chartype == ucp_Ll ||
5532 chartype == ucp_Lt) == prop_fail_result)
5533 break;
5534 eptr+= len;
5535 }
5536 break;
5537
5538 case PT_GC:
5539 for (i = min; i < max; i++)
5540 {
5541 int len = 1;
5542 if (eptr >= mb->end_subject)
5543 {
5544 SCHECK_PARTIAL();
5545 break;
5546 }
5547 GETCHARLENTEST(c, eptr, len);
5548 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) break;
5549 eptr+= len;
5550 }
5551 break;
5552
5553 case PT_PC:
5554 for (i = min; i < max; i++)
5555 {
5556 int len = 1;
5557 if (eptr >= mb->end_subject)
5558 {
5559 SCHECK_PARTIAL();
5560 break;
5561 }
5562 GETCHARLENTEST(c, eptr, len);
5563 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) break;
5564 eptr+= len;
5565 }
5566 break;
5567
5568 case PT_SC:
5569 for (i = min; i < max; i++)
5570 {
5571 int len = 1;
5572 if (eptr >= mb->end_subject)
5573 {
5574 SCHECK_PARTIAL();
5575 break;
5576 }
5577 GETCHARLENTEST(c, eptr, len);
5578 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) break;
5579 eptr+= len;
5580 }
5581 break;
5582
5583 case PT_ALNUM:
5584 for (i = min; i < max; i++)
5585 {
5586 int category;
5587 int len = 1;
5588 if (eptr >= mb->end_subject)
5589 {
5590 SCHECK_PARTIAL();
5591 break;
5592 }
5593 GETCHARLENTEST(c, eptr, len);
5594 category = UCD_CATEGORY(c);
5595 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
5596 break;
5597 eptr+= len;
5598 }
5599 break;
5600
5601 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
5602 which means that Perl space and POSIX space are now identical. PCRE
5603 was changed at release 8.34. */
5604
5605 case PT_SPACE: /* Perl space */
5606 case PT_PXSPACE: /* POSIX space */
5607 for (i = min; i < max; i++)
5608 {
5609 int len = 1;
5610 if (eptr >= mb->end_subject)
5611 {
5612 SCHECK_PARTIAL();
5613 break;
5614 }
5615 GETCHARLENTEST(c, eptr, len);
5616 switch(c)
5617 {
5618 HSPACE_CASES:
5619 VSPACE_CASES:
5620 if (prop_fail_result) goto ENDLOOP99; /* Break the loop */
5621 break;
5622
5623 default:
5624 if ((UCD_CATEGORY(c) == ucp_Z) == prop_fail_result)
5625 goto ENDLOOP99; /* Break the loop */
5626 break;
5627 }
5628 eptr+= len;
5629 }
5630 ENDLOOP99:
5631 break;
5632
5633 case PT_WORD:
5634 for (i = min; i < max; i++)
5635 {
5636 int category;
5637 int len = 1;
5638 if (eptr >= mb->end_subject)
5639 {
5640 SCHECK_PARTIAL();
5641 break;
5642 }
5643 GETCHARLENTEST(c, eptr, len);
5644 category = UCD_CATEGORY(c);
5645 if ((category == ucp_L || category == ucp_N ||
5646 c == CHAR_UNDERSCORE) == prop_fail_result)
5647 break;
5648 eptr+= len;
5649 }
5650 break;
5651
5652 case PT_CLIST:
5653 for (i = min; i < max; i++)
5654 {
5655 const uint32_t *cp;
5656 int len = 1;
5657 if (eptr >= mb->end_subject)
5658 {
5659 SCHECK_PARTIAL();
5660 break;
5661 }
5662 GETCHARLENTEST(c, eptr, len);
5663 cp = PRIV(ucd_caseless_sets) + prop_value;
5664 for (;;)
5665 {
5666 if (c < *cp)
5667 { if (prop_fail_result) break; else goto GOT_MAX; }
5668 if (c == *cp++)
5669 { if (prop_fail_result) goto GOT_MAX; else break; }
5670 }
5671 eptr += len;
5672 }
5673 GOT_MAX:
5674 break;
5675
5676 case PT_UCNC:
5677 for (i = min; i < max; i++)
5678 {
5679 int len = 1;
5680 if (eptr >= mb->end_subject)
5681 {
5682 SCHECK_PARTIAL();
5683 break;
5684 }
5685 GETCHARLENTEST(c, eptr, len);
5686 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
5687 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
5688 c >= 0xe000) == prop_fail_result)
5689 break;
5690 eptr += len;
5691 }
5692 break;
5693
5694 default:
5695 RRETURN(PCRE2_ERROR_INTERNAL);
5696 }
5697
5698 /* eptr is now past the end of the maximum run */
5699
5700 if (possessive) continue; /* No backtracking */
5701
5702 /* After \C in UTF mode, pp might be in the middle of a Unicode
5703 character. Use <= pp to ensure backtracking doesn't go too far. */
5704
5705 for(;;)
5706 {
5707 if (eptr <= pp) goto TAIL_RECURSE;
5708 RMATCH(eptr, ecode, offset_top, mb, eptrb, RM44);
5709 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5710 eptr--;
5711 if (utf) BACKCHAR(eptr);
5712 }
5713 }
5714
5715 /* Match extended Unicode grapheme clusters. We will get here only if the
5716 support is in the binary; otherwise a compile-time error occurs. */
5717
5718 else if (ctype == OP_EXTUNI)
5719 {
5720 for (i = min; i < max; i++)
5721 {
5722 if (eptr >= mb->end_subject)
5723 {
5724 SCHECK_PARTIAL();
5725 break;
5726 }
5727 else
5728 {
5729 int lgb, rgb;
5730 GETCHARINCTEST(c, eptr);
5731 lgb = UCD_GRAPHBREAK(c);
5732 while (eptr < mb->end_subject)
5733 {
5734 int len = 1;
5735 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5736 rgb = UCD_GRAPHBREAK(c);
5737 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
5738 lgb = rgb;
5739 eptr += len;
5740 }
5741 }
5742 CHECK_PARTIAL();
5743 }
5744
5745 /* eptr is now past the end of the maximum run */
5746
5747 if (possessive) continue; /* No backtracking */
5748
5749 /* We use <= pp rather than == pp to detect the start of the run while
5750 backtracking because the use of \C in UTF mode can cause BACKCHAR to
5751 move back past pp. This is just palliative; the use of \C in UTF mode
5752 is fraught with danger. */
5753
5754 for(;;)
5755 {
5756 int lgb, rgb;
5757 PCRE2_SPTR fptr;
5758
5759 if (eptr <= pp) goto TAIL_RECURSE; /* At start of char run */
5760 RMATCH(eptr, ecode, offset_top, mb, eptrb, RM45);
5761 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5762
5763 /* Backtracking over an extended grapheme cluster involves inspecting
5764 the previous two characters (if present) to see if a break is
5765 permitted between them. */
5766
5767 eptr--;
5768 if (!utf) c = *eptr; else
5769 {
5770 BACKCHAR(eptr);
5771 GETCHAR(c, eptr);
5772 }
5773 rgb = UCD_GRAPHBREAK(c);
5774
5775 for (;;)
5776 {
5777 if (eptr <= pp) goto TAIL_RECURSE; /* At start of char run */
5778 fptr = eptr - 1;
5779 if (!utf) c = *fptr; else
5780 {
5781 BACKCHAR(fptr);
5782 GETCHAR(c, fptr);
5783 }
5784 lgb = UCD_GRAPHBREAK(c);
5785 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
5786 eptr = fptr;
5787 rgb = lgb;
5788 }
5789 }
5790 }
5791
5792 else
5793 #endif /* SUPPORT_UNICODE */
5794
5795 #ifdef SUPPORT_UNICODE
5796 if (utf)
5797 {
5798 switch(ctype)
5799 {
5800 case OP_ANY:
5801 for (i = min; i < max; i++)
5802 {
5803 if (eptr >= mb->end_subject)
5804 {
5805 SCHECK_PARTIAL();
5806 break;
5807 }
5808 if (IS_NEWLINE(eptr)) break;
5809 if (mb->partial != 0 && /* Take care with CRLF partial */
5810 eptr + 1 >= mb->end_subject &&
5811 NLBLOCK->nltype == NLTYPE_FIXED &&
5812 NLBLOCK->nllen == 2 &&
5813 UCHAR21(eptr) == NLBLOCK->nl[0])
5814 {
5815 mb->hitend = TRUE;
5816 if (mb->partial > 1) RRETURN(PCRE2_ERROR_PARTIAL);
5817 }
5818 eptr++;
5819 ACROSSCHAR(eptr < mb->end_subject, *eptr, eptr++);
5820 }
5821 break;
5822
5823 case OP_ALLANY:
5824 if (max < INT_MAX)
5825 {
5826 for (i = min; i < max; i++)
5827 {
5828 if (eptr >= mb->end_subject)
5829 {
5830 SCHECK_PARTIAL();
5831 break;
5832 }
5833 eptr++;
5834 ACROSSCHAR(eptr < mb->end_subject, *eptr, eptr++);
5835 }
5836 }
5837 else
5838 {
5839 eptr = mb->end_subject; /* Unlimited UTF-8 repeat */
5840 SCHECK_PARTIAL();
5841 }
5842 break;
5843
5844 /* The byte case is the same as non-UTF8 */
5845
5846 case OP_ANYBYTE:
5847 c = max - min;
5848 if (c > (uint32_t)(mb->end_subject - eptr))
5849 {
5850 eptr = mb->end_subject;
5851 SCHECK_PARTIAL();
5852 }
5853 else eptr += c;
5854 break;
5855
5856 case OP_ANYNL:
5857 for (i = min; i < max; i++)
5858 {
5859 int len = 1;
5860 if (eptr >= mb->end_subject)
5861 {
5862 SCHECK_PARTIAL();
5863 break;
5864 }
5865 GETCHARLEN(c, eptr, len);
5866 if (c == CHAR_CR)
5867 {
5868 if (++eptr >= mb->end_subject) break;
5869 if (UCHAR21(eptr) == CHAR_LF) eptr++;
5870 }
5871 else
5872 {
5873 if (c != CHAR_LF &&
5874 (mb->bsr_convention == PCRE2_BSR_ANYCRLF ||
5875 (c != CHAR_VT && c != CHAR_FF && c != CHAR_NEL
5876 #ifndef EBCDIC
5877 && c != 0x2028 && c != 0x2029
5878 #endif /* Not EBCDIC */
5879 )))
5880 break;
5881 eptr += len;
5882 }
5883 }
5884 break;
5885
5886 case OP_NOT_HSPACE:
5887 case OP_HSPACE:
5888 for (i = min; i < max; i++)
5889 {
5890 BOOL gotspace;
5891 int len = 1;
5892 if (eptr >= mb->end_subject)
5893 {
5894 SCHECK_PARTIAL();
5895 break;
5896 }
5897 GETCHARLEN(c, eptr, len);
5898 switch(c)
5899 {
5900 HSPACE_CASES: gotspace = TRUE; break;
5901 default: gotspace = FALSE; break;
5902 }
5903 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5904 eptr += len;
5905 }
5906 break;
5907
5908 case OP_NOT_VSPACE:
5909 case OP_VSPACE:
5910 for (i = min; i < max; i++)
5911 {
5912 BOOL gotspace;
5913 int len = 1;
5914 if (eptr >= mb->end_subject)
5915 {
5916 SCHECK_PARTIAL();
5917 break;
5918 }
5919 GETCHARLEN(c, eptr, len);
5920 switch(c)
5921 {
5922 VSPACE_CASES: gotspace = TRUE; break;
5923 default: gotspace = FALSE; break;
5924 }
5925 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5926 eptr += len;
5927 }
5928 break;
5929
5930 case OP_NOT_DIGIT:
5931 for (i = min; i < max; i++)
5932 {
5933 int len = 1;
5934 if (eptr >= mb->end_subject)
5935 {
5936 SCHECK_PARTIAL();
5937 break;
5938 }
5939 GETCHARLEN(c, eptr, len);
5940 if (c < 256 && (mb->ctypes[c] & ctype_digit) != 0) break;
5941 eptr+= len;
5942 }
5943 break;
5944
5945 case OP_DIGIT:
5946 for (i = min; i < max; i++)
5947 {
5948 int len = 1;
5949 if (eptr >= mb->end_subject)
5950 {
5951 SCHECK_PARTIAL();
5952 break;
5953 }
5954 GETCHARLEN(c, eptr, len);
5955 if (c >= 256 ||(mb->ctypes[c] & ctype_digit) == 0) break;
5956 eptr+= len;
5957 }
5958 break;
5959
5960 case OP_NOT_WHITESPACE:
5961 for (i = min; i < max; i++)
5962 {
5963 int len = 1;
5964 if (eptr >= mb->end_subject)
5965 {
5966 SCHECK_PARTIAL();
5967 break;
5968 }
5969 GETCHARLEN(c, eptr, len);
5970 if (c < 256 && (mb->ctypes[c] & ctype_space) != 0) break;
5971 eptr+= len;
5972 }
5973 break;
5974
5975 case OP_WHITESPACE:
5976 for (i = min; i < max; i++)
5977 {
5978 int len = 1;
5979 if (eptr >= mb->end_subject)
5980 {
5981 SCHECK_PARTIAL();
5982 break;
5983 }
5984 GETCHARLEN(c, eptr, len);
5985 if (c >= 256 ||(mb->ctypes[c] & ctype_space) == 0) break;
5986 eptr+= len;
5987 }
5988 break;
5989
5990 case OP_NOT_WORDCHAR:
5991 for (i = min; i < max; i++)
5992 {
5993 int len = 1;
5994 if (eptr >= mb->end_subject)
5995 {
5996 SCHECK_PARTIAL();
5997 break;
5998 }
5999 GETCHARLEN(c, eptr, len);
6000 if (c < 256 && (mb->ctypes[c] & ctype_word) != 0) break;
6001 eptr+= len;
6002 }
6003 break;
6004
6005 case OP_WORDCHAR:
6006 for (i = min; i < max; i++)
6007 {
6008 int len = 1;
6009 if (eptr >= mb->end_subject)
6010 {
6011 SCHECK_PARTIAL();
6012 break;
6013 }
6014 GETCHARLEN(c, eptr, len);
6015 if (c >= 256 || (mb->ctypes[c] & ctype_word) == 0) break;
6016 eptr+= len;
6017 }
6018 break;
6019
6020 default:
6021 RRETURN(PCRE2_ERROR_INTERNAL);
6022 }
6023
6024 if (possessive) continue; /* No backtracking */
6025
6026 /* After \C in UTF mode, pp might be in the middle of a Unicode
6027 character. Use <= pp to ensure backtracking doesn't go too far. */
6028
6029 for(;;)
6030 {
6031 if (eptr <= pp) goto TAIL_RECURSE;
6032 RMATCH(eptr, ecode, offset_top, mb, eptrb, RM46);
6033 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6034 eptr--;
6035 BACKCHAR(eptr);
6036 if (ctype == OP_ANYNL && eptr > pp && UCHAR21(eptr) == CHAR_NL &&
6037 UCHAR21(eptr - 1) == CHAR_CR) eptr--;
6038 }
6039 }
6040 else
6041 #endif /* SUPPORT_UNICODE */
6042 /* Not UTF mode */
6043 {
6044 switch(ctype)
6045 {
6046 case OP_ANY:
6047 for (i = min; i < max; i++)
6048 {
6049 if (eptr >= mb->end_subject)
6050 {
6051 SCHECK_PARTIAL();
6052 break;
6053 }
6054 if (IS_NEWLINE(eptr)) break;
6055 if (mb->partial != 0 && /* Take care with CRLF partial */
6056 eptr + 1 >= mb->end_subject &&
6057 NLBLOCK->nltype == NLTYPE_FIXED &&
6058 NLBLOCK->nllen == 2 &&
6059 *eptr == NLBLOCK->nl[0])
6060 {
6061 mb->hitend = TRUE;
6062 if (mb->partial > 1) RRETURN(PCRE2_ERROR_PARTIAL);
6063 }
6064 eptr++;
6065 }
6066 break;
6067
6068 case OP_ALLANY:
6069 case OP_ANYBYTE:
6070 c = max - min;
6071 if (c > (uint32_t)(mb->end_subject - eptr))
6072 {
6073 eptr = mb->end_subject;
6074 SCHECK_PARTIAL();
6075 }
6076 else eptr += c;
6077 break;
6078
6079 case OP_ANYNL:
6080 for (i = min; i < max; i++)
6081 {
6082 if (eptr >= mb->end_subject)
6083 {
6084 SCHECK_PARTIAL();
6085 break;
6086 }
6087 c = *eptr;
6088 if (c == CHAR_CR)
6089 {
6090 if (++eptr >= mb->end_subject) break;
6091 if (*eptr == CHAR_LF) eptr++;
6092 }
6093 else
6094 {
6095 if (c != CHAR_LF && (mb->bsr_convention == PCRE2_BSR_ANYCRLF ||
6096 (c != CHAR_VT && c != CHAR_FF && c != CHAR_NEL
6097 #if PCRE2_CODE_UNIT_WIDTH != 8
6098 && c != 0x2028 && c != 0x2029
6099 #endif
6100 ))) break;
6101 eptr++;
6102 }
6103 }
6104 break;
6105
6106 case OP_NOT_HSPACE:
6107 for (i = min; i < max; i++)
6108 {
6109 if (eptr >= mb->end_subject)
6110 {
6111 SCHECK_PARTIAL();
6112 break;
6113 }
6114 switch(*eptr)
6115 {
6116 default: eptr++; break;
6117 HSPACE_BYTE_CASES:
6118 #if PCRE2_CODE_UNIT_WIDTH != 8
6119 HSPACE_MULTIBYTE_CASES:
6120 #endif
6121 goto ENDLOOP00;
6122 }
6123 }
6124 ENDLOOP00:
6125 break;
6126
6127 case OP_HSPACE:
6128 for (i = min; i < max; i++)
6129 {
6130 if (eptr >= mb->end_subject)
6131 {
6132 SCHECK_PARTIAL();
6133 break;
6134 }
6135 switch(*eptr)
6136 {
6137 default: goto ENDLOOP01;
6138 HSPACE_BYTE_CASES:
6139 #if PCRE2_CODE_UNIT_WIDTH != 8
6140 HSPACE_MULTIBYTE_CASES:
6141 #endif
6142 eptr++; break;
6143 }
6144 }
6145 ENDLOOP01:
6146 break;
6147
6148 case OP_NOT_VSPACE:
6149 for (i = min; i < max; i++)
6150 {
6151 if (eptr >= mb->end_subject)
6152 {
6153 SCHECK_PARTIAL();
6154 break;
6155 }
6156 switch(*eptr)
6157 {
6158 default: eptr++; break;
6159 VSPACE_BYTE_CASES:
6160 #if PCRE2_CODE_UNIT_WIDTH != 8
6161 VSPACE_MULTIBYTE_CASES:
6162 #endif
6163 goto ENDLOOP02;
6164 }
6165 }
6166 ENDLOOP02:
6167 break;
6168
6169 case OP_VSPACE:
6170 for (i = min; i < max; i++)
6171 {
6172 if (eptr >= mb->end_subject)
6173 {
6174 SCHECK_PARTIAL();
6175 break;
6176 }
6177 switch(*eptr)
6178 {
6179 default: goto ENDLOOP03;
6180 VSPACE_BYTE_CASES:
6181 #if PCRE2_CODE_UNIT_WIDTH != 8
6182 VSPACE_MULTIBYTE_CASES:
6183 #endif
6184 eptr++; break;
6185 }
6186 }
6187 ENDLOOP03:
6188 break;
6189
6190 case OP_NOT_DIGIT:
6191 for (i = min; i < max; i++)
6192 {
6193 if (eptr >= mb->end_subject)
6194 {
6195 SCHECK_PARTIAL();
6196 break;
6197 }
6198 if (MAX_255(*eptr) && (mb->ctypes[*eptr] & ctype_digit) != 0) break;
6199 eptr++;
6200 }
6201 break;
6202
6203 case OP_DIGIT:
6204 for (i = min; i < max; i++)
6205 {
6206 if (eptr >= mb->end_subject)
6207 {
6208 SCHECK_PARTIAL();
6209 break;
6210 }
6211 if (!MAX_255(*eptr) || (mb->ctypes[*eptr] & ctype_digit) == 0) break;
6212 eptr++;
6213 }
6214 break;
6215
6216 case OP_NOT_WHITESPACE:
6217 for (i = min; i < max; i++)
6218 {
6219 if (eptr >= mb->end_subject)
6220 {
6221 SCHECK_PARTIAL();
6222 break;
6223 }
6224 if (MAX_255(*eptr) && (mb->ctypes[*eptr] & ctype_space) != 0) break;
6225 eptr++;
6226 }
6227 break;
6228
6229 case OP_WHITESPACE:
6230 for (i = min; i < max; i++)
6231 {
6232 if (eptr >= mb->end_subject)
6233 {
6234 SCHECK_PARTIAL();
6235 break;
6236 }
6237 if (!MAX_255(*eptr) || (mb->ctypes[*eptr] & ctype_space) == 0) break;
6238 eptr++;
6239 }
6240 break;
6241
6242 case OP_NOT_WORDCHAR:
6243 for (i = min; i < max; i++)
6244 {
6245 if (eptr >= mb->end_subject)
6246 {
6247 SCHECK_PARTIAL();
6248 break;
6249 }
6250 if (MAX_255(*eptr) && (mb->ctypes[*eptr] & ctype_word) != 0) break;
6251 eptr++;
6252 }
6253 break;
6254
6255 case OP_WORDCHAR:
6256 for (i = min; i < max; i++)
6257 {
6258 if (eptr >= mb->end_subject)
6259 {
6260 SCHECK_PARTIAL();
6261 break;
6262 }
6263 if (!MAX_255(*eptr) || (mb->ctypes[*eptr] & ctype_word) == 0) break;
6264 eptr++;
6265 }
6266 break;
6267
6268 default:
6269 RRETURN(PCRE2_ERROR_INTERNAL);
6270 }
6271
6272 if (possessive) continue; /* No backtracking */
6273 for (;;)
6274 {
6275 if (eptr == pp) goto TAIL_RECURSE;
6276 RMATCH(eptr, ecode, offset_top, mb, eptrb, RM47);
6277 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6278 eptr--;
6279 if (ctype == OP_ANYNL && eptr > pp && *eptr == CHAR_LF &&
6280 eptr[-1] == CHAR_CR) eptr--;
6281 }
6282 }
6283
6284 /* Control never gets here */
6285 }
6286
6287 /* There's been some horrible disaster. Arrival here can only mean there is
6288 something seriously wrong in the code above or the OP_xxx definitions. */
6289
6290 default:
6291 RRETURN(PCRE2_ERROR_INTERNAL);
6292 }
6293
6294 /* Do not stick any code in here without much thought; it is assumed
6295 that "continue" in the code above comes out to here to repeat the main
6296 loop. */
6297
6298 } /* End of main loop */
6299 /* Control never reaches here */
6300
6301
6302 /* When compiling to use the heap rather than the stack for recursive calls to
6303 match(), the RRETURN() macro jumps here. The number that is saved in
6304 frame->Xwhere indicates which label we actually want to return to. */
6305
6306 #ifdef HEAP_MATCH_RECURSE
6307 #define LBL(val) case val: goto L_RM##val;
6308 HEAP_RETURN:
6309 switch (frame->Xwhere)
6310 {
6311 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
6312 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
6313 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
6314 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
6315 LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64)
6316 LBL(65) LBL(66) LBL(68)
6317 #ifdef SUPPORT_WIDE_CHARS
6318 LBL(20) LBL(21)
6319 #endif
6320 #ifdef SUPPORT_UNICODE
6321 LBL(16) LBL(18)
6322 LBL(22) LBL(23) LBL(28) LBL(30)
6323 LBL(32) LBL(34) LBL(42) LBL(46)
6324 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
6325 LBL(59) LBL(60) LBL(61) LBL(62) LBL(67)
6326 #endif /* SUPPORT_UNICODE */
6327 default:
6328 return PCRE2_ERROR_INTERNAL;
6329 }
6330 #undef LBL
6331 #endif /* HEAP_MATCH_RECURSE */
6332 }
6333
6334
6335 /***************************************************************************
6336 ****************************************************************************
6337 RECURSION IN THE match() FUNCTION
6338
6339 Undefine all the macros that were defined above to handle this. */
6340
6341 #ifdef HEAP_MATCH_RECURSE
6342 #undef eptr
6343 #undef ecode
6344 #undef mstart
6345 #undef offset_top
6346 #undef eptrb
6347 #undef flags
6348
6349 #undef callpat
6350 #undef charptr
6351 #undef data
6352 #undef next_ecode
6353 #undef pp
6354 #undef prev
6355 #undef saved_eptr
6356
6357 #undef new_recursive
6358
6359 #undef cur_is_word
6360 #undef condition
6361 #undef prev_is_word
6362
6363 #undef ctype
6364 #undef length
6365 #undef max
6366 #undef min
6367 #undef number
6368 #undef offset
6369 #undef op
6370 #undef save_capture_last
6371 #undef save_offset1
6372 #undef save_offset2
6373 #undef save_offset3
6374
6375 #undef newptrb
6376 #endif /* HEAP_MATCH_RECURSE */
6377
6378 /* These two are defined as macros in both cases */
6379
6380 #undef fc
6381 #undef fi
6382
6383 /***************************************************************************
6384 ***************************************************************************/
6385
6386
6387 #ifdef HEAP_MATCH_RECURSE
6388 /*************************************************
6389 * Release allocated heap frames *
6390 *************************************************/
6391
6392 /* This function releases all the allocated frames. The base frame is on the
6393 machine stack, and so must not be freed.
6394
6395 Argument:
6396 frame_base the address of the base frame
6397 mb the match block
6398
6399 Returns: nothing
6400 */
6401
6402 static void
release_match_heapframes(heapframe * frame_base,match_block * mb)6403 release_match_heapframes (heapframe *frame_base, match_block *mb)
6404 {
6405 heapframe *nextframe = frame_base->Xnextframe;
6406 while (nextframe != NULL)
6407 {
6408 heapframe *oldframe = nextframe;
6409 nextframe = nextframe->Xnextframe;
6410 mb->stack_memctl.free(oldframe, mb->stack_memctl.memory_data);
6411 }
6412 }
6413 #endif /* HEAP_MATCH_RECURSE */
6414
6415
6416
6417 /*************************************************
6418 * Match a Regular Expression *
6419 *************************************************/
6420
6421 /* This function applies a compiled pattern to a subject string and picks out
6422 portions of the string if it matches. Two elements in the vector are set for
6423 each substring: the offsets to the start and end of the substring.
6424
6425 Arguments:
6426 code points to the compiled expression
6427 subject points to the subject string
6428 length length of subject string (may contain binary zeros)
6429 start_offset where to start in the subject string
6430 options option bits
6431 match_data points to a match_data block
6432 mcontext points to a PCRE2 context
6433
6434 Returns: > 0 => success; value is the number of ovector pairs filled
6435 = 0 => success, but ovector is not big enough
6436 -1 => failed to match (PCRE2_ERROR_NOMATCH)
6437 -2 => partial match (PCRE2_ERROR_PARTIAL)
6438 < -2 => some kind of unexpected problem
6439 */
6440
6441 PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
pcre2_match(const pcre2_code * code,PCRE2_SPTR subject,PCRE2_SIZE length,PCRE2_SIZE start_offset,uint32_t options,pcre2_match_data * match_data,pcre2_match_context * mcontext)6442 pcre2_match(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length,
6443 PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data,
6444 pcre2_match_context *mcontext)
6445 {
6446 int rc;
6447 int ocount;
6448
6449 const uint8_t *start_bits = NULL;
6450
6451 const pcre2_real_code *re = (const pcre2_real_code *)code;
6452
6453 BOOL anchored;
6454 BOOL firstline;
6455 BOOL has_first_cu = FALSE;
6456 BOOL has_req_cu = FALSE;
6457 BOOL startline;
6458 BOOL using_temporary_offsets = FALSE;
6459 BOOL utf;
6460
6461 PCRE2_UCHAR first_cu = 0;
6462 PCRE2_UCHAR first_cu2 = 0;
6463 PCRE2_UCHAR req_cu = 0;
6464 PCRE2_UCHAR req_cu2 = 0;
6465
6466 PCRE2_SPTR bumpalong_limit;
6467 PCRE2_SPTR end_subject;
6468 PCRE2_SPTR start_match = subject + start_offset;
6469 PCRE2_SPTR req_cu_ptr = start_match - 1;
6470 PCRE2_SPTR start_partial = NULL;
6471 PCRE2_SPTR match_partial = NULL;
6472
6473 /* We need to have mb pointing to a match block, because the IS_NEWLINE macro
6474 is used below, and it expects NLBLOCK to be defined as a pointer. */
6475
6476 match_block actual_match_block;
6477 match_block *mb = &actual_match_block;
6478
6479 #ifdef HEAP_MATCH_RECURSE
6480 heapframe frame_zero;
6481 frame_zero.Xprevframe = NULL; /* Marks the top level */
6482 frame_zero.Xnextframe = NULL; /* None are allocated yet */
6483 mb->match_frames_base = &frame_zero;
6484 #endif
6485
6486 /* A length equal to PCRE2_ZERO_TERMINATED implies a zero-terminated
6487 subject string. */
6488
6489 if (length == PCRE2_ZERO_TERMINATED) length = PRIV(strlen)(subject);
6490 end_subject = subject + length;
6491
6492 /* Plausibility checks */
6493
6494 if ((options & ~PUBLIC_MATCH_OPTIONS) != 0) return PCRE2_ERROR_BADOPTION;
6495 if (code == NULL || subject == NULL || match_data == NULL)
6496 return PCRE2_ERROR_NULL;
6497 if (start_offset > length) return PCRE2_ERROR_BADOFFSET;
6498
6499 /* Check that the first field in the block is the magic number. */
6500
6501 if (re->magic_number != MAGIC_NUMBER) return PCRE2_ERROR_BADMAGIC;
6502
6503 /* Check the code unit width. */
6504
6505 if ((re->flags & PCRE2_MODE_MASK) != PCRE2_CODE_UNIT_WIDTH/8)
6506 return PCRE2_ERROR_BADMODE;
6507
6508 /* PCRE2_NOTEMPTY and PCRE2_NOTEMPTY_ATSTART are match-time flags in the
6509 options variable for this function. Users of PCRE2 who are not calling the
6510 function directly would like to have a way of setting these flags, in the same
6511 way that they can set pcre2_compile() flags like PCRE2_NO_AUTOPOSSESS with
6512 constructions like (*NO_AUTOPOSSESS). To enable this, (*NOTEMPTY) and
6513 (*NOTEMPTY_ATSTART) set bits in the pattern's "flags" field which can now be
6514 transferred to the options for this function. The bits are guaranteed to be
6515 adjacent, but do not have the same values. This bit of Boolean trickery assumes
6516 that the match-time bits are not more significant than the flag bits. If by
6517 accident this is not the case, a compile-time division by zero error will
6518 occur. */
6519
6520 #define FF (PCRE2_NOTEMPTY_SET|PCRE2_NE_ATST_SET)
6521 #define OO (PCRE2_NOTEMPTY|PCRE2_NOTEMPTY_ATSTART)
6522 options |= (re->flags & FF) / ((FF & (~FF+1)) / (OO & (~OO+1)));
6523 #undef FF
6524 #undef OO
6525
6526 /* A NULL match context means "use a default context" */
6527
6528 if (mcontext == NULL)
6529 mcontext = (pcre2_match_context *)(&PRIV(default_match_context));
6530
6531 /* These two settings are used in the code for checking a UTF string that
6532 follows immediately afterwards. Other values in the mb block are used only
6533 during interpretive pcre2_match() processing, not when the JIT support is in
6534 use, so they are set up later. */
6535
6536 utf = (re->overall_options & PCRE2_UTF) != 0;
6537 mb->partial = ((options & PCRE2_PARTIAL_HARD) != 0)? 2 :
6538 ((options & PCRE2_PARTIAL_SOFT) != 0)? 1 : 0;
6539
6540 /* Check a UTF string for validity if required. For 8-bit and 16-bit strings,
6541 we must also check that a starting offset does not point into the middle of a
6542 multiunit character. We check only the portion of the subject that is going to
6543 be inspected during matching - from the offset minus the maximum back reference
6544 to the given length. This saves time when a small part of a large subject is
6545 being matched by the use of a starting offset. Note that the maximum lookbehind
6546 is a number of characters, not code units. */
6547
6548 #ifdef SUPPORT_UNICODE
6549 if (utf && (options & PCRE2_NO_UTF_CHECK) == 0)
6550 {
6551 PCRE2_SPTR check_subject = start_match; /* start_match includes offset */
6552
6553 if (start_offset > 0)
6554 {
6555 #if PCRE2_CODE_UNIT_WIDTH != 32
6556 unsigned int i;
6557 if (start_match < end_subject && NOT_FIRSTCU(*start_match))
6558 return PCRE2_ERROR_BADUTFOFFSET;
6559 for (i = re->max_lookbehind; i > 0 && check_subject > subject; i--)
6560 {
6561 check_subject--;
6562 while (check_subject > subject &&
6563 #if PCRE2_CODE_UNIT_WIDTH == 8
6564 (*check_subject & 0xc0) == 0x80)
6565 #else /* 16-bit */
6566 (*check_subject & 0xfc00) == 0xdc00)
6567 #endif /* PCRE2_CODE_UNIT_WIDTH == 8 */
6568 check_subject--;
6569 }
6570 #else
6571 /* In the 32-bit library, one code unit equals one character. However,
6572 we cannot just subtract the lookbehind and then compare pointers, because
6573 a very large lookbehind could create an invalid pointer. */
6574
6575 if (start_offset >= re->max_lookbehind)
6576 check_subject -= re->max_lookbehind;
6577 else
6578 check_subject = subject;
6579 #endif /* PCRE2_CODE_UNIT_WIDTH != 32 */
6580 }
6581
6582 /* Validate the relevant portion of the subject. After an error, adjust the
6583 offset to be an absolute offset in the whole string. */
6584
6585 match_data->rc = PRIV(valid_utf)(check_subject,
6586 length - (check_subject - subject), &(match_data->startchar));
6587 if (match_data->rc != 0)
6588 {
6589 match_data->startchar += check_subject - subject;
6590 return match_data->rc;
6591 }
6592 }
6593 #endif /* SUPPORT_UNICODE */
6594
6595 /* It is an error to set an offset limit without setting the flag at compile
6596 time. */
6597
6598 if (mcontext->offset_limit != PCRE2_UNSET &&
6599 (re->overall_options & PCRE2_USE_OFFSET_LIMIT) == 0)
6600 return PCRE2_ERROR_BADOFFSETLIMIT;
6601
6602 /* If the pattern was successfully studied with JIT support, run the JIT
6603 executable instead of the rest of this function. Most options must be set at
6604 compile time for the JIT code to be usable. Fallback to the normal code path if
6605 an unsupported option is set or if JIT returns BADOPTION (which means that the
6606 selected normal or partial matching mode was not compiled). */
6607
6608 #ifdef SUPPORT_JIT
6609 if (re->executable_jit != NULL && (options & ~PUBLIC_JIT_MATCH_OPTIONS) == 0)
6610 {
6611 rc = pcre2_jit_match(code, subject, length, start_offset, options,
6612 match_data, mcontext);
6613 if (rc != PCRE2_ERROR_JIT_BADOPTION) return rc;
6614 }
6615 #endif
6616
6617 /* Carry on with non-JIT matching. */
6618
6619 anchored = ((re->overall_options | options) & PCRE2_ANCHORED) != 0;
6620 firstline = (re->overall_options & PCRE2_FIRSTLINE) != 0;
6621 startline = (re->flags & PCRE2_STARTLINE) != 0;
6622 bumpalong_limit = (mcontext->offset_limit == PCRE2_UNSET)?
6623 end_subject : subject + mcontext->offset_limit;
6624
6625 /* Fill in the fields in the match block. */
6626
6627 mb->callout = mcontext->callout;
6628 mb->callout_data = mcontext->callout_data;
6629 mb->memctl = mcontext->memctl;
6630 #ifdef HEAP_MATCH_RECURSE
6631 mb->stack_memctl = mcontext->stack_memctl;
6632 #endif
6633
6634 mb->start_subject = subject;
6635 mb->start_offset = start_offset;
6636 mb->end_subject = end_subject;
6637 mb->hasthen = (re->flags & PCRE2_HASTHEN) != 0;
6638
6639 mb->moptions = options; /* Match options */
6640 mb->poptions = re->overall_options; /* Pattern options */
6641
6642 mb->ignore_skip_arg = 0;
6643 mb->mark = mb->nomatch_mark = NULL; /* In case never set */
6644 mb->recursive = NULL; /* No recursion at top level */
6645 mb->ovecsave_chain = NULL; /* No ovecsave blocks yet */
6646 mb->hitend = FALSE;
6647
6648 /* The name table is needed for finding all the numbers associated with a
6649 given name, for condition testing. The code follows the name table. */
6650
6651 mb->name_table = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code));
6652 mb->name_count = re->name_count;
6653 mb->name_entry_size = re->name_entry_size;
6654 mb->start_code = mb->name_table + re->name_count * re->name_entry_size;
6655
6656 /* Limits set in the pattern override the match context only if they are
6657 smaller. */
6658
6659 mb->match_limit = (mcontext->match_limit < re->limit_match)?
6660 mcontext->match_limit : re->limit_match;
6661 mb->match_limit_recursion = (mcontext->recursion_limit < re->limit_recursion)?
6662 mcontext->recursion_limit : re->limit_recursion;
6663
6664 /* Pointers to the individual character tables */
6665
6666 mb->lcc = re->tables + lcc_offset;
6667 mb->fcc = re->tables + fcc_offset;
6668 mb->ctypes = re->tables + ctypes_offset;
6669
6670 /* Process the \R and newline settings. */
6671
6672 mb->bsr_convention = re->bsr_convention;
6673 mb->nltype = NLTYPE_FIXED;
6674 switch(re->newline_convention)
6675 {
6676 case PCRE2_NEWLINE_CR:
6677 mb->nllen = 1;
6678 mb->nl[0] = CHAR_CR;
6679 break;
6680
6681 case PCRE2_NEWLINE_LF:
6682 mb->nllen = 1;
6683 mb->nl[0] = CHAR_NL;
6684 break;
6685
6686 case PCRE2_NEWLINE_CRLF:
6687 mb->nllen = 2;
6688 mb->nl[0] = CHAR_CR;
6689 mb->nl[1] = CHAR_NL;
6690 break;
6691
6692 case PCRE2_NEWLINE_ANY:
6693 mb->nltype = NLTYPE_ANY;
6694 break;
6695
6696 case PCRE2_NEWLINE_ANYCRLF:
6697 mb->nltype = NLTYPE_ANYCRLF;
6698 break;
6699
6700 default: return PCRE2_ERROR_INTERNAL;
6701 }
6702
6703 /* If the expression has got more back references than the offsets supplied can
6704 hold, we get a temporary chunk of memory to use during the matching. Otherwise,
6705 we can use the vector supplied. The size of the ovector is three times the
6706 value in the oveccount field. Two-thirds of it is pairs for storing matching
6707 offsets, and the top third is working space. */
6708
6709 if (re->top_backref >= match_data->oveccount)
6710 {
6711 ocount = re->top_backref * 3 + 3;
6712 mb->ovector = (PCRE2_SIZE *)(mb->memctl.malloc(ocount * sizeof(PCRE2_SIZE),
6713 mb->memctl.memory_data));
6714 if (mb->ovector == NULL) return PCRE2_ERROR_NOMEMORY;
6715 using_temporary_offsets = TRUE;
6716 }
6717 else
6718 {
6719 ocount = 3 * match_data->oveccount;
6720 mb->ovector = match_data->ovector;
6721 }
6722
6723 mb->offset_end = ocount;
6724 mb->offset_max = (2*ocount)/3;
6725
6726 /* Reset the working variable associated with each extraction. These should
6727 never be used unless previously set, but they get saved and restored, and so we
6728 initialize them to avoid reading uninitialized locations. Also, unset the
6729 offsets for the matched string. This is really just for tidiness with callouts,
6730 in case they inspect these fields. */
6731
6732 if (ocount > 0)
6733 {
6734 register PCRE2_SIZE *iptr = mb->ovector + ocount;
6735 register PCRE2_SIZE *iend = iptr - re->top_bracket;
6736 if (iend < mb->ovector + 2) iend = mb->ovector + 2;
6737 while (--iptr >= iend) *iptr = PCRE2_UNSET;
6738 mb->ovector[0] = mb->ovector[1] = PCRE2_UNSET;
6739 }
6740
6741 /* Set up the first code unit to match, if available. The first_codeunit value
6742 is never set for an anchored regular expression, but the anchoring may be
6743 forced at run time, so we have to test for anchoring. The first code unit may
6744 be unset for an unanchored pattern, of course. If there's no first code unit
6745 there may be a bitmap of possible first characters. */
6746
6747 if (!anchored)
6748 {
6749 if ((re->flags & PCRE2_FIRSTSET) != 0)
6750 {
6751 has_first_cu = TRUE;
6752 first_cu = first_cu2 = (PCRE2_UCHAR)(re->first_codeunit);
6753 if ((re->flags & PCRE2_FIRSTCASELESS) != 0)
6754 {
6755 first_cu2 = TABLE_GET(first_cu, mb->fcc, first_cu);
6756 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
6757 if (utf && first_cu > 127) first_cu2 = UCD_OTHERCASE(first_cu);
6758 #endif
6759 }
6760 }
6761 else
6762 if (!startline && (re->flags & PCRE2_FIRSTMAPSET) != 0)
6763 start_bits = re->start_bitmap;
6764 }
6765
6766 /* For anchored or unanchored matches, there may be a "last known required
6767 character" set. */
6768
6769 if ((re->flags & PCRE2_LASTSET) != 0)
6770 {
6771 has_req_cu = TRUE;
6772 req_cu = req_cu2 = (PCRE2_UCHAR)(re->last_codeunit);
6773 if ((re->flags & PCRE2_LASTCASELESS) != 0)
6774 {
6775 req_cu2 = TABLE_GET(req_cu, mb->fcc, req_cu);
6776 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
6777 if (utf && req_cu > 127) req_cu2 = UCD_OTHERCASE(req_cu);
6778 #endif
6779 }
6780 }
6781
6782
6783 /* ==========================================================================*/
6784
6785 /* Loop for handling unanchored repeated matching attempts; for anchored regexs
6786 the loop runs just once. */
6787
6788 for(;;)
6789 {
6790 PCRE2_SPTR new_start_match;
6791 mb->capture_last = 0;
6792
6793 /* ----------------- Start of match optimizations ---------------- */
6794
6795 /* There are some optimizations that avoid running the match if a known
6796 starting point is not found, or if a known later code unit is not present.
6797 However, there is an option (settable at compile time) that disables these,
6798 for testing and for ensuring that all callouts do actually occur. */
6799
6800 if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0)
6801 {
6802 PCRE2_SPTR save_end_subject = end_subject;
6803
6804 /* If firstline is TRUE, the start of the match is constrained to the first
6805 line of a multiline string. That is, the match must be before or at the
6806 first newline. Implement this by temporarily adjusting end_subject so that
6807 we stop the optimization scans at a newline. If the match fails at the
6808 newline, later code breaks this loop. */
6809
6810 if (firstline)
6811 {
6812 PCRE2_SPTR t = start_match;
6813 #ifdef SUPPORT_UNICODE
6814 if (utf)
6815 {
6816 while (t < mb->end_subject && !IS_NEWLINE(t))
6817 {
6818 t++;
6819 ACROSSCHAR(t < end_subject, *t, t++);
6820 }
6821 }
6822 else
6823 #endif
6824 while (t < mb->end_subject && !IS_NEWLINE(t)) t++;
6825 end_subject = t;
6826 }
6827
6828 /* Advance to a unique first code unit if there is one. In 8-bit mode, the
6829 use of memchr() gives a big speed up. */
6830
6831 if (has_first_cu)
6832 {
6833 PCRE2_UCHAR smc;
6834 if (first_cu != first_cu2)
6835 while (start_match < end_subject &&
6836 (smc = UCHAR21TEST(start_match)) != first_cu && smc != first_cu2)
6837 start_match++;
6838 else
6839 {
6840 #if PCRE2_CODE_UNIT_WIDTH != 8
6841 while (start_match < end_subject && UCHAR21TEST(start_match) != first_cu)
6842 start_match++;
6843 #else
6844 start_match = memchr(start_match, first_cu, end_subject - start_match);
6845 if (start_match == NULL) start_match = end_subject;
6846 #endif
6847 }
6848 }
6849
6850 /* Or to just after a linebreak for a multiline match */
6851
6852 else if (startline)
6853 {
6854 if (start_match > mb->start_subject + start_offset)
6855 {
6856 #ifdef SUPPORT_UNICODE
6857 if (utf)
6858 {
6859 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6860 {
6861 start_match++;
6862 ACROSSCHAR(start_match < end_subject, *start_match,
6863 start_match++);
6864 }
6865 }
6866 else
6867 #endif
6868 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6869 start_match++;
6870
6871 /* If we have just passed a CR and the newline option is ANY or
6872 ANYCRLF, and we are now at a LF, advance the match position by one more
6873 code unit. */
6874
6875 if (start_match[-1] == CHAR_CR &&
6876 (mb->nltype == NLTYPE_ANY || mb->nltype == NLTYPE_ANYCRLF) &&
6877 start_match < end_subject &&
6878 UCHAR21TEST(start_match) == CHAR_NL)
6879 start_match++;
6880 }
6881 }
6882
6883 /* Or to a non-unique first code unit if any have been identified. The
6884 bitmap contains only 256 bits. When code units are 16 or 32 bits wide, all
6885 code units greater than 254 set the 255 bit. */
6886
6887 else if (start_bits != NULL)
6888 {
6889 while (start_match < end_subject)
6890 {
6891 register uint32_t c = UCHAR21TEST(start_match);
6892 #if PCRE2_CODE_UNIT_WIDTH != 8
6893 if (c > 255) c = 255;
6894 #endif
6895 if ((start_bits[c/8] & (1 << (c&7))) != 0) break;
6896 start_match++;
6897 }
6898 }
6899
6900 /* Restore fudged end_subject */
6901
6902 end_subject = save_end_subject;
6903
6904 /* The following two optimizations are disabled for partial matching. */
6905
6906 if (!mb->partial)
6907 {
6908 /* The minimum matching length is a lower bound; no actual string of that
6909 length may actually match the pattern. Although the value is, strictly,
6910 in characters, we treat it as code units to avoid spending too much time
6911 in this optimization. */
6912
6913 if (end_subject - start_match < re->minlength)
6914 {
6915 rc = MATCH_NOMATCH;
6916 break;
6917 }
6918
6919 /* If req_cu is set, we know that that code unit must appear in the
6920 subject for the match to succeed. If the first code unit is set, req_cu
6921 must be later in the subject; otherwise the test starts at the match
6922 point. This optimization can save a huge amount of backtracking in
6923 patterns with nested unlimited repeats that aren't going to match.
6924 Writing separate code for cased/caseless versions makes it go faster, as
6925 does using an autoincrement and backing off on a match.
6926
6927 HOWEVER: when the subject string is very, very long, searching to its end
6928 can take a long time, and give bad performance on quite ordinary
6929 patterns. This showed up when somebody was matching something like
6930 /^\d+C/ on a 32-megabyte string... so we don't do this when the string is
6931 sufficiently long. */
6932
6933 if (has_req_cu && end_subject - start_match < REQ_CU_MAX)
6934 {
6935 register PCRE2_SPTR p = start_match + (has_first_cu? 1:0);
6936
6937 /* We don't need to repeat the search if we haven't yet reached the
6938 place we found it at last time. */
6939
6940 if (p > req_cu_ptr)
6941 {
6942 if (req_cu != req_cu2)
6943 {
6944 while (p < end_subject)
6945 {
6946 register uint32_t pp = UCHAR21INCTEST(p);
6947 if (pp == req_cu || pp == req_cu2) { p--; break; }
6948 }
6949 }
6950 else
6951 {
6952 while (p < end_subject)
6953 {
6954 if (UCHAR21INCTEST(p) == req_cu) { p--; break; }
6955 }
6956 }
6957
6958 /* If we can't find the required code unit, break the matching loop,
6959 forcing a match failure. */
6960
6961 if (p >= end_subject)
6962 {
6963 rc = MATCH_NOMATCH;
6964 break;
6965 }
6966
6967 /* If we have found the required code unit, save the point where we
6968 found it, so that we don't search again next time round the loop if
6969 the start hasn't passed this code unit yet. */
6970
6971 req_cu_ptr = p;
6972 }
6973 }
6974 }
6975 }
6976
6977 /* ------------ End of start of match optimizations ------------ */
6978
6979 /* Give no match if we have passed the bumpalong limit. */
6980
6981 if (start_match > bumpalong_limit)
6982 {
6983 rc = MATCH_NOMATCH;
6984 break;
6985 }
6986
6987 /* OK, we can now run the match. If "hitend" is set afterwards, remember the
6988 first starting point for which a partial match was found. */
6989
6990 mb->start_match_ptr = start_match;
6991 mb->start_used_ptr = start_match;
6992 mb->last_used_ptr = start_match;
6993 mb->match_call_count = 0;
6994 mb->match_function_type = 0;
6995 mb->end_offset_top = 0;
6996 mb->skip_arg_count = 0;
6997 rc = match(start_match, mb->start_code, start_match, 2, mb, NULL, 0);
6998
6999 if (mb->hitend && start_partial == NULL)
7000 {
7001 start_partial = mb->start_used_ptr;
7002 match_partial = start_match;
7003 }
7004
7005 switch(rc)
7006 {
7007 /* If MATCH_SKIP_ARG reaches this level it means that a MARK that matched
7008 the SKIP's arg was not found. In this circumstance, Perl ignores the SKIP
7009 entirely. The only way we can do that is to re-do the match at the same
7010 point, with a flag to force SKIP with an argument to be ignored. Just
7011 treating this case as NOMATCH does not work because it does not check other
7012 alternatives in patterns such as A(*SKIP:A)B|AC when the subject is AC. */
7013
7014 case MATCH_SKIP_ARG:
7015 new_start_match = start_match;
7016 mb->ignore_skip_arg = mb->skip_arg_count;
7017 break;
7018
7019 /* SKIP passes back the next starting point explicitly, but if it is no
7020 greater than the match we have just done, treat it as NOMATCH. */
7021
7022 case MATCH_SKIP:
7023 if (mb->start_match_ptr > start_match)
7024 {
7025 new_start_match = mb->start_match_ptr;
7026 break;
7027 }
7028 /* Fall through */
7029
7030 /* NOMATCH and PRUNE advance by one character. THEN at this level acts
7031 exactly like PRUNE. Unset ignore SKIP-with-argument. */
7032
7033 case MATCH_NOMATCH:
7034 case MATCH_PRUNE:
7035 case MATCH_THEN:
7036 mb->ignore_skip_arg = 0;
7037 new_start_match = start_match + 1;
7038 #ifdef SUPPORT_UNICODE
7039 if (utf)
7040 ACROSSCHAR(new_start_match < end_subject, *new_start_match,
7041 new_start_match++);
7042 #endif
7043 break;
7044
7045 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
7046
7047 case MATCH_COMMIT:
7048 rc = MATCH_NOMATCH;
7049 goto ENDLOOP;
7050
7051 /* Any other return is either a match, or some kind of error. */
7052
7053 default:
7054 goto ENDLOOP;
7055 }
7056
7057 /* Control reaches here for the various types of "no match at this point"
7058 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
7059
7060 rc = MATCH_NOMATCH;
7061
7062 /* If PCRE2_FIRSTLINE is set, the match must happen before or at the first
7063 newline in the subject (though it may continue over the newline). Therefore,
7064 if we have just failed to match, starting at a newline, do not continue. */
7065
7066 if (firstline && IS_NEWLINE(start_match)) break;
7067
7068 /* Advance to new matching position */
7069
7070 start_match = new_start_match;
7071
7072 /* Break the loop if the pattern is anchored or if we have passed the end of
7073 the subject. */
7074
7075 if (anchored || start_match > end_subject) break;
7076
7077 /* If we have just passed a CR and we are now at a LF, and the pattern does
7078 not contain any explicit matches for \r or \n, and the newline option is CRLF
7079 or ANY or ANYCRLF, advance the match position by one more code unit. In
7080 normal matching start_match will always be greater than the first position at
7081 this stage, but a failed *SKIP can cause a return at the same point, which is
7082 why the first test exists. */
7083
7084 if (start_match > subject + start_offset &&
7085 start_match[-1] == CHAR_CR &&
7086 start_match < end_subject &&
7087 *start_match == CHAR_NL &&
7088 (re->flags & PCRE2_HASCRORLF) == 0 &&
7089 (mb->nltype == NLTYPE_ANY ||
7090 mb->nltype == NLTYPE_ANYCRLF ||
7091 mb->nllen == 2))
7092 start_match++;
7093
7094 mb->mark = NULL; /* Reset for start of next match attempt */
7095 } /* End of for(;;) "bumpalong" loop */
7096
7097 /* ==========================================================================*/
7098
7099 /* When we reach here, one of the stopping conditions is true:
7100
7101 (1) The match succeeded, either completely, or partially;
7102
7103 (2) The pattern is anchored or the match was failed by (*COMMIT);
7104
7105 (3) We are past the end of the subject or the bumpalong limit;
7106
7107 (4) PCRE2_FIRSTLINE is set and we have failed to match at a newline, because
7108 this option requests that a match occur at or before the first newline in
7109 the subject.
7110
7111 (5) Some kind of error occurred.
7112
7113 */
7114
7115 ENDLOOP:
7116
7117 #ifdef HEAP_MATCH_RECURSE
7118 release_match_heapframes(&frame_zero, mb);
7119 #endif
7120
7121 /* Release any frames that were saved from recursions. */
7122
7123 while (mb->ovecsave_chain != NULL)
7124 {
7125 ovecsave_frame *this = mb->ovecsave_chain;
7126 mb->ovecsave_chain = this->next;
7127 mb->memctl.free(this, mb->memctl.memory_data);
7128 }
7129
7130 /* Fill in fields that are always returned in the match data. */
7131
7132 match_data->code = re;
7133 match_data->subject = subject;
7134 match_data->mark = mb->mark;
7135 match_data->matchedby = PCRE2_MATCHEDBY_INTERPRETER;
7136
7137 /* Handle a fully successful match. */
7138
7139 if (rc == MATCH_MATCH || rc == MATCH_ACCEPT)
7140 {
7141 uint32_t arg_offset_max = 2 * match_data->oveccount;
7142
7143 /* When the offset vector is big enough to deal with any backreferences,
7144 captured substring offsets will already be set up. In the case where we had
7145 to get some local memory to hold offsets for backreference processing, copy
7146 those that we can. In this case there need not be overflow if certain parts
7147 of the pattern were not used, even though there are more capturing
7148 parentheses than vector slots. */
7149
7150 if (using_temporary_offsets)
7151 {
7152 if (arg_offset_max >= 4)
7153 {
7154 memcpy(match_data->ovector + 2, mb->ovector + 2,
7155 (arg_offset_max - 2) * sizeof(PCRE2_SIZE));
7156 }
7157 if (mb->end_offset_top > arg_offset_max) mb->capture_last |= OVFLBIT;
7158 mb->memctl.free(mb->ovector, mb->memctl.memory_data);
7159 }
7160
7161 /* Set the return code to the number of captured strings, or 0 if there were
7162 too many to fit into the ovector. */
7163
7164 match_data->rc = ((mb->capture_last & OVFLBIT) != 0)?
7165 0 : mb->end_offset_top/2;
7166
7167 /* If there is space in the offset vector, set any pairs that follow the
7168 highest-numbered captured string but are less than the number of capturing
7169 groups in the pattern (and are within the ovector) to PCRE2_UNSET. It is
7170 documented that this happens. In earlier versions, the whole set of potential
7171 capturing offsets was initialized each time round the loop, but this is
7172 handled differently now. "Gaps" are set to PCRE2_UNSET dynamically instead
7173 (this fixed a bug). Thus, it is only those at the end that need setting here.
7174 We can't just mark them all unset at the start of the whole thing because
7175 they may get set in one branch that is not the final matching branch. */
7176
7177 if (mb->end_offset_top/2 <= re->top_bracket)
7178 {
7179 register PCRE2_SIZE *iptr, *iend;
7180 int resetcount = re->top_bracket + 1;
7181 if (resetcount > match_data->oveccount) resetcount = match_data->oveccount;
7182 iptr = match_data->ovector + mb->end_offset_top;
7183 iend = match_data->ovector + 2 * resetcount;
7184 while (iptr < iend) *iptr++ = PCRE2_UNSET;
7185 }
7186
7187 /* If there is space, set up the whole thing as substring 0. The value of
7188 mb->start_match_ptr might be modified if \K was encountered on the successful
7189 matching path. */
7190
7191 if (match_data->oveccount < 1) rc = 0; else
7192 {
7193 match_data->ovector[0] = mb->start_match_ptr - mb->start_subject;
7194 match_data->ovector[1] = mb->end_match_ptr - mb->start_subject;
7195 }
7196
7197 /* Set the remaining returned values */
7198
7199 match_data->startchar = start_match - subject;
7200 match_data->leftchar = mb->start_used_ptr - subject;
7201 match_data->rightchar = ((mb->last_used_ptr > mb->end_match_ptr)?
7202 mb->last_used_ptr : mb->end_match_ptr) - subject;
7203 return match_data->rc;
7204 }
7205
7206 /* Control gets here if there has been a partial match, an error, or if the
7207 overall match attempt has failed at all permitted starting positions. Any mark
7208 data is in the nomatch_mark field. */
7209
7210 match_data->mark = mb->nomatch_mark;
7211
7212 /* For anything other than nomatch or partial match, just return the code. */
7213
7214 if (rc != MATCH_NOMATCH && rc != PCRE2_ERROR_PARTIAL)
7215 match_data->rc = rc;
7216
7217 /* Else handle a partial match. */
7218
7219 else if (match_partial != NULL)
7220 {
7221 if (match_data->oveccount > 0)
7222 {
7223 match_data->ovector[0] = match_partial - subject;
7224 match_data->ovector[1] = end_subject - subject;
7225 }
7226 match_data->startchar = match_partial - subject;
7227 match_data->leftchar = start_partial - subject;
7228 match_data->rightchar = end_subject - subject;
7229 match_data->rc = PCRE2_ERROR_PARTIAL;
7230 }
7231
7232 /* Else this is the classic nomatch case. */
7233
7234 else match_data->rc = PCRE2_ERROR_NOMATCH;
7235
7236 /* Free any temporary offsets. */
7237
7238 if (using_temporary_offsets)
7239 mb->memctl.free(mb->ovector, mb->memctl.memory_data);
7240 return match_data->rc;
7241 }
7242
7243 /* End of pcre2_match.c */
7244