1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Original API code Copyright (c) 1997-2012 University of Cambridge
10 New API code Copyright (c) 2015-2019 University of Cambridge
11
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
15
16 * Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
18
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
22
23 * Neither the name of the University of Cambridge nor the names of its
24 contributors may be used to endorse or promote products derived from
25 this software without specific prior written permission.
26
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
39 */
40
41
42 #ifdef HAVE_CONFIG_H
43 #include "config.h"
44 #endif
45
46 /* These defines enable debugging code */
47
48 /* #define DEBUG_FRAMES_DISPLAY */
49 /* #define DEBUG_SHOW_OPS */
50 /* #define DEBUG_SHOW_RMATCH */
51
/* <stdarg.h> is needed only by the variadic display_frames() debugging
function, which is compiled when DEBUG_FRAMES_DISPLAY is defined. The guard
must therefore test exactly that macro name. */

#ifdef DEBUG_FRAMES_DISPLAY
#include <stdarg.h>
#endif
55
56 /* These defines identify the name of the block containing "static"
57 information, and fields within it. */
58
59 #define NLBLOCK mb /* Block containing newline information */
60 #define PSSTART start_subject /* Field containing processed string start */
61 #define PSEND end_subject /* Field containing processed string end */
62
63 #include "pcre2_internal.h"
64
/* Value of a frame's current_recurse field when no pattern recursion is in
progress. */

#define RECURSE_UNSET 0xffffffffu  /* Bigger than max group number */

/* The set of public option bits that are accepted at match time. */

#define PUBLIC_MATCH_OPTIONS \
  (PCRE2_ANCHORED             | \
   PCRE2_ENDANCHORED          | \
   PCRE2_NOTBOL               | \
   PCRE2_NOTEOL               | \
   PCRE2_NOTEMPTY             | \
   PCRE2_NOTEMPTY_ATSTART     | \
   PCRE2_NO_UTF_CHECK         | \
   PCRE2_PARTIAL_HARD         | \
   PCRE2_PARTIAL_SOFT         | \
   PCRE2_NO_JIT               | \
   PCRE2_COPY_MATCHED_SUBJECT)

/* The corresponding set for the JIT matching path. */

#define PUBLIC_JIT_MATCH_OPTIONS \
  (PCRE2_NO_UTF_CHECK         | \
   PCRE2_NOTBOL               | \
   PCRE2_NOTEOL               | \
   PCRE2_NOTEMPTY             | \
   PCRE2_NOTEMPTY_ATSTART     | \
   PCRE2_PARTIAL_SOFT         | \
   PCRE2_PARTIAL_HARD         | \
   PCRE2_COPY_MATCHED_SUBJECT)
78
/* Results that can come back from, or be passed around inside, the match()
function without signalling an error. Real errors use the externally defined
PCRE2_ERROR_xxx codes, which are all negative. */

#define MATCH_NOMATCH        0
#define MATCH_MATCH          1

/* Internal-only results used within match(). They are made sufficiently
negative that they stay clear of the external error codes. */

#define MATCH_ACCEPT         (-999)
#define MATCH_KETRPOS        (-998)

/* The five backtracking-control results. They must remain together and in
this sequence so that a test for "any one of them" can be written as a range
check between MATCH_BACKTRACK_MIN and MATCH_BACKTRACK_MAX. */

#define MATCH_COMMIT         (-997)
#define MATCH_PRUNE          (-996)
#define MATCH_SKIP           (-995)
#define MATCH_SKIP_ARG       (-994)
#define MATCH_THEN           (-993)

#define MATCH_BACKTRACK_MIN  MATCH_COMMIT
#define MATCH_BACKTRACK_MAX  MATCH_THEN
99
/* Values for a frame's group type. A value of zero means that the frame is
not a group frame. The type identity occupies the upper 16 bits and the lower
16 bits hold data (e.g. the capture number). Most groups get a group frame so
that, at the end of the group, information about its start can be found
directly instead of by scanning back through the intermediate frames
(backtrack points). */

#define GF_CAPTURE      0x00010000u
#define GF_NOCAPTURE    0x00020000u
#define GF_CONDASSERT   0x00030000u
#define GF_RECURSE      0x00040000u

/* Extract the identity part or the data part of a group frame type. */

#define GF_IDMASK(a)    ((a) & 0xffff0000u)
#define GF_DATAMASK(a)  ((a) & 0x0000ffffu)
115
/* The three kinds of repetition. */

enum {
  REPTYPE_MIN,
  REPTYPE_MAX,
  REPTYPE_POS
};

/* Minimum repeat counts for the common repeats. The two range entries are
placeholders only. */

static const uint32_t rep_min[] = {
  0,             /* *  */
  0,             /* *? */
  1,             /* +  */
  1,             /* +? */
  0,             /* ?  */
  0,             /* ?? */
  0,             /* dummy placefiller for OP_CRRANGE */
  0,             /* dummy placefiller for OP_CRMINRANGE */
  0,             /* OP_CRPOSSTAR */
  1,             /* OP_CRPOSPLUS */
  0              /* OP_CRPOSQUERY */
};

/* Maximum repeat counts for the common repeats; UINT32_MAX stands for
"infinity". The two range entries are placeholders only. */

static const uint32_t rep_max[] = {
  UINT32_MAX,    /* *  */
  UINT32_MAX,    /* *? */
  UINT32_MAX,    /* +  */
  UINT32_MAX,    /* +? */
  1,             /* ?  */
  1,             /* ?? */
  0,             /* dummy placefiller for OP_CRRANGE */
  0,             /* dummy placefiller for OP_CRMINRANGE */
  UINT32_MAX,    /* OP_CRPOSSTAR */
  UINT32_MAX,    /* OP_CRPOSPLUS */
  1              /* OP_CRPOSQUERY */
};

/* Repetition type for each repeat. Unlike the two tables above, this one has
an entry for OP_CRPOSRANGE as well. */

static const uint32_t rep_typ[] = {
  REPTYPE_MAX,   /* *  */
  REPTYPE_MIN,   /* *? */
  REPTYPE_MAX,   /* +  */
  REPTYPE_MIN,   /* +? */
  REPTYPE_MAX,   /* ?  */
  REPTYPE_MIN,   /* ?? */
  REPTYPE_MAX,   /* OP_CRRANGE */
  REPTYPE_MIN,   /* OP_CRMINRANGE */
  REPTYPE_POS,   /* OP_CRPOSSTAR */
  REPTYPE_POS,   /* OP_CRPOSPLUS */
  REPTYPE_POS,   /* OP_CRPOSQUERY */
  REPTYPE_POS    /* OP_CRPOSRANGE */
};
146
/* Identifying numbers for the RMATCH calls made at backtracking points. Any
change to these lists must be mirrored in the code at RETURN_SWITCH below. */

enum {
  RM1 = 1, RM2,  RM3,  RM4,  RM5,  RM6,
  RM7,     RM8,  RM9,  RM10, RM11, RM12,
  RM13,    RM14, RM15, RM16, RM17, RM18,
  RM19,    RM20, RM21, RM22, RM23, RM24,
  RM25,    RM26, RM27, RM28, RM29, RM30,
  RM31,    RM32, RM33, RM34, RM35, RM36
};

#ifdef SUPPORT_WIDE_CHARS
enum {
  RM100 = 100, RM101
};
#endif

#ifdef SUPPORT_UNICODE
enum {
  RM200 = 200, RM201, RM202, RM203, RM204, RM205,
  RM206,       RM207, RM208, RM209, RM210, RM211,
  RM212,       RM213, RM214, RM215, RM216, RM217,
  RM218,       RM219, RM220, RM221, RM222
};
#endif
164
/* Short names for the general fields of the current backtrack frame. The
current frame is always the one that the variable F points to, so each name
below is simply F->field. References to fields in other frames are written out
in full where they occur. The current frame also has some fields whose names
start with "temp"; these hold short-term, localised backtracking memory and
are given Lxxx names by #defines at the point of use, which are undefined
again afterwards. */

#define Fback_frame         F->back_frame
#define Fcapture_last       F->capture_last
#define Fcurrent_recurse    F->current_recurse
#define Fecode              F->ecode
#define Feptr               F->eptr
#define Fgroup_frame_type   F->group_frame_type
#define Flast_group_offset  F->last_group_offset
#define Flength             F->length
#define Fmark               F->mark
#define Foccu               F->occu
#define Foffset_top         F->offset_top
#define Fop                 F->op
#define Fovector            F->ovector
#define Frdepth             F->rdepth
#define Freturn_id          F->return_id
#define Fstart_match        F->start_match
188
189
190 #ifdef DEBUG_FRAMES_DISPLAY
191 /*************************************************
192 * Display current frames and contents *
193 *************************************************/
194
195 /* This debugging function displays the current set of frames and their
196 contents. It is not called automatically from anywhere, the intention being
197 that calls can be inserted where necessary when debugging frame-related
198 problems.
199
200 Arguments:
201 f the file to write to
202 F the current top frame
203 P a previous frame of interest
204 frame_size the frame size
205 mb points to the match block
206 s identification text
207
208 Returns: nothing
209 */
210
211 static void
display_frames(FILE * f,heapframe * F,heapframe * P,PCRE2_SIZE frame_size,match_block * mb,const char * s,...)212 display_frames(FILE *f, heapframe *F, heapframe *P, PCRE2_SIZE frame_size,
213 match_block *mb, const char *s, ...)
214 {
215 uint32_t i;
216 heapframe *Q;
217 va_list ap;
218 va_start(ap, s);
219
220 fprintf(f, "FRAMES ");
221 vfprintf(f, s, ap);
222 va_end(ap);
223
224 if (P != NULL) fprintf(f, " P=%lu",
225 ((char *)P - (char *)(mb->match_frames))/frame_size);
226 fprintf(f, "\n");
227
228 for (i = 0, Q = mb->match_frames;
229 Q <= F;
230 i++, Q = (heapframe *)((char *)Q + frame_size))
231 {
232 fprintf(f, "Frame %d type=%x subj=%lu code=%d back=%lu id=%d",
233 i, Q->group_frame_type, Q->eptr - mb->start_subject, *(Q->ecode),
234 Q->back_frame, Q->return_id);
235
236 if (Q->last_group_offset == PCRE2_UNSET)
237 fprintf(f, " lgoffset=unset\n");
238 else
239 fprintf(f, " lgoffset=%lu\n", Q->last_group_offset/frame_size);
240 }
241 }
242
243 #endif
244
245
246
247 /*************************************************
248 * Process a callout *
249 *************************************************/
250
/* This function is called for all callouts, whether "standalone" or at the
start of a conditional group. Fecode will be pointing to either OP_CALLOUT or
OP_CALLOUT_STR. A callout block is allocated in pcre2_match() and initialized
with fixed values.
255
256 Arguments:
257 F points to the current backtracking frame
258 mb points to the match block
259 lengthptr where to return the length of the callout item
260
261 Returns: the return from the callout
262 or 0 if no callout function exists
263 */
264
265 static int
do_callout(heapframe * F,match_block * mb,PCRE2_SIZE * lengthptr)266 do_callout(heapframe *F, match_block *mb, PCRE2_SIZE *lengthptr)
267 {
268 int rc;
269 PCRE2_SIZE save0, save1;
270 PCRE2_SIZE *callout_ovector;
271 pcre2_callout_block *cb;
272
273 *lengthptr = (*Fecode == OP_CALLOUT)?
274 PRIV(OP_lengths)[OP_CALLOUT] : GET(Fecode, 1 + 2*LINK_SIZE);
275
276 if (mb->callout == NULL) return 0; /* No callout function provided */
277
278 /* The original matching code (pre 10.30) worked directly with the ovector
279 passed by the user, and this was passed to callouts. Now that the working
280 ovector is in the backtracking frame, it no longer needs to reserve space for
281 the overall match offsets (which would waste space in the frame). For backward
282 compatibility, however, we pass capture_top and offset_vector to the callout as
283 if for the extended ovector, and we ensure that the first two slots are unset
284 by preserving and restoring their current contents. Picky compilers complain if
285 references such as Fovector[-2] are use directly, so we set up a separate
286 pointer. */
287
288 callout_ovector = (PCRE2_SIZE *)(Fovector) - 2;
289
290 /* The cb->version, cb->subject, cb->subject_length, and cb->start_match fields
291 are set externally. The first 3 never change; the last is updated for each
292 bumpalong. */
293
294 cb = mb->cb;
295 cb->capture_top = (uint32_t)Foffset_top/2 + 1;
296 cb->capture_last = Fcapture_last;
297 cb->offset_vector = callout_ovector;
298 cb->mark = mb->nomatch_mark;
299 cb->current_position = (PCRE2_SIZE)(Feptr - mb->start_subject);
300 cb->pattern_position = GET(Fecode, 1);
301 cb->next_item_length = GET(Fecode, 1 + LINK_SIZE);
302
303 if (*Fecode == OP_CALLOUT) /* Numerical callout */
304 {
305 cb->callout_number = Fecode[1 + 2*LINK_SIZE];
306 cb->callout_string_offset = 0;
307 cb->callout_string = NULL;
308 cb->callout_string_length = 0;
309 }
310 else /* String callout */
311 {
312 cb->callout_number = 0;
313 cb->callout_string_offset = GET(Fecode, 1 + 3*LINK_SIZE);
314 cb->callout_string = Fecode + (1 + 4*LINK_SIZE) + 1;
315 cb->callout_string_length =
316 *lengthptr - (1 + 4*LINK_SIZE) - 2;
317 }
318
319 save0 = callout_ovector[0];
320 save1 = callout_ovector[1];
321 callout_ovector[0] = callout_ovector[1] = PCRE2_UNSET;
322 rc = mb->callout(cb, mb->callout_data);
323 callout_ovector[0] = save0;
324 callout_ovector[1] = save1;
325 cb->callout_flags = 0;
326 return rc;
327 }
328
329
330
331 /*************************************************
332 * Match a back-reference *
333 *************************************************/
334
335 /* This function is called only when it is known that the offset lies within
336 the offsets that have so far been used in the match. Note that in caseless
337 UTF-8 mode, the number of subject bytes matched may be different to the number
338 of reference bytes. (In theory this could also happen in UTF-16 mode, but it
339 seems unlikely.)
340
341 Arguments:
342 offset index into the offset vector
343 caseless TRUE if caseless
344 F the current backtracking frame pointer
345 mb points to match block
346 lengthptr pointer for returning the length matched
347
Returns:     = 0 successful match; number of code units matched is set
349 < 0 no match
350 > 0 partial match
351 */
352
353 static int
match_ref(PCRE2_SIZE offset,BOOL caseless,heapframe * F,match_block * mb,PCRE2_SIZE * lengthptr)354 match_ref(PCRE2_SIZE offset, BOOL caseless, heapframe *F, match_block *mb,
355 PCRE2_SIZE *lengthptr)
356 {
357 PCRE2_SPTR p;
358 PCRE2_SIZE length;
359 PCRE2_SPTR eptr;
360 PCRE2_SPTR eptr_start;
361
362 /* Deal with an unset group. The default is no match, but there is an option to
363 match an empty string. */
364
365 if (offset >= Foffset_top || Fovector[offset] == PCRE2_UNSET)
366 {
367 if ((mb->poptions & PCRE2_MATCH_UNSET_BACKREF) != 0)
368 {
369 *lengthptr = 0;
370 return 0; /* Match */
371 }
372 else return -1; /* No match */
373 }
374
375 /* Separate the caseless and UTF cases for speed. */
376
377 eptr = eptr_start = Feptr;
378 p = mb->start_subject + Fovector[offset];
379 length = Fovector[offset+1] - Fovector[offset];
380
381 if (caseless)
382 {
383 #if defined SUPPORT_UNICODE
384 if ((mb->poptions & PCRE2_UTF) != 0)
385 {
386 /* Match characters up to the end of the reference. NOTE: the number of
387 code units matched may differ, because in UTF-8 there are some characters
388 whose upper and lower case codes have different numbers of bytes. For
389 example, U+023A (2 bytes in UTF-8) is the upper case version of U+2C65 (3
390 bytes in UTF-8); a sequence of 3 of the former uses 6 bytes, as does a
391 sequence of two of the latter. It is important, therefore, to check the
392 length along the reference, not along the subject (earlier code did this
393 wrong). */
394
395 PCRE2_SPTR endptr = p + length;
396 while (p < endptr)
397 {
398 uint32_t c, d;
399 const ucd_record *ur;
400 if (eptr >= mb->end_subject) return 1; /* Partial match */
401 GETCHARINC(c, eptr);
402 GETCHARINC(d, p);
403 ur = GET_UCD(d);
404 if (c != d && c != (uint32_t)((int)d + ur->other_case))
405 {
406 const uint32_t *pp = PRIV(ucd_caseless_sets) + ur->caseset;
407 for (;;)
408 {
409 if (c < *pp) return -1; /* No match */
410 if (c == *pp++) break;
411 }
412 }
413 }
414 }
415 else
416 #endif
417
418 /* Not in UTF mode */
419
420 {
421 for (; length > 0; length--)
422 {
423 uint32_t cc, cp;
424 if (eptr >= mb->end_subject) return 1; /* Partial match */
425 cc = UCHAR21TEST(eptr);
426 cp = UCHAR21TEST(p);
427 if (TABLE_GET(cp, mb->lcc, cp) != TABLE_GET(cc, mb->lcc, cc))
428 return -1; /* No match */
429 p++;
430 eptr++;
431 }
432 }
433 }
434
435 /* In the caseful case, we can just compare the code units, whether or not we
436 are in UTF mode. When partial matching, we have to do this unit-by-unit. */
437
438 else
439 {
440 if (mb->partial != 0)
441 {
442 for (; length > 0; length--)
443 {
444 if (eptr >= mb->end_subject) return 1; /* Partial match */
445 if (UCHAR21INCTEST(p) != UCHAR21INCTEST(eptr)) return -1; /* No match */
446 }
447 }
448
449 /* Not partial matching */
450
451 else
452 {
453 if ((PCRE2_SIZE)(mb->end_subject - eptr) < length) return 1; /* Partial */
454 if (memcmp(p, eptr, CU2BYTES(length)) != 0) return -1; /* No match */
455 eptr += length;
456 }
457 }
458
459 *lengthptr = eptr - eptr_start;
460 return 0; /* Match */
461 }
462
463
464
465 /******************************************************************************
466 *******************************************************************************
467 "Recursion" in the match() function
468
469 The original match() function was highly recursive, but this proved to be the
470 source of a number of problems over the years, mostly because of the relatively
471 small system stacks that are commonly found. As new features were added to
472 patterns, various kludges were invented to reduce the amount of stack used,
473 making the code hard to understand in places.
474
475 A version did exist that used individual frames on the heap instead of calling
476 match() recursively, but this ran substantially slower. The current version is
477 a refactoring that uses a vector of frames to remember backtracking points.
478 This runs no slower, and possibly even a bit faster than the original recursive
479 implementation. An initial vector of size START_FRAMES_SIZE (enough for maybe
480 50 frames) is allocated on the system stack. If this is not big enough, the
481 heap is used for a larger vector.
482
483 *******************************************************************************
484 ******************************************************************************/
485
486
487
488
489 /*************************************************
490 * Macros for the match() function *
491 *************************************************/
492
493 /* These macros pack up tests that are used for partial matching several times
494 in the code. We set the "hit end" flag if the pointer is at the end of the
495 subject and also past the earliest inspected character (i.e. something has been
496 matched, even if not part of the actual matched string). For hard partial
497 matching, we then return immediately. The second one is used when we already
498 know we are past the end of the subject. */
499
/* CHECK_PARTIAL: if partial matching is enabled, the subject pointer has
reached the end of the subject, and it is beyond start_used_ptr, set the
"hit end" flag. When mb->partial is greater than 1, return
PCRE2_ERROR_PARTIAL from the enclosing function immediately. */

#define CHECK_PARTIAL() \
  if (mb->partial != 0 && \
      Feptr >= mb->end_subject && \
      Feptr > mb->start_used_ptr) \
    { \
    mb->hitend = TRUE; \
    if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; \
    }

/* SCHECK_PARTIAL: the same, for use where it is already known that the end of
the subject has been reached, so that test is omitted. */

#define SCHECK_PARTIAL() \
  if (mb->partial != 0 && \
      Feptr > mb->start_used_ptr) \
    { \
    mb->hitend = TRUE; \
    if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; \
    }

/* These macros are used to implement backtracking. They simulate a recursive
call to the match() function by means of a local vector of frames which
remember the backtracking points.

RMATCH(ra,rb) records ra as the code position to start from and rb as the
return identifier of the current frame, then jumps to MATCH_RECURSE. It also
plants the label L_<rb> immediately after the jump.

RRETURN(ra) stores ra in rrc and jumps to RETURN_SWITCH. */

#define RMATCH(ra,rb) \
  { \
  start_ecode = (ra); \
  Freturn_id = (rb); \
  goto MATCH_RECURSE; \
  L_##rb:; \
  }

#define RRETURN(ra) \
  { \
  rrc = (ra); \
  goto RETURN_SWITCH; \
  }
532
533
534
535 /*************************************************
536 * Match from current position *
537 *************************************************/
538
539 /* This function is called to run one match attempt at a single starting point
540 in the subject.
541
542 Performance note: It might be tempting to extract commonly used fields from the
543 mb structure (e.g. end_subject) into individual variables to improve
544 performance. Tests using gcc on a SPARC disproved this; in the first case, it
545 made performance worse.
546
547 Arguments:
548 start_eptr starting character in subject
549 start_ecode starting position in compiled code
550 ovector pointer to the final output vector
551 oveccount number of pairs in ovector
552 top_bracket number of capturing parentheses in the pattern
553 frame_size size of each backtracking frame
554 mb pointer to "static" variables block
555
556 Returns: MATCH_MATCH if matched ) these values are >= 0
557 MATCH_NOMATCH if failed to match )
558 negative MATCH_xxx value for PRUNE, SKIP, etc
559 negative PCRE2_ERROR_xxx value if aborted by an error condition
560 (e.g. stopped by repeated call or depth limit)
561 */
562
563 static int
match(PCRE2_SPTR start_eptr,PCRE2_SPTR start_ecode,PCRE2_SIZE * ovector,uint16_t oveccount,uint16_t top_bracket,PCRE2_SIZE frame_size,match_block * mb)564 match(PCRE2_SPTR start_eptr, PCRE2_SPTR start_ecode, PCRE2_SIZE *ovector,
565 uint16_t oveccount, uint16_t top_bracket, PCRE2_SIZE frame_size,
566 match_block *mb)
567 {
568 /* Frame-handling variables */
569
570 heapframe *F; /* Current frame pointer */
571 heapframe *N = NULL; /* Temporary frame pointers */
572 heapframe *P = NULL;
573 heapframe *assert_accept_frame; /* For passing back the frame with captures */
574 PCRE2_SIZE frame_copy_size; /* Amount to copy when creating a new frame */
575
/* Local variables that do not need to be preserved over calls to RMATCH(). */
577
578 PCRE2_SPTR bracode; /* Temp pointer to start of group */
579 PCRE2_SIZE offset; /* Used for group offsets */
580 PCRE2_SIZE length; /* Used for various length calculations */
581
582 int rrc; /* Return from functions & backtracking "recursions" */
583 #ifdef SUPPORT_UNICODE
584 int proptype; /* Type of character property */
585 #endif
586
587 uint32_t i; /* Used for local loops */
588 uint32_t fc; /* Character values */
589 uint32_t number; /* Used for group and other numbers */
590 uint32_t reptype = 0; /* Type of repetition (0 to avoid compiler warning) */
591 uint32_t group_frame_type; /* Specifies type for new group frames */
592
593 BOOL condition; /* Used in conditional groups */
594 BOOL cur_is_word; /* Used in "word" tests */
595 BOOL prev_is_word; /* Used in "word" tests */
596
597 /* UTF flag */
598
599 #ifdef SUPPORT_UNICODE
600 BOOL utf = (mb->poptions & PCRE2_UTF) != 0;
601 #else
602 BOOL utf = FALSE;
603 #endif
604
605 /* This is the length of the last part of a backtracking frame that must be
606 copied when a new frame is created. */
607
608 frame_copy_size = frame_size - offsetof(heapframe, eptr);
609
610 /* Set up the first current frame at the start of the vector, and initialize
611 fields that are not reset for new frames. */
612
613 F = mb->match_frames;
614 Frdepth = 0; /* "Recursion" depth */
615 Fcapture_last = 0; /* Number of most recent capture */
616 Fcurrent_recurse = RECURSE_UNSET; /* Not pattern recursing. */
617 Fstart_match = Feptr = start_eptr; /* Current data pointer and start match */
618 Fmark = NULL; /* Most recent mark */
619 Foffset_top = 0; /* End of captures within the frame */
620 Flast_group_offset = PCRE2_UNSET; /* Saved frame of most recent group */
621 group_frame_type = 0; /* Not a start of group frame */
622 goto NEW_FRAME; /* Start processing with this frame */
623
624 /* Come back here when we want to create a new frame for remembering a
625 backtracking point. */
626
627 MATCH_RECURSE:
628
629 /* Set up a new backtracking frame. If the vector is full, get a new one
630 on the heap, doubling the size, but constrained by the heap limit. */
631
632 N = (heapframe *)((char *)F + frame_size);
633 if (N >= mb->match_frames_top)
634 {
635 PCRE2_SIZE newsize = mb->frame_vector_size * 2;
636 heapframe *new;
637
638 if ((newsize / 1024) > mb->heap_limit)
639 {
640 PCRE2_SIZE maxsize = ((mb->heap_limit * 1024)/frame_size) * frame_size;
641 if (mb->frame_vector_size >= maxsize) return PCRE2_ERROR_HEAPLIMIT;
642 newsize = maxsize;
643 }
644
645 new = mb->memctl.malloc(newsize, mb->memctl.memory_data);
646 if (new == NULL) return PCRE2_ERROR_NOMEMORY;
647 memcpy(new, mb->match_frames, mb->frame_vector_size);
648
649 F = (heapframe *)((char *)new + ((char *)F - (char *)mb->match_frames));
650 N = (heapframe *)((char *)F + frame_size);
651
652 if (mb->match_frames != mb->stack_frames)
653 mb->memctl.free(mb->match_frames, mb->memctl.memory_data);
654 mb->match_frames = new;
655 mb->match_frames_top = (heapframe *)((char *)mb->match_frames + newsize);
656 mb->frame_vector_size = newsize;
657 }
658
659 #ifdef DEBUG_SHOW_RMATCH
660 fprintf(stderr, "++ RMATCH %2d frame=%d", Freturn_id, Frdepth + 1);
661 if (group_frame_type != 0)
662 {
663 fprintf(stderr, " type=%x ", group_frame_type);
664 switch (GF_IDMASK(group_frame_type))
665 {
666 case GF_CAPTURE:
667 fprintf(stderr, "capture=%d", GF_DATAMASK(group_frame_type));
668 break;
669
670 case GF_NOCAPTURE:
671 fprintf(stderr, "nocapture op=%d", GF_DATAMASK(group_frame_type));
672 break;
673
674 case GF_CONDASSERT:
675 fprintf(stderr, "condassert op=%d", GF_DATAMASK(group_frame_type));
676 break;
677
678 case GF_RECURSE:
679 fprintf(stderr, "recurse=%d", GF_DATAMASK(group_frame_type));
680 break;
681
682 default:
683 fprintf(stderr, "*** unknown ***");
684 break;
685 }
686 }
687 fprintf(stderr, "\n");
688 #endif
689
690 /* Copy those fields that must be copied into the new frame, increase the
691 "recursion" depth (i.e. the new frame's index) and then make the new frame
692 current. */
693
694 memcpy((char *)N + offsetof(heapframe, eptr),
695 (char *)F + offsetof(heapframe, eptr),
696 frame_copy_size);
697
698 N->rdepth = Frdepth + 1;
699 F = N;
700
701 /* Carry on processing with a new frame. */
702
703 NEW_FRAME:
704 Fgroup_frame_type = group_frame_type;
705 Fecode = start_ecode; /* Starting code pointer */
706 Fback_frame = frame_size; /* Default is go back one frame */
707
708 /* If this is a special type of group frame, remember its offset for quick
709 access at the end of the group. If this is a recursion, set a new current
710 recursion value. */
711
712 if (group_frame_type != 0)
713 {
714 Flast_group_offset = (char *)F - (char *)mb->match_frames;
715 if (GF_IDMASK(group_frame_type) == GF_RECURSE)
716 Fcurrent_recurse = GF_DATAMASK(group_frame_type);
717 group_frame_type = 0;
718 }
719
720
721 /* ========================================================================= */
722 /* This is the main processing loop. First check that we haven't recorded too
723 many backtracks (search tree is too large), or that we haven't exceeded the
724 recursive depth limit (used too many backtracking frames). If not, process the
725 opcodes. */
726
727 if (mb->match_call_count++ >= mb->match_limit) return PCRE2_ERROR_MATCHLIMIT;
728 if (Frdepth >= mb->match_limit_depth) return PCRE2_ERROR_DEPTHLIMIT;
729
730 for (;;)
731 {
732 #ifdef DEBUG_SHOW_OPS
733 fprintf(stderr, "++ op=%d\n", *Fecode);
734 #endif
735
736 Fop = (uint8_t)(*Fecode); /* Cast needed for 16-bit and 32-bit modes */
737 switch(Fop)
738 {
739 /* ===================================================================== */
740 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes, to close
741 any currently open capturing brackets. Unlike reaching the end of a group,
742 where we know the starting frame is at the top of the chained frames, in
743 this case we have to search back for the relevant frame in case other types
744 of group that use chained frames have intervened. Multiple OP_CLOSEs always
745 come innermost first, which matches the chain order. We can ignore this in
746 a recursion, because captures are not passed out of recursions. */
747
748 case OP_CLOSE:
749 if (Fcurrent_recurse == RECURSE_UNSET)
750 {
751 number = GET2(Fecode, 1);
752 offset = Flast_group_offset;
753 for(;;)
754 {
755 if (offset == PCRE2_UNSET) return PCRE2_ERROR_INTERNAL;
756 N = (heapframe *)((char *)mb->match_frames + offset);
757 P = (heapframe *)((char *)N - frame_size);
758 if (N->group_frame_type == (GF_CAPTURE | number)) break;
759 offset = P->last_group_offset;
760 }
761 offset = (number << 1) - 2;
762 Fcapture_last = number;
763 Fovector[offset] = P->eptr - mb->start_subject;
764 Fovector[offset+1] = Feptr - mb->start_subject;
765 if (offset >= Foffset_top) Foffset_top = offset + 2;
766 }
767 Fecode += PRIV(OP_lengths)[*Fecode];
768 break;
769
770
771 /* ===================================================================== */
772 /* Real or forced end of the pattern, assertion, or recursion. In an
773 assertion ACCEPT, update the last used pointer and remember the current
774 frame so that the captures and mark can be fished out of it. */
775
776 case OP_ASSERT_ACCEPT:
777 if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr;
778 assert_accept_frame = F;
779 RRETURN(MATCH_ACCEPT);
780
781 /* If recursing, we have to find the most recent recursion. */
782
783 case OP_ACCEPT:
784 case OP_END:
785
786 /* Handle end of a recursion. */
787
788 if (Fcurrent_recurse != RECURSE_UNSET)
789 {
790 offset = Flast_group_offset;
791 for(;;)
792 {
793 if (offset == PCRE2_UNSET) return PCRE2_ERROR_INTERNAL;
794 N = (heapframe *)((char *)mb->match_frames + offset);
795 P = (heapframe *)((char *)N - frame_size);
796 if (GF_IDMASK(N->group_frame_type) == GF_RECURSE) break;
797 offset = P->last_group_offset;
798 }
799
800 /* N is now the frame of the recursion; the previous frame is at the
801 OP_RECURSE position. Go back there, copying the current subject position
802 and mark, and move on past the OP_RECURSE. */
803
804 P->eptr = Feptr;
805 P->mark = Fmark;
806 F = P;
807 Fecode += 1 + LINK_SIZE;
808 continue;
809 }
810
811 /* Not a recursion. Fail for an empty string match if either PCRE2_NOTEMPTY
812 is set, or if PCRE2_NOTEMPTY_ATSTART is set and we have matched at the
813 start of the subject. In both cases, backtracking will then try other
814 alternatives, if any. */
815
816 if (Feptr == Fstart_match &&
817 ((mb->moptions & PCRE2_NOTEMPTY) != 0 ||
818 ((mb->moptions & PCRE2_NOTEMPTY_ATSTART) != 0 &&
819 Fstart_match == mb->start_subject + mb->start_offset)))
820 RRETURN(MATCH_NOMATCH);
821
822 /* Also fail if PCRE2_ENDANCHORED is set and the end of the match is not
823 the end of the subject. After (*ACCEPT) we fail the entire match (at this
824 position) but backtrack on reaching the end of the pattern. */
825
826 if (Feptr < mb->end_subject &&
827 ((mb->moptions | mb->poptions) & PCRE2_ENDANCHORED) != 0)
828 {
829 if (Fop == OP_END) RRETURN(MATCH_NOMATCH);
830 return MATCH_NOMATCH;
831 }
832
833 /* We have a successful match of the whole pattern. Record the result and
834 then do a direct return from the function. If there is space in the offset
835 vector, set any pairs that follow the highest-numbered captured string but
836 are less than the number of capturing groups in the pattern to PCRE2_UNSET.
837 It is documented that this happens. "Gaps" are set to PCRE2_UNSET
838 dynamically. It is only those at the end that need setting here. */
839
840 mb->end_match_ptr = Feptr; /* Record where we ended */
841 mb->end_offset_top = Foffset_top; /* and how many extracts were taken */
842 mb->mark = Fmark; /* and the last success mark */
843 if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr;
844
845 ovector[0] = Fstart_match - mb->start_subject;
846 ovector[1] = Feptr - mb->start_subject;
847
848 /* Set i to the smaller of the sizes of the external and frame ovectors. */
849
850 i = 2 * ((top_bracket + 1 > oveccount)? oveccount : top_bracket + 1);
851 memcpy(ovector + 2, Fovector, (i - 2) * sizeof(PCRE2_SIZE));
852 while (--i >= Foffset_top + 2) ovector[i] = PCRE2_UNSET;
853 return MATCH_MATCH; /* Note: NOT RRETURN */
854
855
856 /*===================================================================== */
857 /* Match any single character type except newline; have to take care with
858 CRLF newlines and partial matching. */
859
860 case OP_ANY:
861 if (IS_NEWLINE(Feptr)) RRETURN(MATCH_NOMATCH);
862 if (mb->partial != 0 &&
863 Feptr == mb->end_subject - 1 &&
864 NLBLOCK->nltype == NLTYPE_FIXED &&
865 NLBLOCK->nllen == 2 &&
866 UCHAR21TEST(Feptr) == NLBLOCK->nl[0])
867 {
868 mb->hitend = TRUE;
869 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
870 }
871 /* Fall through */
872
873 /* Match any single character whatsoever. */
874
875 case OP_ALLANY:
876 if (Feptr >= mb->end_subject) /* DO NOT merge the Feptr++ here; it must */
877 { /* not be updated before SCHECK_PARTIAL. */
878 SCHECK_PARTIAL();
879 RRETURN(MATCH_NOMATCH);
880 }
881 Feptr++;
882 #ifdef SUPPORT_UNICODE
883 if (utf) ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++);
884 #endif
885 Fecode++;
886 break;
887
888
889 /* ===================================================================== */
890 /* Match a single code unit, even in UTF mode. This opcode really does
891 match any code unit, even newline. (It really should be called ANYCODEUNIT,
892 of course - the byte name is from pre-16 bit days.) */
893
894 case OP_ANYBYTE:
895 if (Feptr >= mb->end_subject) /* DO NOT merge the Feptr++ here; it must */
896 { /* not be updated before SCHECK_PARTIAL. */
897 SCHECK_PARTIAL();
898 RRETURN(MATCH_NOMATCH);
899 }
900 Feptr++;
901 Fecode++;
902 break;
903
904
905 /* ===================================================================== */
906 /* Match a single character, casefully */
907
908 case OP_CHAR:
909 #ifdef SUPPORT_UNICODE
910 if (utf)
911 {
912 Flength = 1;
913 Fecode++;
914 GETCHARLEN(fc, Fecode, Flength);
915 if (Flength > (PCRE2_SIZE)(mb->end_subject - Feptr))
916 {
917 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
918 RRETURN(MATCH_NOMATCH);
919 }
920 for (; Flength > 0; Flength--)
921 {
922 if (*Fecode++ != UCHAR21INC(Feptr)) RRETURN(MATCH_NOMATCH);
923 }
924 }
925 else
926 #endif
927 /* Not UTF mode */
928 {
929 if (mb->end_subject - Feptr < 1)
930 {
931 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
932 RRETURN(MATCH_NOMATCH);
933 }
934 if (Fecode[1] != *Feptr++) RRETURN(MATCH_NOMATCH);
935 Fecode += 2;
936 }
937 break;
938
939
940 /* ===================================================================== */
941 /* Match a single character, caselessly. If we are at the end of the
942 subject, give up immediately. We get here only when the pattern character
943 has at most one other case. Characters with more than two cases are coded
944 as OP_PROP with the pseudo-property PT_CLIST. */
945
946 case OP_CHARI:
947 if (Feptr >= mb->end_subject)
948 {
949 SCHECK_PARTIAL();
950 RRETURN(MATCH_NOMATCH);
951 }
952
953 #ifdef SUPPORT_UNICODE
954 if (utf)
955 {
956 Flength = 1;
957 Fecode++;
958 GETCHARLEN(fc, Fecode, Flength);
959
960 /* If the pattern character's value is < 128, we know that its other case
961 (if any) is also < 128 (and therefore only one code unit long in all
962 code-unit widths), so we can use the fast lookup table. We checked above
963 that there is at least one character left in the subject. */
964
965 if (fc < 128)
966 {
967 uint32_t cc = UCHAR21(Feptr);
968 if (mb->lcc[fc] != TABLE_GET(cc, mb->lcc, cc)) RRETURN(MATCH_NOMATCH);
969 Fecode++;
970 Feptr++;
971 }
972
973 /* Otherwise we must pick up the subject character and use Unicode
974 property support to test its other case. Note that we cannot use the
975 value of "Flength" to check for sufficient bytes left, because the other
976 case of the character may have more or fewer code units. */
977
978 else
979 {
980 uint32_t dc;
981 GETCHARINC(dc, Feptr);
982 Fecode += Flength;
983 if (dc != fc && dc != UCD_OTHERCASE(fc)) RRETURN(MATCH_NOMATCH);
984 }
985 }
986 else
987 #endif /* SUPPORT_UNICODE */
988
989 /* Not UTF mode; use the table for characters < 256. */
990 {
991 if (TABLE_GET(Fecode[1], mb->lcc, Fecode[1])
992 != TABLE_GET(*Feptr, mb->lcc, *Feptr)) RRETURN(MATCH_NOMATCH);
993 Feptr++;
994 Fecode += 2;
995 }
996 break;
997
998
999 /* ===================================================================== */
1000 /* Match not a single character. */
1001
1002 case OP_NOT:
1003 case OP_NOTI:
1004 if (Feptr >= mb->end_subject)
1005 {
1006 SCHECK_PARTIAL();
1007 RRETURN(MATCH_NOMATCH);
1008 }
1009 #ifdef SUPPORT_UNICODE
1010 if (utf)
1011 {
1012 uint32_t ch;
1013 Fecode++;
1014 GETCHARINC(ch, Fecode);
1015 GETCHARINC(fc, Feptr);
1016 if (ch == fc)
1017 {
1018 RRETURN(MATCH_NOMATCH); /* Caseful match */
1019 }
1020 else if (Fop == OP_NOTI) /* If caseless */
1021 {
1022 if (ch > 127)
1023 ch = UCD_OTHERCASE(ch);
1024 else
1025 ch = TABLE_GET(ch, mb->fcc, ch);
1026 if (ch == fc) RRETURN(MATCH_NOMATCH);
1027 }
1028 }
1029 else
1030 #endif /* SUPPORT_UNICODE */
1031 {
1032 uint32_t ch = Fecode[1];
1033 fc = *Feptr++;
1034 if (ch == fc || (Fop == OP_NOTI && TABLE_GET(ch, mb->fcc, ch) == fc))
1035 RRETURN(MATCH_NOMATCH);
1036 Fecode += 2;
1037 }
1038 break;
1039
1040
1041 /* ===================================================================== */
1042 /* Match a single character repeatedly. */
1043
1044 #define Loclength F->temp_size
1045 #define Lstart_eptr F->temp_sptr[0]
1046 #define Lcharptr F->temp_sptr[1]
1047 #define Lmin F->temp_32[0]
1048 #define Lmax F->temp_32[1]
1049 #define Lc F->temp_32[2]
1050 #define Loc F->temp_32[3]
1051
1052 case OP_EXACT:
1053 case OP_EXACTI:
1054 Lmin = Lmax = GET2(Fecode, 1);
1055 Fecode += 1 + IMM2_SIZE;
1056 goto REPEATCHAR;
1057
1058 case OP_POSUPTO:
1059 case OP_POSUPTOI:
1060 reptype = REPTYPE_POS;
1061 Lmin = 0;
1062 Lmax = GET2(Fecode, 1);
1063 Fecode += 1 + IMM2_SIZE;
1064 goto REPEATCHAR;
1065
1066 case OP_UPTO:
1067 case OP_UPTOI:
1068 reptype = REPTYPE_MAX;
1069 Lmin = 0;
1070 Lmax = GET2(Fecode, 1);
1071 Fecode += 1 + IMM2_SIZE;
1072 goto REPEATCHAR;
1073
1074 case OP_MINUPTO:
1075 case OP_MINUPTOI:
1076 reptype = REPTYPE_MIN;
1077 Lmin = 0;
1078 Lmax = GET2(Fecode, 1);
1079 Fecode += 1 + IMM2_SIZE;
1080 goto REPEATCHAR;
1081
1082 case OP_POSSTAR:
1083 case OP_POSSTARI:
1084 reptype = REPTYPE_POS;
1085 Lmin = 0;
1086 Lmax = UINT32_MAX;
1087 Fecode++;
1088 goto REPEATCHAR;
1089
1090 case OP_POSPLUS:
1091 case OP_POSPLUSI:
1092 reptype = REPTYPE_POS;
1093 Lmin = 1;
1094 Lmax = UINT32_MAX;
1095 Fecode++;
1096 goto REPEATCHAR;
1097
1098 case OP_POSQUERY:
1099 case OP_POSQUERYI:
1100 reptype = REPTYPE_POS;
1101 Lmin = 0;
1102 Lmax = 1;
1103 Fecode++;
1104 goto REPEATCHAR;
1105
1106 case OP_STAR:
1107 case OP_STARI:
1108 case OP_MINSTAR:
1109 case OP_MINSTARI:
1110 case OP_PLUS:
1111 case OP_PLUSI:
1112 case OP_MINPLUS:
1113 case OP_MINPLUSI:
1114 case OP_QUERY:
1115 case OP_QUERYI:
1116 case OP_MINQUERY:
1117 case OP_MINQUERYI:
1118 fc = *Fecode++ - ((Fop < OP_STARI)? OP_STAR : OP_STARI);
1119 Lmin = rep_min[fc];
1120 Lmax = rep_max[fc];
1121 reptype = rep_typ[fc];
1122
1123 /* Common code for all repeated single-character matches. We first check
1124 for the minimum number of characters. If the minimum equals the maximum, we
1125 are done. Otherwise, if minimizing, check the rest of the pattern for a
1126 match; if there isn't one, advance up to the maximum, one character at a
1127 time.
1128
1129 If maximizing, advance up to the maximum number of matching characters,
1130 until Feptr is past the end of the maximum run. If possessive, we are
1131 then done (no backing up). Otherwise, match at this position; anything
1132 other than no match is immediately returned. For nomatch, back up one
1133 character, unless we are matching \R and the last thing matched was
1134 \r\n, in which case, back up two code units until we reach the first
1135 optional character position.
1136
1137 The various UTF/non-UTF and caseful/caseless cases are handled separately,
1138 for speed. */
1139
1140 REPEATCHAR:
1141 #ifdef SUPPORT_UNICODE
1142 if (utf)
1143 {
1144 Flength = 1;
1145 Lcharptr = Fecode;
1146 GETCHARLEN(fc, Fecode, Flength);
1147 Fecode += Flength;
1148
1149 /* Handle multi-code-unit character matching, caseful and caseless. */
1150
1151 if (Flength > 1)
1152 {
1153 uint32_t othercase;
1154
1155 if (Fop >= OP_STARI && /* Caseless */
1156 (othercase = UCD_OTHERCASE(fc)) != fc)
1157 Loclength = PRIV(ord2utf)(othercase, Foccu);
1158 else Loclength = 0;
1159
1160 for (i = 1; i <= Lmin; i++)
1161 {
1162 if (Feptr <= mb->end_subject - Flength &&
1163 memcmp(Feptr, Lcharptr, CU2BYTES(Flength)) == 0) Feptr += Flength;
1164 else if (Loclength > 0 &&
1165 Feptr <= mb->end_subject - Loclength &&
1166 memcmp(Feptr, Foccu, CU2BYTES(Loclength)) == 0)
1167 Feptr += Loclength;
1168 else
1169 {
1170 CHECK_PARTIAL();
1171 RRETURN(MATCH_NOMATCH);
1172 }
1173 }
1174
1175 if (Lmin == Lmax) continue;
1176
1177 if (reptype == REPTYPE_MIN)
1178 {
1179 for (;;)
1180 {
1181 RMATCH(Fecode, RM202);
1182 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1183 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
1184 if (Feptr <= mb->end_subject - Flength &&
1185 memcmp(Feptr, Lcharptr, CU2BYTES(Flength)) == 0) Feptr += Flength;
1186 else if (Loclength > 0 &&
1187 Feptr <= mb->end_subject - Loclength &&
1188 memcmp(Feptr, Foccu, CU2BYTES(Loclength)) == 0)
1189 Feptr += Loclength;
1190 else
1191 {
1192 CHECK_PARTIAL();
1193 RRETURN(MATCH_NOMATCH);
1194 }
1195 }
1196 /* Control never gets here */
1197 }
1198
1199 else /* Maximize */
1200 {
1201 Lstart_eptr = Feptr;
1202 for (i = Lmin; i < Lmax; i++)
1203 {
1204 if (Feptr <= mb->end_subject - Flength &&
1205 memcmp(Feptr, Lcharptr, CU2BYTES(Flength)) == 0)
1206 Feptr += Flength;
1207 else if (Loclength > 0 &&
1208 Feptr <= mb->end_subject - Loclength &&
1209 memcmp(Feptr, Foccu, CU2BYTES(Loclength)) == 0)
1210 Feptr += Loclength;
1211 else
1212 {
1213 CHECK_PARTIAL();
1214 break;
1215 }
1216 }
1217
1218 /* After \C in UTF mode, Lstart_eptr might be in the middle of a
1219 Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't
1220 go too far. */
1221
1222 if (reptype != REPTYPE_POS) for(;;)
1223 {
1224 if (Feptr <= Lstart_eptr) break;
1225 RMATCH(Fecode, RM203);
1226 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1227 Feptr--;
1228 BACKCHAR(Feptr);
1229 }
1230 }
1231 break; /* End of repeated wide character handling */
1232 }
1233
1234 /* Length of UTF character is 1. Put it into the preserved variable and
1235 fall through to the non-UTF code. */
1236
1237 Lc = fc;
1238 }
1239 else
1240 #endif /* SUPPORT_UNICODE */
1241
1242 /* When not in UTF mode, load a single-code-unit character. Then proceed as
1243 above. */
1244
1245 Lc = *Fecode++;
1246
1247 /* Caseless comparison */
1248
1249 if (Fop >= OP_STARI)
1250 {
1251 #if PCRE2_CODE_UNIT_WIDTH == 8
1252 /* Lc must be < 128 in UTF-8 mode. */
1253 Loc = mb->fcc[Lc];
1254 #else /* 16-bit & 32-bit */
1255 #ifdef SUPPORT_UNICODE
1256 if (utf && Lc > 127) Loc = UCD_OTHERCASE(Lc);
1257 else
1258 #endif /* SUPPORT_UNICODE */
1259 Loc = TABLE_GET(Lc, mb->fcc, Lc);
1260 #endif /* PCRE2_CODE_UNIT_WIDTH == 8 */
1261
1262 for (i = 1; i <= Lmin; i++)
1263 {
1264 uint32_t cc; /* Faster than PCRE2_UCHAR */
1265 if (Feptr >= mb->end_subject)
1266 {
1267 SCHECK_PARTIAL();
1268 RRETURN(MATCH_NOMATCH);
1269 }
1270 cc = UCHAR21TEST(Feptr);
1271 if (Lc != cc && Loc != cc) RRETURN(MATCH_NOMATCH);
1272 Feptr++;
1273 }
1274 if (Lmin == Lmax) continue;
1275
1276 if (reptype == REPTYPE_MIN)
1277 {
1278 for (;;)
1279 {
1280 uint32_t cc; /* Faster than PCRE2_UCHAR */
1281 RMATCH(Fecode, RM25);
1282 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1283 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
1284 if (Feptr >= mb->end_subject)
1285 {
1286 SCHECK_PARTIAL();
1287 RRETURN(MATCH_NOMATCH);
1288 }
1289 cc = UCHAR21TEST(Feptr);
1290 if (Lc != cc && Loc != cc) RRETURN(MATCH_NOMATCH);
1291 Feptr++;
1292 }
1293 /* Control never gets here */
1294 }
1295
1296 else /* Maximize */
1297 {
1298 Lstart_eptr = Feptr;
1299 for (i = Lmin; i < Lmax; i++)
1300 {
1301 uint32_t cc; /* Faster than PCRE2_UCHAR */
1302 if (Feptr >= mb->end_subject)
1303 {
1304 SCHECK_PARTIAL();
1305 break;
1306 }
1307 cc = UCHAR21TEST(Feptr);
1308 if (Lc != cc && Loc != cc) break;
1309 Feptr++;
1310 }
1311 if (reptype != REPTYPE_POS) for (;;)
1312 {
1313 if (Feptr == Lstart_eptr) break;
1314 RMATCH(Fecode, RM26);
1315 Feptr--;
1316 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1317 }
1318 }
1319 }
1320
1321 /* Caseful comparisons (includes all multi-byte characters) */
1322
1323 else
1324 {
1325 for (i = 1; i <= Lmin; i++)
1326 {
1327 if (Feptr >= mb->end_subject)
1328 {
1329 SCHECK_PARTIAL();
1330 RRETURN(MATCH_NOMATCH);
1331 }
1332 if (Lc != UCHAR21INCTEST(Feptr)) RRETURN(MATCH_NOMATCH);
1333 }
1334
1335 if (Lmin == Lmax) continue;
1336
1337 if (reptype == REPTYPE_MIN)
1338 {
1339 for (;;)
1340 {
1341 RMATCH(Fecode, RM27);
1342 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1343 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
1344 if (Feptr >= mb->end_subject)
1345 {
1346 SCHECK_PARTIAL();
1347 RRETURN(MATCH_NOMATCH);
1348 }
1349 if (Lc != UCHAR21INCTEST(Feptr)) RRETURN(MATCH_NOMATCH);
1350 }
1351 /* Control never gets here */
1352 }
1353 else /* Maximize */
1354 {
1355 Lstart_eptr = Feptr;
1356 for (i = Lmin; i < Lmax; i++)
1357 {
1358 if (Feptr >= mb->end_subject)
1359 {
1360 SCHECK_PARTIAL();
1361 break;
1362 }
1363
1364 if (Lc != UCHAR21TEST(Feptr)) break;
1365 Feptr++;
1366 }
1367
1368 if (reptype != REPTYPE_POS) for (;;)
1369 {
1370 if (Feptr <= Lstart_eptr) break;
1371 RMATCH(Fecode, RM28);
1372 Feptr--;
1373 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1374 }
1375 }
1376 }
1377 break;
1378
1379 #undef Loclength
1380 #undef Lstart_eptr
1381 #undef Lcharptr
1382 #undef Lmin
1383 #undef Lmax
1384 #undef Lc
1385 #undef Loc
1386
1387
1388 /* ===================================================================== */
1389 /* Match a negated single one-byte character repeatedly. This is almost a
1390 repeat of the code for a repeated single character, but I haven't found a
1391 nice way of commoning these up that doesn't require a test of the
1392 positive/negative option for each character match. Maybe that wouldn't add
1393 very much to the time taken, but character matching *is* what this is all
1394 about... */
1395
1396 #define Lstart_eptr F->temp_sptr[0]
1397 #define Lmin F->temp_32[0]
1398 #define Lmax F->temp_32[1]
1399 #define Lc F->temp_32[2]
1400 #define Loc F->temp_32[3]
1401
1402 case OP_NOTEXACT:
1403 case OP_NOTEXACTI:
1404 Lmin = Lmax = GET2(Fecode, 1);
1405 Fecode += 1 + IMM2_SIZE;
1406 goto REPEATNOTCHAR;
1407
1408 case OP_NOTUPTO:
1409 case OP_NOTUPTOI:
1410 Lmin = 0;
1411 Lmax = GET2(Fecode, 1);
1412 reptype = REPTYPE_MAX;
1413 Fecode += 1 + IMM2_SIZE;
1414 goto REPEATNOTCHAR;
1415
1416 case OP_NOTMINUPTO:
1417 case OP_NOTMINUPTOI:
1418 Lmin = 0;
1419 Lmax = GET2(Fecode, 1);
1420 reptype = REPTYPE_MIN;
1421 Fecode += 1 + IMM2_SIZE;
1422 goto REPEATNOTCHAR;
1423
1424 case OP_NOTPOSSTAR:
1425 case OP_NOTPOSSTARI:
1426 reptype = REPTYPE_POS;
1427 Lmin = 0;
1428 Lmax = UINT32_MAX;
1429 Fecode++;
1430 goto REPEATNOTCHAR;
1431
1432 case OP_NOTPOSPLUS:
1433 case OP_NOTPOSPLUSI:
1434 reptype = REPTYPE_POS;
1435 Lmin = 1;
1436 Lmax = UINT32_MAX;
1437 Fecode++;
1438 goto REPEATNOTCHAR;
1439
1440 case OP_NOTPOSQUERY:
1441 case OP_NOTPOSQUERYI:
1442 reptype = REPTYPE_POS;
1443 Lmin = 0;
1444 Lmax = 1;
1445 Fecode++;
1446 goto REPEATNOTCHAR;
1447
1448 case OP_NOTPOSUPTO:
1449 case OP_NOTPOSUPTOI:
1450 reptype = REPTYPE_POS;
1451 Lmin = 0;
1452 Lmax = GET2(Fecode, 1);
1453 Fecode += 1 + IMM2_SIZE;
1454 goto REPEATNOTCHAR;
1455
1456 case OP_NOTSTAR:
1457 case OP_NOTSTARI:
1458 case OP_NOTMINSTAR:
1459 case OP_NOTMINSTARI:
1460 case OP_NOTPLUS:
1461 case OP_NOTPLUSI:
1462 case OP_NOTMINPLUS:
1463 case OP_NOTMINPLUSI:
1464 case OP_NOTQUERY:
1465 case OP_NOTQUERYI:
1466 case OP_NOTMINQUERY:
1467 case OP_NOTMINQUERYI:
1468 fc = *Fecode++ - ((Fop >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
1469 Lmin = rep_min[fc];
1470 Lmax = rep_max[fc];
1471 reptype = rep_typ[fc];
1472
1473 /* Common code for all repeated single-character non-matches. */
1474
1475 REPEATNOTCHAR:
1476 GETCHARINCTEST(Lc, Fecode);
1477
1478 /* The code is duplicated for the caseless and caseful cases, for speed,
1479 since matching characters is likely to be quite common. First, ensure the
1480 minimum number of matches are present. If Lmin = Lmax, we are done.
1481 Otherwise, if minimizing, keep trying the rest of the expression and
1482 advancing one matching character if failing, up to the maximum.
1483 Alternatively, if maximizing, find the maximum number of characters and
1484 work backwards. */
1485
1486 if (Fop >= OP_NOTSTARI) /* Caseless */
1487 {
1488 #ifdef SUPPORT_UNICODE
1489 if (utf && Lc > 127)
1490 Loc = UCD_OTHERCASE(Lc);
1491 else
1492 #endif /* SUPPORT_UNICODE */
1493
1494 Loc = TABLE_GET(Lc, mb->fcc, Lc); /* Other case from table */
1495
1496 #ifdef SUPPORT_UNICODE
1497 if (utf)
1498 {
1499 uint32_t d;
1500 for (i = 1; i <= Lmin; i++)
1501 {
1502 if (Feptr >= mb->end_subject)
1503 {
1504 SCHECK_PARTIAL();
1505 RRETURN(MATCH_NOMATCH);
1506 }
1507 GETCHARINC(d, Feptr);
1508 if (Lc == d || Loc == d) RRETURN(MATCH_NOMATCH);
1509 }
1510 }
1511 else
1512 #endif /* SUPPORT_UNICODE */
1513
1514 /* Not UTF mode */
1515 {
1516 for (i = 1; i <= Lmin; i++)
1517 {
1518 if (Feptr >= mb->end_subject)
1519 {
1520 SCHECK_PARTIAL();
1521 RRETURN(MATCH_NOMATCH);
1522 }
1523 if (Lc == *Feptr || Loc == *Feptr) RRETURN(MATCH_NOMATCH);
1524 Feptr++;
1525 }
1526 }
1527
1528 if (Lmin == Lmax) continue; /* Finished for exact count */
1529
1530 if (reptype == REPTYPE_MIN)
1531 {
1532 #ifdef SUPPORT_UNICODE
1533 if (utf)
1534 {
1535 uint32_t d;
1536 for (;;)
1537 {
1538 RMATCH(Fecode, RM204);
1539 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1540 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
1541 if (Feptr >= mb->end_subject)
1542 {
1543 SCHECK_PARTIAL();
1544 RRETURN(MATCH_NOMATCH);
1545 }
1546 GETCHARINC(d, Feptr);
1547 if (Lc == d || Loc == d) RRETURN(MATCH_NOMATCH);
1548 }
1549 }
1550 else
1551 #endif /*SUPPORT_UNICODE */
1552
1553 /* Not UTF mode */
1554 {
1555 for (;;)
1556 {
1557 RMATCH(Fecode, RM29);
1558 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1559 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
1560 if (Feptr >= mb->end_subject)
1561 {
1562 SCHECK_PARTIAL();
1563 RRETURN(MATCH_NOMATCH);
1564 }
1565 if (Lc == *Feptr || Loc == *Feptr) RRETURN(MATCH_NOMATCH);
1566 Feptr++;
1567 }
1568 }
1569 /* Control never gets here */
1570 }
1571
1572 /* Maximize case */
1573
1574 else
1575 {
1576 Lstart_eptr = Feptr;
1577
1578 #ifdef SUPPORT_UNICODE
1579 if (utf)
1580 {
1581 uint32_t d;
1582 for (i = Lmin; i < Lmax; i++)
1583 {
1584 int len = 1;
1585 if (Feptr >= mb->end_subject)
1586 {
1587 SCHECK_PARTIAL();
1588 break;
1589 }
1590 GETCHARLEN(d, Feptr, len);
1591 if (Lc == d || Loc == d) break;
1592 Feptr += len;
1593 }
1594
1595 /* After \C in UTF mode, Lstart_eptr might be in the middle of a
1596 Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't
1597 go too far. */
1598
1599 if (reptype != REPTYPE_POS) for(;;)
1600 {
1601 if (Feptr <= Lstart_eptr) break;
1602 RMATCH(Fecode, RM205);
1603 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1604 Feptr--;
1605 BACKCHAR(Feptr);
1606 }
1607 }
1608 else
1609 #endif /* SUPPORT_UNICODE */
1610
1611 /* Not UTF mode */
1612 {
1613 for (i = Lmin; i < Lmax; i++)
1614 {
1615 if (Feptr >= mb->end_subject)
1616 {
1617 SCHECK_PARTIAL();
1618 break;
1619 }
1620 if (Lc == *Feptr || Loc == *Feptr) break;
1621 Feptr++;
1622 }
1623 if (reptype != REPTYPE_POS) for (;;)
1624 {
1625 if (Feptr == Lstart_eptr) break;
1626 RMATCH(Fecode, RM30);
1627 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1628 Feptr--;
1629 }
1630 }
1631 }
1632 }
1633
1634 /* Caseful comparisons */
1635
1636 else
1637 {
1638 #ifdef SUPPORT_UNICODE
1639 if (utf)
1640 {
1641 uint32_t d;
1642 for (i = 1; i <= Lmin; i++)
1643 {
1644 if (Feptr >= mb->end_subject)
1645 {
1646 SCHECK_PARTIAL();
1647 RRETURN(MATCH_NOMATCH);
1648 }
1649 GETCHARINC(d, Feptr);
1650 if (Lc == d) RRETURN(MATCH_NOMATCH);
1651 }
1652 }
1653 else
1654 #endif
1655 /* Not UTF mode */
1656 {
1657 for (i = 1; i <= Lmin; i++)
1658 {
1659 if (Feptr >= mb->end_subject)
1660 {
1661 SCHECK_PARTIAL();
1662 RRETURN(MATCH_NOMATCH);
1663 }
1664 if (Lc == *Feptr++) RRETURN(MATCH_NOMATCH);
1665 }
1666 }
1667
1668 if (Lmin == Lmax) continue;
1669
1670 if (reptype == REPTYPE_MIN)
1671 {
1672 #ifdef SUPPORT_UNICODE
1673 if (utf)
1674 {
1675 uint32_t d;
1676 for (;;)
1677 {
1678 RMATCH(Fecode, RM206);
1679 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1680 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
1681 if (Feptr >= mb->end_subject)
1682 {
1683 SCHECK_PARTIAL();
1684 RRETURN(MATCH_NOMATCH);
1685 }
1686 GETCHARINC(d, Feptr);
1687 if (Lc == d) RRETURN(MATCH_NOMATCH);
1688 }
1689 }
1690 else
1691 #endif
1692 /* Not UTF mode */
1693 {
1694 for (;;)
1695 {
1696 RMATCH(Fecode, RM31);
1697 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1698 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
1699 if (Feptr >= mb->end_subject)
1700 {
1701 SCHECK_PARTIAL();
1702 RRETURN(MATCH_NOMATCH);
1703 }
1704 if (Lc == *Feptr++) RRETURN(MATCH_NOMATCH);
1705 }
1706 }
1707 /* Control never gets here */
1708 }
1709
1710 /* Maximize case */
1711
1712 else
1713 {
1714 Lstart_eptr = Feptr;
1715
1716 #ifdef SUPPORT_UNICODE
1717 if (utf)
1718 {
1719 uint32_t d;
1720 for (i = Lmin; i < Lmax; i++)
1721 {
1722 int len = 1;
1723 if (Feptr >= mb->end_subject)
1724 {
1725 SCHECK_PARTIAL();
1726 break;
1727 }
1728 GETCHARLEN(d, Feptr, len);
1729 if (Lc == d) break;
1730 Feptr += len;
1731 }
1732
1733 /* After \C in UTF mode, Lstart_eptr might be in the middle of a
1734 Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't
1735 go too far. */
1736
1737 if (reptype != REPTYPE_POS) for(;;)
1738 {
1739 if (Feptr <= Lstart_eptr) break;
1740 RMATCH(Fecode, RM207);
1741 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1742 Feptr--;
1743 BACKCHAR(Feptr);
1744 }
1745 }
1746 else
1747 #endif
1748 /* Not UTF mode */
1749 {
1750 for (i = Lmin; i < Lmax; i++)
1751 {
1752 if (Feptr >= mb->end_subject)
1753 {
1754 SCHECK_PARTIAL();
1755 break;
1756 }
1757 if (Lc == *Feptr) break;
1758 Feptr++;
1759 }
1760 if (reptype != REPTYPE_POS) for (;;)
1761 {
1762 if (Feptr == Lstart_eptr) break;
1763 RMATCH(Fecode, RM32);
1764 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1765 Feptr--;
1766 }
1767 }
1768 }
1769 }
1770 break;
1771
1772 #undef Lstart_eptr
1773 #undef Lmin
1774 #undef Lmax
1775 #undef Lc
1776 #undef Loc
1777
1778
1779 /* ===================================================================== */
1780 /* Match a bit-mapped character class, possibly repeatedly. These opcodes
1781 are used when all the characters in the class have values in the range
1782 0-255, and either the matching is caseful, or the characters are in the
1783 range 0-127 when UTF processing is enabled. The only difference between
1784 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
1785 encountered. */
1786
1787 #define Lmin F->temp_32[0]
1788 #define Lmax F->temp_32[1]
1789 #define Lstart_eptr F->temp_sptr[0]
1790 #define Lbyte_map_address F->temp_sptr[1]
1791 #define Lbyte_map ((unsigned char *)Lbyte_map_address)
1792
1793 case OP_NCLASS:
1794 case OP_CLASS:
1795 {
1796 Lbyte_map_address = Fecode + 1; /* Save for matching */
1797 Fecode += 1 + (32 / sizeof(PCRE2_UCHAR)); /* Advance past the item */
1798
1799 /* Look past the end of the item to see if there is repeat information
1800 following. Then obey similar code to character type repeats. */
1801
1802 switch (*Fecode)
1803 {
1804 case OP_CRSTAR:
1805 case OP_CRMINSTAR:
1806 case OP_CRPLUS:
1807 case OP_CRMINPLUS:
1808 case OP_CRQUERY:
1809 case OP_CRMINQUERY:
1810 case OP_CRPOSSTAR:
1811 case OP_CRPOSPLUS:
1812 case OP_CRPOSQUERY:
1813 fc = *Fecode++ - OP_CRSTAR;
1814 Lmin = rep_min[fc];
1815 Lmax = rep_max[fc];
1816 reptype = rep_typ[fc];
1817 break;
1818
1819 case OP_CRRANGE:
1820 case OP_CRMINRANGE:
1821 case OP_CRPOSRANGE:
1822 Lmin = GET2(Fecode, 1);
1823 Lmax = GET2(Fecode, 1 + IMM2_SIZE);
1824 if (Lmax == 0) Lmax = UINT32_MAX; /* Max 0 => infinity */
1825 reptype = rep_typ[*Fecode - OP_CRSTAR];
1826 Fecode += 1 + 2 * IMM2_SIZE;
1827 break;
1828
1829 default: /* No repeat follows */
1830 Lmin = Lmax = 1;
1831 break;
1832 }
1833
1834 /* First, ensure the minimum number of matches are present. */
1835
1836 #ifdef SUPPORT_UNICODE
1837 if (utf)
1838 {
1839 for (i = 1; i <= Lmin; i++)
1840 {
1841 if (Feptr >= mb->end_subject)
1842 {
1843 SCHECK_PARTIAL();
1844 RRETURN(MATCH_NOMATCH);
1845 }
1846 GETCHARINC(fc, Feptr);
1847 if (fc > 255)
1848 {
1849 if (Fop == OP_CLASS) RRETURN(MATCH_NOMATCH);
1850 }
1851 else
1852 if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) RRETURN(MATCH_NOMATCH);
1853 }
1854 }
1855 else
1856 #endif
1857 /* Not UTF mode */
1858 {
1859 for (i = 1; i <= Lmin; i++)
1860 {
1861 if (Feptr >= mb->end_subject)
1862 {
1863 SCHECK_PARTIAL();
1864 RRETURN(MATCH_NOMATCH);
1865 }
1866 fc = *Feptr++;
1867 #if PCRE2_CODE_UNIT_WIDTH != 8
1868 if (fc > 255)
1869 {
1870 if (Fop == OP_CLASS) RRETURN(MATCH_NOMATCH);
1871 }
1872 else
1873 #endif
1874 if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) RRETURN(MATCH_NOMATCH);
1875 }
1876 }
1877
1878 /* If Lmax == Lmin we are done. Continue with main loop. */
1879
1880 if (Lmin == Lmax) continue;
1881
1882 /* If minimizing, keep testing the rest of the expression and advancing
1883 the pointer while it matches the class. */
1884
1885 if (reptype == REPTYPE_MIN)
1886 {
1887 #ifdef SUPPORT_UNICODE
1888 if (utf)
1889 {
1890 for (;;)
1891 {
1892 RMATCH(Fecode, RM200);
1893 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1894 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
1895 if (Feptr >= mb->end_subject)
1896 {
1897 SCHECK_PARTIAL();
1898 RRETURN(MATCH_NOMATCH);
1899 }
1900 GETCHARINC(fc, Feptr);
1901 if (fc > 255)
1902 {
1903 if (Fop == OP_CLASS) RRETURN(MATCH_NOMATCH);
1904 }
1905 else
1906 if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) RRETURN(MATCH_NOMATCH);
1907 }
1908 }
1909 else
1910 #endif
1911 /* Not UTF mode */
1912 {
1913 for (;;)
1914 {
1915 RMATCH(Fecode, RM23);
1916 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1917 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
1918 if (Feptr >= mb->end_subject)
1919 {
1920 SCHECK_PARTIAL();
1921 RRETURN(MATCH_NOMATCH);
1922 }
1923 fc = *Feptr++;
1924 #if PCRE2_CODE_UNIT_WIDTH != 8
1925 if (fc > 255)
1926 {
1927 if (Fop == OP_CLASS) RRETURN(MATCH_NOMATCH);
1928 }
1929 else
1930 #endif
1931 if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) RRETURN(MATCH_NOMATCH);
1932 }
1933 }
1934 /* Control never gets here */
1935 }
1936
1937 /* If maximizing, find the longest possible run, then work backwards. */
1938
1939 else
1940 {
1941 Lstart_eptr = Feptr;
1942
1943 #ifdef SUPPORT_UNICODE
1944 if (utf)
1945 {
1946 for (i = Lmin; i < Lmax; i++)
1947 {
1948 int len = 1;
1949 if (Feptr >= mb->end_subject)
1950 {
1951 SCHECK_PARTIAL();
1952 break;
1953 }
1954 GETCHARLEN(fc, Feptr, len);
1955 if (fc > 255)
1956 {
1957 if (Fop == OP_CLASS) break;
1958 }
1959 else
1960 if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) break;
1961 Feptr += len;
1962 }
1963
1964 if (reptype == REPTYPE_POS) continue; /* No backtracking */
1965
1966 /* After \C in UTF mode, Lstart_eptr might be in the middle of a
1967 Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't
1968 go too far. */
1969
1970 for (;;)
1971 {
1972 RMATCH(Fecode, RM201);
1973 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1974 if (Feptr-- <= Lstart_eptr) break; /* Tried at original position */
1975 BACKCHAR(Feptr);
1976 }
1977 }
1978 else
1979 #endif
1980 /* Not UTF mode */
1981 {
1982 for (i = Lmin; i < Lmax; i++)
1983 {
1984 if (Feptr >= mb->end_subject)
1985 {
1986 SCHECK_PARTIAL();
1987 break;
1988 }
1989 fc = *Feptr;
1990 #if PCRE2_CODE_UNIT_WIDTH != 8
1991 if (fc > 255)
1992 {
1993 if (Fop == OP_CLASS) break;
1994 }
1995 else
1996 #endif
1997 if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) break;
1998 Feptr++;
1999 }
2000
2001 if (reptype == REPTYPE_POS) continue; /* No backtracking */
2002
2003 while (Feptr >= Lstart_eptr)
2004 {
2005 RMATCH(Fecode, RM24);
2006 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2007 Feptr--;
2008 }
2009 }
2010
2011 RRETURN(MATCH_NOMATCH);
2012 }
2013 }
2014 /* Control never gets here */
2015
2016 #undef Lbyte_map_address
2017 #undef Lbyte_map
2018 #undef Lstart_eptr
2019 #undef Lmin
2020 #undef Lmax
2021
2022
2023 /* ===================================================================== */
2024 /* Match an extended character class. In the 8-bit library, this opcode is
2025 encountered only when UTF-8 mode is supported. In the 16-bit and
2026 32-bit libraries, codepoints greater than 255 may be encountered even when
2027 UTF is not supported. */
2028
2029 #define Lstart_eptr F->temp_sptr[0]
2030 #define Lxclass_data F->temp_sptr[1]
2031 #define Lmin F->temp_32[0]
2032 #define Lmax F->temp_32[1]
2033
2034 #ifdef SUPPORT_WIDE_CHARS
2035 case OP_XCLASS:
2036 {
2037 Lxclass_data = Fecode + 1 + LINK_SIZE; /* Save for matching */
2038 Fecode += GET(Fecode, 1); /* Advance past the item */
2039
2040 switch (*Fecode)
2041 {
2042 case OP_CRSTAR:
2043 case OP_CRMINSTAR:
2044 case OP_CRPLUS:
2045 case OP_CRMINPLUS:
2046 case OP_CRQUERY:
2047 case OP_CRMINQUERY:
2048 case OP_CRPOSSTAR:
2049 case OP_CRPOSPLUS:
2050 case OP_CRPOSQUERY:
2051 fc = *Fecode++ - OP_CRSTAR;
2052 Lmin = rep_min[fc];
2053 Lmax = rep_max[fc];
2054 reptype = rep_typ[fc];
2055 break;
2056
2057 case OP_CRRANGE:
2058 case OP_CRMINRANGE:
2059 case OP_CRPOSRANGE:
2060 Lmin = GET2(Fecode, 1);
2061 Lmax = GET2(Fecode, 1 + IMM2_SIZE);
2062 if (Lmax == 0) Lmax = UINT32_MAX; /* Max 0 => infinity */
2063 reptype = rep_typ[*Fecode - OP_CRSTAR];
2064 Fecode += 1 + 2 * IMM2_SIZE;
2065 break;
2066
2067 default: /* No repeat follows */
2068 Lmin = Lmax = 1;
2069 break;
2070 }
2071
2072 /* First, ensure the minimum number of matches are present. */
2073
2074 for (i = 1; i <= Lmin; i++)
2075 {
2076 if (Feptr >= mb->end_subject)
2077 {
2078 SCHECK_PARTIAL();
2079 RRETURN(MATCH_NOMATCH);
2080 }
2081 GETCHARINCTEST(fc, Feptr);
2082 if (!PRIV(xclass)(fc, Lxclass_data, utf)) RRETURN(MATCH_NOMATCH);
2083 }
2084
2085 /* If Lmax == Lmin we can just continue with the main loop. */
2086
2087 if (Lmin == Lmax) continue;
2088
2089 /* If minimizing, keep testing the rest of the expression and advancing
2090 the pointer while it matches the class. */
2091
2092 if (reptype == REPTYPE_MIN)
2093 {
2094 for (;;)
2095 {
2096 RMATCH(Fecode, RM100);
2097 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2098 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
2099 if (Feptr >= mb->end_subject)
2100 {
2101 SCHECK_PARTIAL();
2102 RRETURN(MATCH_NOMATCH);
2103 }
2104 GETCHARINCTEST(fc, Feptr);
2105 if (!PRIV(xclass)(fc, Lxclass_data, utf)) RRETURN(MATCH_NOMATCH);
2106 }
2107 /* Control never gets here */
2108 }
2109
2110 /* If maximizing, find the longest possible run, then work backwards. */
2111
2112 else
2113 {
2114 Lstart_eptr = Feptr;
2115 for (i = Lmin; i < Lmax; i++)
2116 {
2117 int len = 1;
2118 if (Feptr >= mb->end_subject)
2119 {
2120 SCHECK_PARTIAL();
2121 break;
2122 }
2123 #ifdef SUPPORT_UNICODE
2124 GETCHARLENTEST(fc, Feptr, len);
2125 #else
2126 fc = *Feptr;
2127 #endif
2128 if (!PRIV(xclass)(fc, Lxclass_data, utf)) break;
2129 Feptr += len;
2130 }
2131
2132 if (reptype == REPTYPE_POS) continue; /* No backtracking */
2133
2134 /* After \C in UTF mode, Lstart_eptr might be in the middle of a
2135 Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't
2136 go too far. */
2137
2138 for(;;)
2139 {
2140 RMATCH(Fecode, RM101);
2141 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2142 if (Feptr-- <= Lstart_eptr) break; /* Tried at original position */
2143 #ifdef SUPPORT_UNICODE
2144 if (utf) BACKCHAR(Feptr);
2145 #endif
2146 }
2147 RRETURN(MATCH_NOMATCH);
2148 }
2149
2150 /* Control never gets here */
2151 }
2152 #endif /* SUPPORT_WIDE_CHARS: end of XCLASS */
2153
2154 #undef Lstart_eptr
2155 #undef Lxclass_data
2156 #undef Lmin
2157 #undef Lmax
2158
2159
2160 /* ===================================================================== */
2161 /* Match various character types when PCRE2_UCP is not set. These opcodes
2162 are not generated when PCRE2_UCP is set - instead appropriate property
2163 tests are compiled. */
2164
2165 case OP_NOT_DIGIT:
2166 if (Feptr >= mb->end_subject)
2167 {
2168 SCHECK_PARTIAL();
2169 RRETURN(MATCH_NOMATCH);
2170 }
2171 GETCHARINCTEST(fc, Feptr);
2172 if (CHMAX_255(fc) && (mb->ctypes[fc] & ctype_digit) != 0)
2173 RRETURN(MATCH_NOMATCH);
2174 Fecode++;
2175 break;
2176
2177 case OP_DIGIT:
2178 if (Feptr >= mb->end_subject)
2179 {
2180 SCHECK_PARTIAL();
2181 RRETURN(MATCH_NOMATCH);
2182 }
2183 GETCHARINCTEST(fc, Feptr);
2184 if (!CHMAX_255(fc) || (mb->ctypes[fc] & ctype_digit) == 0)
2185 RRETURN(MATCH_NOMATCH);
2186 Fecode++;
2187 break;
2188
2189 case OP_NOT_WHITESPACE:
2190 if (Feptr >= mb->end_subject)
2191 {
2192 SCHECK_PARTIAL();
2193 RRETURN(MATCH_NOMATCH);
2194 }
2195 GETCHARINCTEST(fc, Feptr);
2196 if (CHMAX_255(fc) && (mb->ctypes[fc] & ctype_space) != 0)
2197 RRETURN(MATCH_NOMATCH);
2198 Fecode++;
2199 break;
2200
2201 case OP_WHITESPACE:
2202 if (Feptr >= mb->end_subject)
2203 {
2204 SCHECK_PARTIAL();
2205 RRETURN(MATCH_NOMATCH);
2206 }
2207 GETCHARINCTEST(fc, Feptr);
2208 if (!CHMAX_255(fc) || (mb->ctypes[fc] & ctype_space) == 0)
2209 RRETURN(MATCH_NOMATCH);
2210 Fecode++;
2211 break;
2212
2213 case OP_NOT_WORDCHAR:
2214 if (Feptr >= mb->end_subject)
2215 {
2216 SCHECK_PARTIAL();
2217 RRETURN(MATCH_NOMATCH);
2218 }
2219 GETCHARINCTEST(fc, Feptr);
2220 if (CHMAX_255(fc) && (mb->ctypes[fc] & ctype_word) != 0)
2221 RRETURN(MATCH_NOMATCH);
2222 Fecode++;
2223 break;
2224
2225 case OP_WORDCHAR:
2226 if (Feptr >= mb->end_subject)
2227 {
2228 SCHECK_PARTIAL();
2229 RRETURN(MATCH_NOMATCH);
2230 }
2231 GETCHARINCTEST(fc, Feptr);
2232 if (!CHMAX_255(fc) || (mb->ctypes[fc] & ctype_word) == 0)
2233 RRETURN(MATCH_NOMATCH);
2234 Fecode++;
2235 break;
2236
2237 case OP_ANYNL:
2238 if (Feptr >= mb->end_subject)
2239 {
2240 SCHECK_PARTIAL();
2241 RRETURN(MATCH_NOMATCH);
2242 }
2243 GETCHARINCTEST(fc, Feptr);
2244 switch(fc)
2245 {
2246 default: RRETURN(MATCH_NOMATCH);
2247
2248 case CHAR_CR:
2249 if (Feptr >= mb->end_subject)
2250 {
2251 SCHECK_PARTIAL();
2252 }
2253 else if (UCHAR21TEST(Feptr) == CHAR_LF) Feptr++;
2254 break;
2255
2256 case CHAR_LF:
2257 break;
2258
2259 case CHAR_VT:
2260 case CHAR_FF:
2261 case CHAR_NEL:
2262 #ifndef EBCDIC
2263 case 0x2028:
2264 case 0x2029:
2265 #endif /* Not EBCDIC */
2266 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) RRETURN(MATCH_NOMATCH);
2267 break;
2268 }
2269 Fecode++;
2270 break;
2271
2272 case OP_NOT_HSPACE:
2273 if (Feptr >= mb->end_subject)
2274 {
2275 SCHECK_PARTIAL();
2276 RRETURN(MATCH_NOMATCH);
2277 }
2278 GETCHARINCTEST(fc, Feptr);
2279 switch(fc)
2280 {
2281 HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */
2282 default: break;
2283 }
2284 Fecode++;
2285 break;
2286
2287 case OP_HSPACE:
2288 if (Feptr >= mb->end_subject)
2289 {
2290 SCHECK_PARTIAL();
2291 RRETURN(MATCH_NOMATCH);
2292 }
2293 GETCHARINCTEST(fc, Feptr);
2294 switch(fc)
2295 {
2296 HSPACE_CASES: break; /* Byte and multibyte cases */
2297 default: RRETURN(MATCH_NOMATCH);
2298 }
2299 Fecode++;
2300 break;
2301
2302 case OP_NOT_VSPACE:
2303 if (Feptr >= mb->end_subject)
2304 {
2305 SCHECK_PARTIAL();
2306 RRETURN(MATCH_NOMATCH);
2307 }
2308 GETCHARINCTEST(fc, Feptr);
2309 switch(fc)
2310 {
2311 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
2312 default: break;
2313 }
2314 Fecode++;
2315 break;
2316
2317 case OP_VSPACE:
2318 if (Feptr >= mb->end_subject)
2319 {
2320 SCHECK_PARTIAL();
2321 RRETURN(MATCH_NOMATCH);
2322 }
2323 GETCHARINCTEST(fc, Feptr);
2324 switch(fc)
2325 {
2326 VSPACE_CASES: break;
2327 default: RRETURN(MATCH_NOMATCH);
2328 }
2329 Fecode++;
2330 break;
2331
2332
2333 #ifdef SUPPORT_UNICODE
2334
2335 /* ===================================================================== */
2336 /* Check the next character by Unicode property. We will get here only
2337 if the support is in the binary; otherwise a compile-time error occurs. */
2338
2339 case OP_PROP:
2340 case OP_NOTPROP:
2341 if (Feptr >= mb->end_subject)
2342 {
2343 SCHECK_PARTIAL();
2344 RRETURN(MATCH_NOMATCH);
2345 }
2346 GETCHARINCTEST(fc, Feptr);
2347 {
2348 const uint32_t *cp;
2349 const ucd_record *prop = GET_UCD(fc);
2350
2351 switch(Fecode[1])
2352 {
2353 case PT_ANY:
2354 if (Fop == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2355 break;
2356
2357 case PT_LAMP:
2358 if ((prop->chartype == ucp_Lu ||
2359 prop->chartype == ucp_Ll ||
2360 prop->chartype == ucp_Lt) == (Fop == OP_NOTPROP))
2361 RRETURN(MATCH_NOMATCH);
2362 break;
2363
2364 case PT_GC:
2365 if ((Fecode[2] != PRIV(ucp_gentype)[prop->chartype]) == (Fop == OP_PROP))
2366 RRETURN(MATCH_NOMATCH);
2367 break;
2368
2369 case PT_PC:
2370 if ((Fecode[2] != prop->chartype) == (Fop == OP_PROP))
2371 RRETURN(MATCH_NOMATCH);
2372 break;
2373
2374 case PT_SC:
2375 if ((Fecode[2] != prop->script) == (Fop == OP_PROP))
2376 RRETURN(MATCH_NOMATCH);
2377 break;
2378
2379 /* These are specials */
2380
2381 case PT_ALNUM:
2382 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2383 PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (Fop == OP_NOTPROP))
2384 RRETURN(MATCH_NOMATCH);
2385 break;
2386
2387 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
2388 which means that Perl space and POSIX space are now identical. PCRE
2389 was changed at release 8.34. */
2390
2391 case PT_SPACE: /* Perl space */
2392 case PT_PXSPACE: /* POSIX space */
2393 switch(fc)
2394 {
2395 HSPACE_CASES:
2396 VSPACE_CASES:
2397 if (Fop == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2398 break;
2399
2400 default:
2401 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z) ==
2402 (Fop == OP_NOTPROP)) RRETURN(MATCH_NOMATCH);
2403 break;
2404 }
2405 break;
2406
2407 case PT_WORD:
2408 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2409 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2410 fc == CHAR_UNDERSCORE) == (Fop == OP_NOTPROP))
2411 RRETURN(MATCH_NOMATCH);
2412 break;
2413
2414 case PT_CLIST:
2415 cp = PRIV(ucd_caseless_sets) + Fecode[2];
2416 for (;;)
2417 {
2418 if (fc < *cp)
2419 { if (Fop == OP_PROP) { RRETURN(MATCH_NOMATCH); } else break; }
2420 if (fc == *cp++)
2421 { if (Fop == OP_PROP) break; else { RRETURN(MATCH_NOMATCH); } }
2422 }
2423 break;
2424
2425 case PT_UCNC:
2426 if ((fc == CHAR_DOLLAR_SIGN || fc == CHAR_COMMERCIAL_AT ||
2427 fc == CHAR_GRAVE_ACCENT || (fc >= 0xa0 && fc <= 0xd7ff) ||
2428 fc >= 0xe000) == (Fop == OP_NOTPROP))
2429 RRETURN(MATCH_NOMATCH);
2430 break;
2431
2432 /* This should never occur */
2433
2434 default:
2435 return PCRE2_ERROR_INTERNAL;
2436 }
2437
2438 Fecode += 3;
2439 }
2440 break;
2441
2442
2443 /* ===================================================================== */
2444 /* Match an extended Unicode sequence. We will get here only if the support
2445 is in the binary; otherwise a compile-time error occurs. */
2446
2447 case OP_EXTUNI:
2448 if (Feptr >= mb->end_subject)
2449 {
2450 SCHECK_PARTIAL();
2451 RRETURN(MATCH_NOMATCH);
2452 }
2453 else
2454 {
2455 GETCHARINCTEST(fc, Feptr);
2456 Feptr = PRIV(extuni)(fc, Feptr, mb->start_subject, mb->end_subject, utf,
2457 NULL);
2458 }
2459 CHECK_PARTIAL();
2460 Fecode++;
2461 break;
2462
2463 #endif /* SUPPORT_UNICODE */
2464
2465
2466 /* ===================================================================== */
2467 /* Match a single character type repeatedly. Note that the property type
2468 does not need to be in a stack frame as it is not used within an RMATCH()
2469 loop. */
2470
2471 #define Lstart_eptr F->temp_sptr[0] /* Saved subject pointer slot; NOTE(review): presumably the start of the maximizing run, as in the XCLASS code above - not used in the visible part of this section, confirm */
2472 #define Lmin F->temp_32[0] /* Minimum repeat count; the minimizing loops also increment it as each extra character is consumed */
2473 #define Lmax F->temp_32[1] /* Maximum repeat count (UINT32_MAX for the unbounded possessive forms) */
2474 #define Lctype F->temp_32[2] /* Opcode of the character type being repeated, read from the code at REPEATTYPE */
2475 #define Lpropvalue F->temp_32[3] /* Property value operand, set only when Lctype is OP_PROP or OP_NOTPROP */
2476
2477 case OP_TYPEEXACT:
2478 Lmin = Lmax = GET2(Fecode, 1);
2479 Fecode += 1 + IMM2_SIZE;
2480 goto REPEATTYPE;
2481
2482 case OP_TYPEUPTO:
2483 case OP_TYPEMINUPTO:
2484 Lmin = 0;
2485 Lmax = GET2(Fecode, 1);
2486 reptype = (*Fecode == OP_TYPEMINUPTO)? REPTYPE_MIN : REPTYPE_MAX;
2487 Fecode += 1 + IMM2_SIZE;
2488 goto REPEATTYPE;
2489
2490 case OP_TYPEPOSSTAR:
2491 reptype = REPTYPE_POS;
2492 Lmin = 0;
2493 Lmax = UINT32_MAX;
2494 Fecode++;
2495 goto REPEATTYPE;
2496
2497 case OP_TYPEPOSPLUS:
2498 reptype = REPTYPE_POS;
2499 Lmin = 1;
2500 Lmax = UINT32_MAX;
2501 Fecode++;
2502 goto REPEATTYPE;
2503
2504 case OP_TYPEPOSQUERY:
2505 reptype = REPTYPE_POS;
2506 Lmin = 0;
2507 Lmax = 1;
2508 Fecode++;
2509 goto REPEATTYPE;
2510
2511 case OP_TYPEPOSUPTO:
2512 reptype = REPTYPE_POS;
2513 Lmin = 0;
2514 Lmax = GET2(Fecode, 1);
2515 Fecode += 1 + IMM2_SIZE;
2516 goto REPEATTYPE;
2517
2518 case OP_TYPESTAR:
2519 case OP_TYPEMINSTAR:
2520 case OP_TYPEPLUS:
2521 case OP_TYPEMINPLUS:
2522 case OP_TYPEQUERY:
2523 case OP_TYPEMINQUERY:
2524 fc = *Fecode++ - OP_TYPESTAR;
2525 Lmin = rep_min[fc];
2526 Lmax = rep_max[fc];
2527 reptype = rep_typ[fc];
2528
2529 /* Common code for all repeated character type matches. */
2530
2531 REPEATTYPE:
2532 Lctype = *Fecode++; /* Code for the character type */
2533
2534 #ifdef SUPPORT_UNICODE
2535 if (Lctype == OP_PROP || Lctype == OP_NOTPROP)
2536 {
2537 proptype = *Fecode++;
2538 Lpropvalue = *Fecode++;
2539 }
2540 else proptype = -1;
2541 #endif
2542
2543 /* First, ensure the minimum number of matches are present. Use inline
2544 code for maximizing the speed, and do the type test once at the start
2545 (i.e. keep it out of the loop). The code for UTF mode is separated out for
2546 tidiness, except for Unicode property tests. */
2547
2548 if (Lmin > 0)
2549 {
2550 #ifdef SUPPORT_UNICODE
2551 if (proptype >= 0) /* Property tests in all modes */
2552 {
2553 switch(proptype)
2554 {
2555 case PT_ANY:
2556 if (Lctype == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2557 for (i = 1; i <= Lmin; i++)
2558 {
2559 if (Feptr >= mb->end_subject)
2560 {
2561 SCHECK_PARTIAL();
2562 RRETURN(MATCH_NOMATCH);
2563 }
2564 GETCHARINCTEST(fc, Feptr);
2565 }
2566 break;
2567
2568 case PT_LAMP:
2569 for (i = 1; i <= Lmin; i++)
2570 {
2571 int chartype;
2572 if (Feptr >= mb->end_subject)
2573 {
2574 SCHECK_PARTIAL();
2575 RRETURN(MATCH_NOMATCH);
2576 }
2577 GETCHARINCTEST(fc, Feptr);
2578 chartype = UCD_CHARTYPE(fc);
2579 if ((chartype == ucp_Lu ||
2580 chartype == ucp_Ll ||
2581 chartype == ucp_Lt) == (Lctype == OP_NOTPROP))
2582 RRETURN(MATCH_NOMATCH);
2583 }
2584 break;
2585
2586 case PT_GC:
2587 for (i = 1; i <= Lmin; i++)
2588 {
2589 if (Feptr >= mb->end_subject)
2590 {
2591 SCHECK_PARTIAL();
2592 RRETURN(MATCH_NOMATCH);
2593 }
2594 GETCHARINCTEST(fc, Feptr);
2595 if ((UCD_CATEGORY(fc) == Lpropvalue) == (Lctype == OP_NOTPROP))
2596 RRETURN(MATCH_NOMATCH);
2597 }
2598 break;
2599
2600 case PT_PC:
2601 for (i = 1; i <= Lmin; i++)
2602 {
2603 if (Feptr >= mb->end_subject)
2604 {
2605 SCHECK_PARTIAL();
2606 RRETURN(MATCH_NOMATCH);
2607 }
2608 GETCHARINCTEST(fc, Feptr);
2609 if ((UCD_CHARTYPE(fc) == Lpropvalue) == (Lctype == OP_NOTPROP))
2610 RRETURN(MATCH_NOMATCH);
2611 }
2612 break;
2613
2614 case PT_SC:
2615 for (i = 1; i <= Lmin; i++)
2616 {
2617 if (Feptr >= mb->end_subject)
2618 {
2619 SCHECK_PARTIAL();
2620 RRETURN(MATCH_NOMATCH);
2621 }
2622 GETCHARINCTEST(fc, Feptr);
2623 if ((UCD_SCRIPT(fc) == Lpropvalue) == (Lctype == OP_NOTPROP))
2624 RRETURN(MATCH_NOMATCH);
2625 }
2626 break;
2627
2628 case PT_ALNUM:
2629 for (i = 1; i <= Lmin; i++)
2630 {
2631 int category;
2632 if (Feptr >= mb->end_subject)
2633 {
2634 SCHECK_PARTIAL();
2635 RRETURN(MATCH_NOMATCH);
2636 }
2637 GETCHARINCTEST(fc, Feptr);
2638 category = UCD_CATEGORY(fc);
2639 if ((category == ucp_L || category == ucp_N) == (Lctype == OP_NOTPROP))
2640 RRETURN(MATCH_NOMATCH);
2641 }
2642 break;
2643
2644 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
2645 which means that Perl space and POSIX space are now identical. PCRE
2646 was changed at release 8.34. */
2647
2648 case PT_SPACE: /* Perl space */
2649 case PT_PXSPACE: /* POSIX space */
2650 for (i = 1; i <= Lmin; i++)
2651 {
2652 if (Feptr >= mb->end_subject)
2653 {
2654 SCHECK_PARTIAL();
2655 RRETURN(MATCH_NOMATCH);
2656 }
2657 GETCHARINCTEST(fc, Feptr);
2658 switch(fc)
2659 {
2660 HSPACE_CASES:
2661 VSPACE_CASES:
2662 if (Lctype == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2663 break;
2664
2665 default:
2666 if ((UCD_CATEGORY(fc) == ucp_Z) == (Lctype == OP_NOTPROP))
2667 RRETURN(MATCH_NOMATCH);
2668 break;
2669 }
2670 }
2671 break;
2672
2673 case PT_WORD:
2674 for (i = 1; i <= Lmin; i++)
2675 {
2676 int category;
2677 if (Feptr >= mb->end_subject)
2678 {
2679 SCHECK_PARTIAL();
2680 RRETURN(MATCH_NOMATCH);
2681 }
2682 GETCHARINCTEST(fc, Feptr);
2683 category = UCD_CATEGORY(fc);
2684 if ((category == ucp_L || category == ucp_N ||
2685 fc == CHAR_UNDERSCORE) == (Lctype == OP_NOTPROP))
2686 RRETURN(MATCH_NOMATCH);
2687 }
2688 break;
2689
2690 case PT_CLIST:
2691 for (i = 1; i <= Lmin; i++)
2692 {
2693 const uint32_t *cp;
2694 if (Feptr >= mb->end_subject)
2695 {
2696 SCHECK_PARTIAL();
2697 RRETURN(MATCH_NOMATCH);
2698 }
2699 GETCHARINCTEST(fc, Feptr);
2700 cp = PRIV(ucd_caseless_sets) + Lpropvalue;
2701 for (;;)
2702 {
2703 if (fc < *cp)
2704 {
2705 if (Lctype == OP_NOTPROP) break;
2706 RRETURN(MATCH_NOMATCH);
2707 }
2708 if (fc == *cp++)
2709 {
2710 if (Lctype == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2711 break;
2712 }
2713 }
2714 }
2715 break;
2716
2717 case PT_UCNC:
2718 for (i = 1; i <= Lmin; i++)
2719 {
2720 if (Feptr >= mb->end_subject)
2721 {
2722 SCHECK_PARTIAL();
2723 RRETURN(MATCH_NOMATCH);
2724 }
2725 GETCHARINCTEST(fc, Feptr);
2726 if ((fc == CHAR_DOLLAR_SIGN || fc == CHAR_COMMERCIAL_AT ||
2727 fc == CHAR_GRAVE_ACCENT || (fc >= 0xa0 && fc <= 0xd7ff) ||
2728 fc >= 0xe000) == (Lctype == OP_NOTPROP))
2729 RRETURN(MATCH_NOMATCH);
2730 }
2731 break;
2732
2733 /* This should not occur */
2734
2735 default:
2736 return PCRE2_ERROR_INTERNAL;
2737 }
2738 }
2739
2740 /* Match extended Unicode sequences. We will get here only if the
2741 support is in the binary; otherwise a compile-time error occurs. */
2742
2743 else if (Lctype == OP_EXTUNI)
2744 {
2745 for (i = 1; i <= Lmin; i++)
2746 {
2747 if (Feptr >= mb->end_subject)
2748 {
2749 SCHECK_PARTIAL();
2750 RRETURN(MATCH_NOMATCH);
2751 }
2752 else
2753 {
2754 GETCHARINCTEST(fc, Feptr);
2755 Feptr = PRIV(extuni)(fc, Feptr, mb->start_subject,
2756 mb->end_subject, utf, NULL);
2757 }
2758 CHECK_PARTIAL();
2759 }
2760 }
2761 else
2762 #endif /* SUPPORT_UNICODE */
2763
2764 /* Handle all other cases in UTF mode */
2765
2766 #ifdef SUPPORT_UNICODE
2767 if (utf) switch(Lctype)
2768 {
2769 case OP_ANY:
2770 for (i = 1; i <= Lmin; i++)
2771 {
2772 if (Feptr >= mb->end_subject)
2773 {
2774 SCHECK_PARTIAL();
2775 RRETURN(MATCH_NOMATCH);
2776 }
2777 if (IS_NEWLINE(Feptr)) RRETURN(MATCH_NOMATCH);
2778 if (mb->partial != 0 &&
2779 Feptr + 1 >= mb->end_subject &&
2780 NLBLOCK->nltype == NLTYPE_FIXED &&
2781 NLBLOCK->nllen == 2 &&
2782 UCHAR21(Feptr) == NLBLOCK->nl[0])
2783 {
2784 mb->hitend = TRUE;
2785 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
2786 }
2787 Feptr++;
2788 ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++);
2789 }
2790 break;
2791
2792 case OP_ALLANY:
2793 for (i = 1; i <= Lmin; i++)
2794 {
2795 if (Feptr >= mb->end_subject)
2796 {
2797 SCHECK_PARTIAL();
2798 RRETURN(MATCH_NOMATCH);
2799 }
2800 Feptr++;
2801 ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++);
2802 }
2803 break;
2804
2805 case OP_ANYBYTE:
2806 if (Feptr > mb->end_subject - Lmin) RRETURN(MATCH_NOMATCH);
2807 Feptr += Lmin;
2808 break;
2809
2810 case OP_ANYNL:
2811 for (i = 1; i <= Lmin; i++)
2812 {
2813 if (Feptr >= mb->end_subject)
2814 {
2815 SCHECK_PARTIAL();
2816 RRETURN(MATCH_NOMATCH);
2817 }
2818 GETCHARINC(fc, Feptr);
2819 switch(fc)
2820 {
2821 default: RRETURN(MATCH_NOMATCH);
2822
2823 case CHAR_CR:
2824 if (Feptr < mb->end_subject && UCHAR21(Feptr) == CHAR_LF) Feptr++;
2825 break;
2826
2827 case CHAR_LF:
2828 break;
2829
2830 case CHAR_VT:
2831 case CHAR_FF:
2832 case CHAR_NEL:
2833 #ifndef EBCDIC
2834 case 0x2028:
2835 case 0x2029:
2836 #endif /* Not EBCDIC */
2837 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) RRETURN(MATCH_NOMATCH);
2838 break;
2839 }
2840 }
2841 break;
2842
2843 case OP_NOT_HSPACE:
2844 for (i = 1; i <= Lmin; i++)
2845 {
2846 if (Feptr >= mb->end_subject)
2847 {
2848 SCHECK_PARTIAL();
2849 RRETURN(MATCH_NOMATCH);
2850 }
2851 GETCHARINC(fc, Feptr);
2852 switch(fc)
2853 {
2854 HSPACE_CASES: RRETURN(MATCH_NOMATCH);
2855 default: break;
2856 }
2857 }
2858 break;
2859
2860 case OP_HSPACE:
2861 for (i = 1; i <= Lmin; i++)
2862 {
2863 if (Feptr >= mb->end_subject)
2864 {
2865 SCHECK_PARTIAL();
2866 RRETURN(MATCH_NOMATCH);
2867 }
2868 GETCHARINC(fc, Feptr);
2869 switch(fc)
2870 {
2871 HSPACE_CASES: break;
2872 default: RRETURN(MATCH_NOMATCH);
2873 }
2874 }
2875 break;
2876
2877 case OP_NOT_VSPACE:
2878 for (i = 1; i <= Lmin; i++)
2879 {
2880 if (Feptr >= mb->end_subject)
2881 {
2882 SCHECK_PARTIAL();
2883 RRETURN(MATCH_NOMATCH);
2884 }
2885 GETCHARINC(fc, Feptr);
2886 switch(fc)
2887 {
2888 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
2889 default: break;
2890 }
2891 }
2892 break;
2893
2894 case OP_VSPACE:
2895 for (i = 1; i <= Lmin; i++)
2896 {
2897 if (Feptr >= mb->end_subject)
2898 {
2899 SCHECK_PARTIAL();
2900 RRETURN(MATCH_NOMATCH);
2901 }
2902 GETCHARINC(fc, Feptr);
2903 switch(fc)
2904 {
2905 VSPACE_CASES: break;
2906 default: RRETURN(MATCH_NOMATCH);
2907 }
2908 }
2909 break;
2910
2911 case OP_NOT_DIGIT:
2912 for (i = 1; i <= Lmin; i++)
2913 {
2914 if (Feptr >= mb->end_subject)
2915 {
2916 SCHECK_PARTIAL();
2917 RRETURN(MATCH_NOMATCH);
2918 }
2919 GETCHARINC(fc, Feptr);
2920 if (fc < 128 && (mb->ctypes[fc] & ctype_digit) != 0)
2921 RRETURN(MATCH_NOMATCH);
2922 }
2923 break;
2924
2925 case OP_DIGIT:
2926 for (i = 1; i <= Lmin; i++)
2927 {
2928 uint32_t cc;
2929 if (Feptr >= mb->end_subject)
2930 {
2931 SCHECK_PARTIAL();
2932 RRETURN(MATCH_NOMATCH);
2933 }
2934 cc = UCHAR21(Feptr);
2935 if (cc >= 128 || (mb->ctypes[cc] & ctype_digit) == 0)
2936 RRETURN(MATCH_NOMATCH);
2937 Feptr++;
2938 /* No need to skip more code units - we know it has only one. */
2939 }
2940 break;
2941
2942 case OP_NOT_WHITESPACE:
2943 for (i = 1; i <= Lmin; i++)
2944 {
2945 uint32_t cc;
2946 if (Feptr >= mb->end_subject)
2947 {
2948 SCHECK_PARTIAL();
2949 RRETURN(MATCH_NOMATCH);
2950 }
2951 cc = UCHAR21(Feptr);
2952 if (cc < 128 && (mb->ctypes[cc] & ctype_space) != 0)
2953 RRETURN(MATCH_NOMATCH);
2954 Feptr++;
2955 ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++);
2956 }
2957 break;
2958
2959 case OP_WHITESPACE:
2960 for (i = 1; i <= Lmin; i++)
2961 {
2962 uint32_t cc;
2963 if (Feptr >= mb->end_subject)
2964 {
2965 SCHECK_PARTIAL();
2966 RRETURN(MATCH_NOMATCH);
2967 }
2968 cc = UCHAR21(Feptr);
2969 if (cc >= 128 || (mb->ctypes[cc] & ctype_space) == 0)
2970 RRETURN(MATCH_NOMATCH);
2971 Feptr++;
2972 /* No need to skip more code units - we know it has only one. */
2973 }
2974 break;
2975
2976 case OP_NOT_WORDCHAR:
2977 for (i = 1; i <= Lmin; i++)
2978 {
2979 uint32_t cc;
2980 if (Feptr >= mb->end_subject)
2981 {
2982 SCHECK_PARTIAL();
2983 RRETURN(MATCH_NOMATCH);
2984 }
2985 cc = UCHAR21(Feptr);
2986 if (cc < 128 && (mb->ctypes[cc] & ctype_word) != 0)
2987 RRETURN(MATCH_NOMATCH);
2988 Feptr++;
2989 ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++);
2990 }
2991 break;
2992
2993 case OP_WORDCHAR:
2994 for (i = 1; i <= Lmin; i++)
2995 {
2996 uint32_t cc;
2997 if (Feptr >= mb->end_subject)
2998 {
2999 SCHECK_PARTIAL();
3000 RRETURN(MATCH_NOMATCH);
3001 }
3002 cc = UCHAR21(Feptr);
3003 if (cc >= 128 || (mb->ctypes[cc] & ctype_word) == 0)
3004 RRETURN(MATCH_NOMATCH);
3005 Feptr++;
3006 /* No need to skip more code units - we know it has only one. */
3007 }
3008 break;
3009
3010 default:
3011 return PCRE2_ERROR_INTERNAL;
3012 } /* End switch(Lctype) */
3013
3014 else
3015 #endif /* SUPPORT_UNICODE */
3016
3017 /* Code for the non-UTF case for minimum matching of operators other
3018 than OP_PROP and OP_NOTPROP. */
3019
3020 switch(Lctype)
3021 {
3022 case OP_ANY:
3023 for (i = 1; i <= Lmin; i++)
3024 {
3025 if (Feptr >= mb->end_subject)
3026 {
3027 SCHECK_PARTIAL();
3028 RRETURN(MATCH_NOMATCH);
3029 }
3030 if (IS_NEWLINE(Feptr)) RRETURN(MATCH_NOMATCH);
3031 if (mb->partial != 0 &&
3032 Feptr + 1 >= mb->end_subject &&
3033 NLBLOCK->nltype == NLTYPE_FIXED &&
3034 NLBLOCK->nllen == 2 &&
3035 *Feptr == NLBLOCK->nl[0])
3036 {
3037 mb->hitend = TRUE;
3038 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
3039 }
3040 Feptr++;
3041 }
3042 break;
3043
3044 case OP_ALLANY:
3045 if (Feptr > mb->end_subject - Lmin)
3046 {
3047 SCHECK_PARTIAL();
3048 RRETURN(MATCH_NOMATCH);
3049 }
3050 Feptr += Lmin;
3051 break;
3052
3053 /* This OP_ANYBYTE case will never be reached because \C gets turned
3054 into OP_ALLANY in non-UTF mode. Cut out the code so that coverage
3055 reports don't complain about its never being used. */
3056
3057 /* case OP_ANYBYTE:
3058 * if (Feptr > mb->end_subject - Lmin)
3059 * {
3060 * SCHECK_PARTIAL();
3061 * RRETURN(MATCH_NOMATCH);
3062 * }
3063 * Feptr += Lmin;
3064 * break;
3065 */
3066 case OP_ANYNL:
3067 for (i = 1; i <= Lmin; i++)
3068 {
3069 if (Feptr >= mb->end_subject)
3070 {
3071 SCHECK_PARTIAL();
3072 RRETURN(MATCH_NOMATCH);
3073 }
3074 switch(*Feptr++)
3075 {
3076 default: RRETURN(MATCH_NOMATCH);
3077
3078 case CHAR_CR:
3079 if (Feptr < mb->end_subject && *Feptr == CHAR_LF) Feptr++;
3080 break;
3081
3082 case CHAR_LF:
3083 break;
3084
3085 case CHAR_VT:
3086 case CHAR_FF:
3087 case CHAR_NEL:
3088 #if PCRE2_CODE_UNIT_WIDTH != 8
3089 case 0x2028:
3090 case 0x2029:
3091 #endif
3092 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) RRETURN(MATCH_NOMATCH);
3093 break;
3094 }
3095 }
3096 break;
3097
3098 case OP_NOT_HSPACE:
3099 for (i = 1; i <= Lmin; i++)
3100 {
3101 if (Feptr >= mb->end_subject)
3102 {
3103 SCHECK_PARTIAL();
3104 RRETURN(MATCH_NOMATCH);
3105 }
3106 switch(*Feptr++)
3107 {
3108 default: break;
3109 HSPACE_BYTE_CASES:
3110 #if PCRE2_CODE_UNIT_WIDTH != 8
3111 HSPACE_MULTIBYTE_CASES:
3112 #endif
3113 RRETURN(MATCH_NOMATCH);
3114 }
3115 }
3116 break;
3117
3118 case OP_HSPACE:
3119 for (i = 1; i <= Lmin; i++)
3120 {
3121 if (Feptr >= mb->end_subject)
3122 {
3123 SCHECK_PARTIAL();
3124 RRETURN(MATCH_NOMATCH);
3125 }
3126 switch(*Feptr++)
3127 {
3128 default: RRETURN(MATCH_NOMATCH);
3129 HSPACE_BYTE_CASES:
3130 #if PCRE2_CODE_UNIT_WIDTH != 8
3131 HSPACE_MULTIBYTE_CASES:
3132 #endif
3133 break;
3134 }
3135 }
3136 break;
3137
3138 case OP_NOT_VSPACE:
3139 for (i = 1; i <= Lmin; i++)
3140 {
3141 if (Feptr >= mb->end_subject)
3142 {
3143 SCHECK_PARTIAL();
3144 RRETURN(MATCH_NOMATCH);
3145 }
3146 switch(*Feptr++)
3147 {
3148 VSPACE_BYTE_CASES:
3149 #if PCRE2_CODE_UNIT_WIDTH != 8
3150 VSPACE_MULTIBYTE_CASES:
3151 #endif
3152 RRETURN(MATCH_NOMATCH);
3153 default: break;
3154 }
3155 }
3156 break;
3157
3158 case OP_VSPACE:
3159 for (i = 1; i <= Lmin; i++)
3160 {
3161 if (Feptr >= mb->end_subject)
3162 {
3163 SCHECK_PARTIAL();
3164 RRETURN(MATCH_NOMATCH);
3165 }
3166 switch(*Feptr++)
3167 {
3168 default: RRETURN(MATCH_NOMATCH);
3169 VSPACE_BYTE_CASES:
3170 #if PCRE2_CODE_UNIT_WIDTH != 8
3171 VSPACE_MULTIBYTE_CASES:
3172 #endif
3173 break;
3174 }
3175 }
3176 break;
3177
3178 case OP_NOT_DIGIT:
3179 for (i = 1; i <= Lmin; i++)
3180 {
3181 if (Feptr >= mb->end_subject)
3182 {
3183 SCHECK_PARTIAL();
3184 RRETURN(MATCH_NOMATCH);
3185 }
3186 if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_digit) != 0)
3187 RRETURN(MATCH_NOMATCH);
3188 Feptr++;
3189 }
3190 break;
3191
3192 case OP_DIGIT:
3193 for (i = 1; i <= Lmin; i++)
3194 {
3195 if (Feptr >= mb->end_subject)
3196 {
3197 SCHECK_PARTIAL();
3198 RRETURN(MATCH_NOMATCH);
3199 }
3200 if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_digit) == 0)
3201 RRETURN(MATCH_NOMATCH);
3202 Feptr++;
3203 }
3204 break;
3205
3206 case OP_NOT_WHITESPACE:
3207 for (i = 1; i <= Lmin; i++)
3208 {
3209 if (Feptr >= mb->end_subject)
3210 {
3211 SCHECK_PARTIAL();
3212 RRETURN(MATCH_NOMATCH);
3213 }
3214 if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_space) != 0)
3215 RRETURN(MATCH_NOMATCH);
3216 Feptr++;
3217 }
3218 break;
3219
3220 case OP_WHITESPACE:
3221 for (i = 1; i <= Lmin; i++)
3222 {
3223 if (Feptr >= mb->end_subject)
3224 {
3225 SCHECK_PARTIAL();
3226 RRETURN(MATCH_NOMATCH);
3227 }
3228 if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_space) == 0)
3229 RRETURN(MATCH_NOMATCH);
3230 Feptr++;
3231 }
3232 break;
3233
3234 case OP_NOT_WORDCHAR:
3235 for (i = 1; i <= Lmin; i++)
3236 {
3237 if (Feptr >= mb->end_subject)
3238 {
3239 SCHECK_PARTIAL();
3240 RRETURN(MATCH_NOMATCH);
3241 }
3242 if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_word) != 0)
3243 RRETURN(MATCH_NOMATCH);
3244 Feptr++;
3245 }
3246 break;
3247
3248 case OP_WORDCHAR:
3249 for (i = 1; i <= Lmin; i++)
3250 {
3251 if (Feptr >= mb->end_subject)
3252 {
3253 SCHECK_PARTIAL();
3254 RRETURN(MATCH_NOMATCH);
3255 }
3256 if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_word) == 0)
3257 RRETURN(MATCH_NOMATCH);
3258 Feptr++;
3259 }
3260 break;
3261
3262 default:
3263 return PCRE2_ERROR_INTERNAL;
3264 }
3265 }
3266
3267 /* If Lmin = Lmax we are done. Continue with the main loop. */
3268
3269 if (Lmin == Lmax) continue;
3270
3271 /* If minimizing, we have to test the rest of the pattern before each
3272 subsequent match. */
3273
3274 if (reptype == REPTYPE_MIN)
3275 {
3276 #ifdef SUPPORT_UNICODE
3277 if (proptype >= 0)
3278 {
3279 switch(proptype)
3280 {
3281 case PT_ANY:
3282 for (;;)
3283 {
3284 RMATCH(Fecode, RM208);
3285 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3286 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3287 if (Feptr >= mb->end_subject)
3288 {
3289 SCHECK_PARTIAL();
3290 RRETURN(MATCH_NOMATCH);
3291 }
3292 GETCHARINCTEST(fc, Feptr);
3293 if (Lctype == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
3294 }
3295 /* Control never gets here */
3296
3297 case PT_LAMP:
3298 for (;;)
3299 {
3300 int chartype;
3301 RMATCH(Fecode, RM209);
3302 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3303 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3304 if (Feptr >= mb->end_subject)
3305 {
3306 SCHECK_PARTIAL();
3307 RRETURN(MATCH_NOMATCH);
3308 }
3309 GETCHARINCTEST(fc, Feptr);
3310 chartype = UCD_CHARTYPE(fc);
3311 if ((chartype == ucp_Lu ||
3312 chartype == ucp_Ll ||
3313 chartype == ucp_Lt) == (Lctype == OP_NOTPROP))
3314 RRETURN(MATCH_NOMATCH);
3315 }
3316 /* Control never gets here */
3317
3318 case PT_GC:
3319 for (;;)
3320 {
3321 RMATCH(Fecode, RM210);
3322 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3323 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3324 if (Feptr >= mb->end_subject)
3325 {
3326 SCHECK_PARTIAL();
3327 RRETURN(MATCH_NOMATCH);
3328 }
3329 GETCHARINCTEST(fc, Feptr);
3330 if ((UCD_CATEGORY(fc) == Lpropvalue) == (Lctype == OP_NOTPROP))
3331 RRETURN(MATCH_NOMATCH);
3332 }
3333 /* Control never gets here */
3334
3335 case PT_PC:
3336 for (;;)
3337 {
3338 RMATCH(Fecode, RM211);
3339 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3340 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3341 if (Feptr >= mb->end_subject)
3342 {
3343 SCHECK_PARTIAL();
3344 RRETURN(MATCH_NOMATCH);
3345 }
3346 GETCHARINCTEST(fc, Feptr);
3347 if ((UCD_CHARTYPE(fc) == Lpropvalue) == (Lctype == OP_NOTPROP))
3348 RRETURN(MATCH_NOMATCH);
3349 }
3350 /* Control never gets here */
3351
3352 case PT_SC:
3353 for (;;)
3354 {
3355 RMATCH(Fecode, RM212);
3356 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3357 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3358 if (Feptr >= mb->end_subject)
3359 {
3360 SCHECK_PARTIAL();
3361 RRETURN(MATCH_NOMATCH);
3362 }
3363 GETCHARINCTEST(fc, Feptr);
3364 if ((UCD_SCRIPT(fc) == Lpropvalue) == (Lctype == OP_NOTPROP))
3365 RRETURN(MATCH_NOMATCH);
3366 }
3367 /* Control never gets here */
3368
3369 case PT_ALNUM:
3370 for (;;)
3371 {
3372 int category;
3373 RMATCH(Fecode, RM213);
3374 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3375 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3376 if (Feptr >= mb->end_subject)
3377 {
3378 SCHECK_PARTIAL();
3379 RRETURN(MATCH_NOMATCH);
3380 }
3381 GETCHARINCTEST(fc, Feptr);
3382 category = UCD_CATEGORY(fc);
3383 if ((category == ucp_L || category == ucp_N) ==
3384 (Lctype == OP_NOTPROP))
3385 RRETURN(MATCH_NOMATCH);
3386 }
3387 /* Control never gets here */
3388
3389 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
3390 which means that Perl space and POSIX space are now identical. PCRE
3391 was changed at release 8.34. */
3392
3393 case PT_SPACE: /* Perl space */
3394 case PT_PXSPACE: /* POSIX space */
3395 for (;;)
3396 {
3397 RMATCH(Fecode, RM214);
3398 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3399 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3400 if (Feptr >= mb->end_subject)
3401 {
3402 SCHECK_PARTIAL();
3403 RRETURN(MATCH_NOMATCH);
3404 }
3405 GETCHARINCTEST(fc, Feptr);
3406 switch(fc)
3407 {
3408 HSPACE_CASES:
3409 VSPACE_CASES:
3410 if (Lctype == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
3411 break;
3412
3413 default:
3414 if ((UCD_CATEGORY(fc) == ucp_Z) == (Lctype == OP_NOTPROP))
3415 RRETURN(MATCH_NOMATCH);
3416 break;
3417 }
3418 }
3419 /* Control never gets here */
3420
3421 case PT_WORD:
3422 for (;;)
3423 {
3424 int category;
3425 RMATCH(Fecode, RM215);
3426 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3427 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3428 if (Feptr >= mb->end_subject)
3429 {
3430 SCHECK_PARTIAL();
3431 RRETURN(MATCH_NOMATCH);
3432 }
3433 GETCHARINCTEST(fc, Feptr);
3434 category = UCD_CATEGORY(fc);
3435 if ((category == ucp_L ||
3436 category == ucp_N ||
3437 fc == CHAR_UNDERSCORE) == (Lctype == OP_NOTPROP))
3438 RRETURN(MATCH_NOMATCH);
3439 }
3440 /* Control never gets here */
3441
3442 case PT_CLIST:
3443 for (;;)
3444 {
3445 const uint32_t *cp;
3446 RMATCH(Fecode, RM216);
3447 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3448 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3449 if (Feptr >= mb->end_subject)
3450 {
3451 SCHECK_PARTIAL();
3452 RRETURN(MATCH_NOMATCH);
3453 }
3454 GETCHARINCTEST(fc, Feptr);
3455 cp = PRIV(ucd_caseless_sets) + Lpropvalue;
3456 for (;;)
3457 {
3458 if (fc < *cp)
3459 {
3460 if (Lctype == OP_NOTPROP) break;
3461 RRETURN(MATCH_NOMATCH);
3462 }
3463 if (fc == *cp++)
3464 {
3465 if (Lctype == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
3466 break;
3467 }
3468 }
3469 }
3470 /* Control never gets here */
3471
3472 case PT_UCNC:
3473 for (;;)
3474 {
3475 RMATCH(Fecode, RM217);
3476 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3477 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3478 if (Feptr >= mb->end_subject)
3479 {
3480 SCHECK_PARTIAL();
3481 RRETURN(MATCH_NOMATCH);
3482 }
3483 GETCHARINCTEST(fc, Feptr);
3484 if ((fc == CHAR_DOLLAR_SIGN || fc == CHAR_COMMERCIAL_AT ||
3485 fc == CHAR_GRAVE_ACCENT || (fc >= 0xa0 && fc <= 0xd7ff) ||
3486 fc >= 0xe000) == (Lctype == OP_NOTPROP))
3487 RRETURN(MATCH_NOMATCH);
3488 }
3489 /* Control never gets here */
3490
3491 /* This should never occur */
3492 default:
3493 return PCRE2_ERROR_INTERNAL;
3494 }
3495 }
3496
3497 /* Match extended Unicode sequences. We will get here only if the
3498 support is in the binary; otherwise a compile-time error occurs. */
3499
3500 else if (Lctype == OP_EXTUNI)
3501 {
3502 for (;;)
3503 {
3504 RMATCH(Fecode, RM218);
3505 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3506 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3507 if (Feptr >= mb->end_subject)
3508 {
3509 SCHECK_PARTIAL();
3510 RRETURN(MATCH_NOMATCH);
3511 }
3512 else
3513 {
3514 GETCHARINCTEST(fc, Feptr);
3515 Feptr = PRIV(extuni)(fc, Feptr, mb->start_subject, mb->end_subject,
3516 utf, NULL);
3517 }
3518 CHECK_PARTIAL();
3519 }
3520 }
3521 else
3522 #endif /* SUPPORT_UNICODE */
3523
3524 /* UTF mode for non-property testing character types. */
3525
3526 #ifdef SUPPORT_UNICODE
3527 if (utf)
3528 {
3529 for (;;)
3530 {
3531 RMATCH(Fecode, RM219);
3532 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3533 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3534 if (Feptr >= mb->end_subject)
3535 {
3536 SCHECK_PARTIAL();
3537 RRETURN(MATCH_NOMATCH);
3538 }
3539 if (Lctype == OP_ANY && IS_NEWLINE(Feptr)) RRETURN(MATCH_NOMATCH);
3540 GETCHARINC(fc, Feptr);
3541 switch(Lctype)
3542 {
3543 case OP_ANY: /* This is the non-NL case */
3544 if (mb->partial != 0 && /* Take care with CRLF partial */
3545 Feptr >= mb->end_subject &&
3546 NLBLOCK->nltype == NLTYPE_FIXED &&
3547 NLBLOCK->nllen == 2 &&
3548 fc == NLBLOCK->nl[0])
3549 {
3550 mb->hitend = TRUE;
3551 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
3552 }
3553 break;
3554
3555 case OP_ALLANY:
3556 case OP_ANYBYTE:
3557 break;
3558
3559 case OP_ANYNL:
3560 switch(fc)
3561 {
3562 default: RRETURN(MATCH_NOMATCH);
3563
3564 case CHAR_CR:
3565 if (Feptr < mb->end_subject && UCHAR21(Feptr) == CHAR_LF) Feptr++;
3566 break;
3567
3568 case CHAR_LF:
3569 break;
3570
3571 case CHAR_VT:
3572 case CHAR_FF:
3573 case CHAR_NEL:
3574 #ifndef EBCDIC
3575 case 0x2028:
3576 case 0x2029:
3577 #endif /* Not EBCDIC */
3578 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF)
3579 RRETURN(MATCH_NOMATCH);
3580 break;
3581 }
3582 break;
3583
3584 case OP_NOT_HSPACE:
3585 switch(fc)
3586 {
3587 HSPACE_CASES: RRETURN(MATCH_NOMATCH);
3588 default: break;
3589 }
3590 break;
3591
3592 case OP_HSPACE:
3593 switch(fc)
3594 {
3595 HSPACE_CASES: break;
3596 default: RRETURN(MATCH_NOMATCH);
3597 }
3598 break;
3599
3600 case OP_NOT_VSPACE:
3601 switch(fc)
3602 {
3603 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
3604 default: break;
3605 }
3606 break;
3607
3608 case OP_VSPACE:
3609 switch(fc)
3610 {
3611 VSPACE_CASES: break;
3612 default: RRETURN(MATCH_NOMATCH);
3613 }
3614 break;
3615
3616 case OP_NOT_DIGIT:
3617 if (fc < 256 && (mb->ctypes[fc] & ctype_digit) != 0)
3618 RRETURN(MATCH_NOMATCH);
3619 break;
3620
3621 case OP_DIGIT:
3622 if (fc >= 256 || (mb->ctypes[fc] & ctype_digit) == 0)
3623 RRETURN(MATCH_NOMATCH);
3624 break;
3625
3626 case OP_NOT_WHITESPACE:
3627 if (fc < 256 && (mb->ctypes[fc] & ctype_space) != 0)
3628 RRETURN(MATCH_NOMATCH);
3629 break;
3630
3631 case OP_WHITESPACE:
3632 if (fc >= 256 || (mb->ctypes[fc] & ctype_space) == 0)
3633 RRETURN(MATCH_NOMATCH);
3634 break;
3635
3636 case OP_NOT_WORDCHAR:
3637 if (fc < 256 && (mb->ctypes[fc] & ctype_word) != 0)
3638 RRETURN(MATCH_NOMATCH);
3639 break;
3640
3641 case OP_WORDCHAR:
3642 if (fc >= 256 || (mb->ctypes[fc] & ctype_word) == 0)
3643 RRETURN(MATCH_NOMATCH);
3644 break;
3645
3646 default:
3647 return PCRE2_ERROR_INTERNAL;
3648 }
3649 }
3650 }
3651 else
3652 #endif /* SUPPORT_UNICODE */
3653
3654 /* Not UTF mode */
3655 {
3656 for (;;)
3657 {
3658 RMATCH(Fecode, RM33);
3659 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3660 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3661 if (Feptr >= mb->end_subject)
3662 {
3663 SCHECK_PARTIAL();
3664 RRETURN(MATCH_NOMATCH);
3665 }
3666 if (Lctype == OP_ANY && IS_NEWLINE(Feptr))
3667 RRETURN(MATCH_NOMATCH);
3668 fc = *Feptr++;
3669 switch(Lctype)
3670 {
3671 case OP_ANY: /* This is the non-NL case */
3672 if (mb->partial != 0 && /* Take care with CRLF partial */
3673 Feptr >= mb->end_subject &&
3674 NLBLOCK->nltype == NLTYPE_FIXED &&
3675 NLBLOCK->nllen == 2 &&
3676 fc == NLBLOCK->nl[0])
3677 {
3678 mb->hitend = TRUE;
3679 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
3680 }
3681 break;
3682
3683 case OP_ALLANY:
3684 case OP_ANYBYTE:
3685 break;
3686
3687 case OP_ANYNL:
3688 switch(fc)
3689 {
3690 default: RRETURN(MATCH_NOMATCH);
3691
3692 case CHAR_CR:
3693 if (Feptr < mb->end_subject && *Feptr == CHAR_LF) Feptr++;
3694 break;
3695
3696 case CHAR_LF:
3697 break;
3698
3699 case CHAR_VT:
3700 case CHAR_FF:
3701 case CHAR_NEL:
3702 #if PCRE2_CODE_UNIT_WIDTH != 8
3703 case 0x2028:
3704 case 0x2029:
3705 #endif
3706 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF)
3707 RRETURN(MATCH_NOMATCH);
3708 break;
3709 }
3710 break;
3711
3712 case OP_NOT_HSPACE:
3713 switch(fc)
3714 {
3715 default: break;
3716 HSPACE_BYTE_CASES:
3717 #if PCRE2_CODE_UNIT_WIDTH != 8
3718 HSPACE_MULTIBYTE_CASES:
3719 #endif
3720 RRETURN(MATCH_NOMATCH);
3721 }
3722 break;
3723
3724 case OP_HSPACE:
3725 switch(fc)
3726 {
3727 default: RRETURN(MATCH_NOMATCH);
3728 HSPACE_BYTE_CASES:
3729 #if PCRE2_CODE_UNIT_WIDTH != 8
3730 HSPACE_MULTIBYTE_CASES:
3731 #endif
3732 break;
3733 }
3734 break;
3735
3736 case OP_NOT_VSPACE:
3737 switch(fc)
3738 {
3739 default: break;
3740 VSPACE_BYTE_CASES:
3741 #if PCRE2_CODE_UNIT_WIDTH != 8
3742 VSPACE_MULTIBYTE_CASES:
3743 #endif
3744 RRETURN(MATCH_NOMATCH);
3745 }
3746 break;
3747
3748 case OP_VSPACE:
3749 switch(fc)
3750 {
3751 default: RRETURN(MATCH_NOMATCH);
3752 VSPACE_BYTE_CASES:
3753 #if PCRE2_CODE_UNIT_WIDTH != 8
3754 VSPACE_MULTIBYTE_CASES:
3755 #endif
3756 break;
3757 }
3758 break;
3759
3760 case OP_NOT_DIGIT:
3761 if (MAX_255(fc) && (mb->ctypes[fc] & ctype_digit) != 0)
3762 RRETURN(MATCH_NOMATCH);
3763 break;
3764
3765 case OP_DIGIT:
3766 if (!MAX_255(fc) || (mb->ctypes[fc] & ctype_digit) == 0)
3767 RRETURN(MATCH_NOMATCH);
3768 break;
3769
3770 case OP_NOT_WHITESPACE:
3771 if (MAX_255(fc) && (mb->ctypes[fc] & ctype_space) != 0)
3772 RRETURN(MATCH_NOMATCH);
3773 break;
3774
3775 case OP_WHITESPACE:
3776 if (!MAX_255(fc) || (mb->ctypes[fc] & ctype_space) == 0)
3777 RRETURN(MATCH_NOMATCH);
3778 break;
3779
3780 case OP_NOT_WORDCHAR:
3781 if (MAX_255(fc) && (mb->ctypes[fc] & ctype_word) != 0)
3782 RRETURN(MATCH_NOMATCH);
3783 break;
3784
3785 case OP_WORDCHAR:
3786 if (!MAX_255(fc) || (mb->ctypes[fc] & ctype_word) == 0)
3787 RRETURN(MATCH_NOMATCH);
3788 break;
3789
3790 default:
3791 return PCRE2_ERROR_INTERNAL;
3792 }
3793 }
3794 }
3795 /* Control never gets here */
3796 }
3797
3798 /* If maximizing, it is worth using inline code for speed, doing the type
3799 test once at the start (i.e. keep it out of the loop). */
3800
3801 else
3802 {
3803 Lstart_eptr = Feptr; /* Remember where we started */
3804
3805 #ifdef SUPPORT_UNICODE
3806 if (proptype >= 0)
3807 {
3808 switch(proptype)
3809 {
3810 case PT_ANY:
3811 for (i = Lmin; i < Lmax; i++)
3812 {
3813 int len = 1;
3814 if (Feptr >= mb->end_subject)
3815 {
3816 SCHECK_PARTIAL();
3817 break;
3818 }
3819 GETCHARLENTEST(fc, Feptr, len);
3820 if (Lctype == OP_NOTPROP) break;
3821 Feptr+= len;
3822 }
3823 break;
3824
3825 case PT_LAMP:
3826 for (i = Lmin; i < Lmax; i++)
3827 {
3828 int chartype;
3829 int len = 1;
3830 if (Feptr >= mb->end_subject)
3831 {
3832 SCHECK_PARTIAL();
3833 break;
3834 }
3835 GETCHARLENTEST(fc, Feptr, len);
3836 chartype = UCD_CHARTYPE(fc);
3837 if ((chartype == ucp_Lu ||
3838 chartype == ucp_Ll ||
3839 chartype == ucp_Lt) == (Lctype == OP_NOTPROP))
3840 break;
3841 Feptr+= len;
3842 }
3843 break;
3844
3845 case PT_GC:
3846 for (i = Lmin; i < Lmax; i++)
3847 {
3848 int len = 1;
3849 if (Feptr >= mb->end_subject)
3850 {
3851 SCHECK_PARTIAL();
3852 break;
3853 }
3854 GETCHARLENTEST(fc, Feptr, len);
3855 if ((UCD_CATEGORY(fc) == Lpropvalue) == (Lctype == OP_NOTPROP))
3856 break;
3857 Feptr+= len;
3858 }
3859 break;
3860
3861 case PT_PC:
3862 for (i = Lmin; i < Lmax; i++)
3863 {
3864 int len = 1;
3865 if (Feptr >= mb->end_subject)
3866 {
3867 SCHECK_PARTIAL();
3868 break;
3869 }
3870 GETCHARLENTEST(fc, Feptr, len);
3871 if ((UCD_CHARTYPE(fc) == Lpropvalue) == (Lctype == OP_NOTPROP))
3872 break;
3873 Feptr+= len;
3874 }
3875 break;
3876
3877 case PT_SC:
3878 for (i = Lmin; i < Lmax; i++)
3879 {
3880 int len = 1;
3881 if (Feptr >= mb->end_subject)
3882 {
3883 SCHECK_PARTIAL();
3884 break;
3885 }
3886 GETCHARLENTEST(fc, Feptr, len);
3887 if ((UCD_SCRIPT(fc) == Lpropvalue) == (Lctype == OP_NOTPROP))
3888 break;
3889 Feptr+= len;
3890 }
3891 break;
3892
3893 case PT_ALNUM:
3894 for (i = Lmin; i < Lmax; i++)
3895 {
3896 int category;
3897 int len = 1;
3898 if (Feptr >= mb->end_subject)
3899 {
3900 SCHECK_PARTIAL();
3901 break;
3902 }
3903 GETCHARLENTEST(fc, Feptr, len);
3904 category = UCD_CATEGORY(fc);
3905 if ((category == ucp_L || category == ucp_N) ==
3906 (Lctype == OP_NOTPROP))
3907 break;
3908 Feptr+= len;
3909 }
3910 break;
3911
3912 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
3913 which means that Perl space and POSIX space are now identical. PCRE
3914 was changed at release 8.34. */
3915
3916 case PT_SPACE: /* Perl space */
3917 case PT_PXSPACE: /* POSIX space */
3918 for (i = Lmin; i < Lmax; i++)
3919 {
3920 int len = 1;
3921 if (Feptr >= mb->end_subject)
3922 {
3923 SCHECK_PARTIAL();
3924 break;
3925 }
3926 GETCHARLENTEST(fc, Feptr, len);
3927 switch(fc)
3928 {
3929 HSPACE_CASES:
3930 VSPACE_CASES:
3931 if (Lctype == OP_NOTPROP) goto ENDLOOP99; /* Break the loop */
3932 break;
3933
3934 default:
3935 if ((UCD_CATEGORY(fc) == ucp_Z) == (Lctype == OP_NOTPROP))
3936 goto ENDLOOP99; /* Break the loop */
3937 break;
3938 }
3939 Feptr+= len;
3940 }
3941 ENDLOOP99:
3942 break;
3943
3944 case PT_WORD:
3945 for (i = Lmin; i < Lmax; i++)
3946 {
3947 int category;
3948 int len = 1;
3949 if (Feptr >= mb->end_subject)
3950 {
3951 SCHECK_PARTIAL();
3952 break;
3953 }
3954 GETCHARLENTEST(fc, Feptr, len);
3955 category = UCD_CATEGORY(fc);
3956 if ((category == ucp_L || category == ucp_N ||
3957 fc == CHAR_UNDERSCORE) == (Lctype == OP_NOTPROP))
3958 break;
3959 Feptr+= len;
3960 }
3961 break;
3962
3963 case PT_CLIST:
3964 for (i = Lmin; i < Lmax; i++)
3965 {
3966 const uint32_t *cp;
3967 int len = 1;
3968 if (Feptr >= mb->end_subject)
3969 {
3970 SCHECK_PARTIAL();
3971 break;
3972 }
3973 GETCHARLENTEST(fc, Feptr, len);
3974 cp = PRIV(ucd_caseless_sets) + Lpropvalue;
3975 for (;;)
3976 {
3977 if (fc < *cp)
3978 { if (Lctype == OP_NOTPROP) break; else goto GOT_MAX; }
3979 if (fc == *cp++)
3980 { if (Lctype == OP_NOTPROP) goto GOT_MAX; else break; }
3981 }
3982 Feptr += len;
3983 }
3984 GOT_MAX:
3985 break;
3986
3987 case PT_UCNC:
3988 for (i = Lmin; i < Lmax; i++)
3989 {
3990 int len = 1;
3991 if (Feptr >= mb->end_subject)
3992 {
3993 SCHECK_PARTIAL();
3994 break;
3995 }
3996 GETCHARLENTEST(fc, Feptr, len);
3997 if ((fc == CHAR_DOLLAR_SIGN || fc == CHAR_COMMERCIAL_AT ||
3998 fc == CHAR_GRAVE_ACCENT || (fc >= 0xa0 && fc <= 0xd7ff) ||
3999 fc >= 0xe000) == (Lctype == OP_NOTPROP))
4000 break;
4001 Feptr += len;
4002 }
4003 break;
4004
4005 default:
4006 return PCRE2_ERROR_INTERNAL;
4007 }
4008
4009 /* Feptr is now past the end of the maximum run */
4010
4011 if (reptype == REPTYPE_POS) continue; /* No backtracking */
4012
4013 /* After \C in UTF mode, Lstart_eptr might be in the middle of a
4014 Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't
4015 go too far. */
4016
4017 for(;;)
4018 {
4019 if (Feptr <= Lstart_eptr) break;
4020 RMATCH(Fecode, RM222);
4021 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4022 Feptr--;
4023 if (utf) BACKCHAR(Feptr);
4024 }
4025 }
4026
4027 /* Match extended Unicode grapheme clusters. We will get here only if the
4028 support is in the binary; otherwise a compile-time error occurs. */
4029
4030 else if (Lctype == OP_EXTUNI)
4031 {
4032 for (i = Lmin; i < Lmax; i++)
4033 {
4034 if (Feptr >= mb->end_subject)
4035 {
4036 SCHECK_PARTIAL();
4037 break;
4038 }
4039 else
4040 {
4041 GETCHARINCTEST(fc, Feptr);
4042 Feptr = PRIV(extuni)(fc, Feptr, mb->start_subject, mb->end_subject,
4043 utf, NULL);
4044 }
4045 CHECK_PARTIAL();
4046 }
4047
4048 /* Feptr is now past the end of the maximum run */
4049
4050 if (reptype == REPTYPE_POS) continue; /* No backtracking */
4051
4052 /* We use <= Lstart_eptr rather than == Lstart_eptr to detect the start
4053 of the run while backtracking because the use of \C in UTF mode can
4054 cause BACKCHAR to move back past Lstart_eptr. This is just palliative;
4055 the use of \C in UTF mode is fraught with danger. */
4056
4057 for(;;)
4058 {
4059 int lgb, rgb;
4060 PCRE2_SPTR fptr;
4061
4062 if (Feptr <= Lstart_eptr) break; /* At start of char run */
4063 RMATCH(Fecode, RM220);
4064 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4065
4066 /* Backtracking over an extended grapheme cluster involves inspecting
4067 the previous two characters (if present) to see if a break is
4068 permitted between them. */
4069
4070 Feptr--;
4071 if (!utf) fc = *Feptr; else
4072 {
4073 BACKCHAR(Feptr);
4074 GETCHAR(fc, Feptr);
4075 }
4076 rgb = UCD_GRAPHBREAK(fc);
4077
4078 for (;;)
4079 {
4080 if (Feptr <= Lstart_eptr) break; /* At start of char run */
4081 fptr = Feptr - 1;
4082 if (!utf) fc = *fptr; else
4083 {
4084 BACKCHAR(fptr);
4085 GETCHAR(fc, fptr);
4086 }
4087 lgb = UCD_GRAPHBREAK(fc);
4088 if ((PRIV(ucp_gbtable)[lgb] & (1u << rgb)) == 0) break;
4089 Feptr = fptr;
4090 rgb = lgb;
4091 }
4092 }
4093 }
4094
4095 else
4096 #endif /* SUPPORT_UNICODE */
4097
4098 #ifdef SUPPORT_UNICODE
4099 if (utf)
4100 {
4101 switch(Lctype)
4102 {
4103 case OP_ANY:
4104 for (i = Lmin; i < Lmax; i++)
4105 {
4106 if (Feptr >= mb->end_subject)
4107 {
4108 SCHECK_PARTIAL();
4109 break;
4110 }
4111 if (IS_NEWLINE(Feptr)) break;
4112 if (mb->partial != 0 && /* Take care with CRLF partial */
4113 Feptr + 1 >= mb->end_subject &&
4114 NLBLOCK->nltype == NLTYPE_FIXED &&
4115 NLBLOCK->nllen == 2 &&
4116 UCHAR21(Feptr) == NLBLOCK->nl[0])
4117 {
4118 mb->hitend = TRUE;
4119 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
4120 }
4121 Feptr++;
4122 ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++);
4123 }
4124 break;
4125
4126 case OP_ALLANY:
4127 if (Lmax < UINT32_MAX)
4128 {
4129 for (i = Lmin; i < Lmax; i++)
4130 {
4131 if (Feptr >= mb->end_subject)
4132 {
4133 SCHECK_PARTIAL();
4134 break;
4135 }
4136 Feptr++;
4137 ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++);
4138 }
4139 }
4140 else
4141 {
4142 Feptr = mb->end_subject; /* Unlimited UTF-8 repeat */
4143 SCHECK_PARTIAL();
4144 }
4145 break;
4146
4147 /* The "byte" (i.e. "code unit") case is the same as non-UTF */
4148
4149 case OP_ANYBYTE:
4150 fc = Lmax - Lmin;
4151 if (fc > (uint32_t)(mb->end_subject - Feptr))
4152 {
4153 Feptr = mb->end_subject;
4154 SCHECK_PARTIAL();
4155 }
4156 else Feptr += fc;
4157 break;
4158
4159 case OP_ANYNL:
4160 for (i = Lmin; i < Lmax; i++)
4161 {
4162 int len = 1;
4163 if (Feptr >= mb->end_subject)
4164 {
4165 SCHECK_PARTIAL();
4166 break;
4167 }
4168 GETCHARLEN(fc, Feptr, len);
4169 if (fc == CHAR_CR)
4170 {
4171 if (++Feptr >= mb->end_subject) break;
4172 if (UCHAR21(Feptr) == CHAR_LF) Feptr++;
4173 }
4174 else
4175 {
4176 if (fc != CHAR_LF &&
4177 (mb->bsr_convention == PCRE2_BSR_ANYCRLF ||
4178 (fc != CHAR_VT && fc != CHAR_FF && fc != CHAR_NEL
4179 #ifndef EBCDIC
4180 && fc != 0x2028 && fc != 0x2029
4181 #endif /* Not EBCDIC */
4182 )))
4183 break;
4184 Feptr += len;
4185 }
4186 }
4187 break;
4188
4189 case OP_NOT_HSPACE:
4190 case OP_HSPACE:
4191 for (i = Lmin; i < Lmax; i++)
4192 {
4193 BOOL gotspace;
4194 int len = 1;
4195 if (Feptr >= mb->end_subject)
4196 {
4197 SCHECK_PARTIAL();
4198 break;
4199 }
4200 GETCHARLEN(fc, Feptr, len);
4201 switch(fc)
4202 {
4203 HSPACE_CASES: gotspace = TRUE; break;
4204 default: gotspace = FALSE; break;
4205 }
4206 if (gotspace == (Lctype == OP_NOT_HSPACE)) break;
4207 Feptr += len;
4208 }
4209 break;
4210
4211 case OP_NOT_VSPACE:
4212 case OP_VSPACE:
4213 for (i = Lmin; i < Lmax; i++)
4214 {
4215 BOOL gotspace;
4216 int len = 1;
4217 if (Feptr >= mb->end_subject)
4218 {
4219 SCHECK_PARTIAL();
4220 break;
4221 }
4222 GETCHARLEN(fc, Feptr, len);
4223 switch(fc)
4224 {
4225 VSPACE_CASES: gotspace = TRUE; break;
4226 default: gotspace = FALSE; break;
4227 }
4228 if (gotspace == (Lctype == OP_NOT_VSPACE)) break;
4229 Feptr += len;
4230 }
4231 break;
4232
4233 case OP_NOT_DIGIT:
4234 for (i = Lmin; i < Lmax; i++)
4235 {
4236 int len = 1;
4237 if (Feptr >= mb->end_subject)
4238 {
4239 SCHECK_PARTIAL();
4240 break;
4241 }
4242 GETCHARLEN(fc, Feptr, len);
4243 if (fc < 256 && (mb->ctypes[fc] & ctype_digit) != 0) break;
4244 Feptr+= len;
4245 }
4246 break;
4247
4248 case OP_DIGIT:
4249 for (i = Lmin; i < Lmax; i++)
4250 {
4251 int len = 1;
4252 if (Feptr >= mb->end_subject)
4253 {
4254 SCHECK_PARTIAL();
4255 break;
4256 }
4257 GETCHARLEN(fc, Feptr, len);
4258 if (fc >= 256 ||(mb->ctypes[fc] & ctype_digit) == 0) break;
4259 Feptr+= len;
4260 }
4261 break;
4262
4263 case OP_NOT_WHITESPACE:
4264 for (i = Lmin; i < Lmax; i++)
4265 {
4266 int len = 1;
4267 if (Feptr >= mb->end_subject)
4268 {
4269 SCHECK_PARTIAL();
4270 break;
4271 }
4272 GETCHARLEN(fc, Feptr, len);
4273 if (fc < 256 && (mb->ctypes[fc] & ctype_space) != 0) break;
4274 Feptr+= len;
4275 }
4276 break;
4277
4278 case OP_WHITESPACE:
4279 for (i = Lmin; i < Lmax; i++)
4280 {
4281 int len = 1;
4282 if (Feptr >= mb->end_subject)
4283 {
4284 SCHECK_PARTIAL();
4285 break;
4286 }
4287 GETCHARLEN(fc, Feptr, len);
4288 if (fc >= 256 ||(mb->ctypes[fc] & ctype_space) == 0) break;
4289 Feptr+= len;
4290 }
4291 break;
4292
4293 case OP_NOT_WORDCHAR:
4294 for (i = Lmin; i < Lmax; i++)
4295 {
4296 int len = 1;
4297 if (Feptr >= mb->end_subject)
4298 {
4299 SCHECK_PARTIAL();
4300 break;
4301 }
4302 GETCHARLEN(fc, Feptr, len);
4303 if (fc < 256 && (mb->ctypes[fc] & ctype_word) != 0) break;
4304 Feptr+= len;
4305 }
4306 break;
4307
4308 case OP_WORDCHAR:
4309 for (i = Lmin; i < Lmax; i++)
4310 {
4311 int len = 1;
4312 if (Feptr >= mb->end_subject)
4313 {
4314 SCHECK_PARTIAL();
4315 break;
4316 }
4317 GETCHARLEN(fc, Feptr, len);
4318 if (fc >= 256 || (mb->ctypes[fc] & ctype_word) == 0) break;
4319 Feptr+= len;
4320 }
4321 break;
4322
4323 default:
4324 return PCRE2_ERROR_INTERNAL;
4325 }
4326
4327 if (reptype == REPTYPE_POS) continue; /* No backtracking */
4328
4329 /* After \C in UTF mode, Lstart_eptr might be in the middle of a
4330 Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't go
4331 too far. */
4332
4333 for(;;)
4334 {
4335 if (Feptr <= Lstart_eptr) break;
4336 RMATCH(Fecode, RM221);
4337 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4338 Feptr--;
4339 BACKCHAR(Feptr);
4340 if (Lctype == OP_ANYNL && Feptr > Lstart_eptr &&
4341 UCHAR21(Feptr) == CHAR_NL && UCHAR21(Feptr - 1) == CHAR_CR)
4342 Feptr--;
4343 }
4344 }
4345 else
4346 #endif /* SUPPORT_UNICODE */
4347
4348 /* Not UTF mode */
4349 {
4350 switch(Lctype)
4351 {
4352 case OP_ANY:
4353 for (i = Lmin; i < Lmax; i++)
4354 {
4355 if (Feptr >= mb->end_subject)
4356 {
4357 SCHECK_PARTIAL();
4358 break;
4359 }
4360 if (IS_NEWLINE(Feptr)) break;
4361 if (mb->partial != 0 && /* Take care with CRLF partial */
4362 Feptr + 1 >= mb->end_subject &&
4363 NLBLOCK->nltype == NLTYPE_FIXED &&
4364 NLBLOCK->nllen == 2 &&
4365 *Feptr == NLBLOCK->nl[0])
4366 {
4367 mb->hitend = TRUE;
4368 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
4369 }
4370 Feptr++;
4371 }
4372 break;
4373
4374 case OP_ALLANY:
4375 case OP_ANYBYTE:
4376 fc = Lmax - Lmin;
4377 if (fc > (uint32_t)(mb->end_subject - Feptr))
4378 {
4379 Feptr = mb->end_subject;
4380 SCHECK_PARTIAL();
4381 }
4382 else Feptr += fc;
4383 break;
4384
4385 case OP_ANYNL:
4386 for (i = Lmin; i < Lmax; i++)
4387 {
4388 if (Feptr >= mb->end_subject)
4389 {
4390 SCHECK_PARTIAL();
4391 break;
4392 }
4393 fc = *Feptr;
4394 if (fc == CHAR_CR)
4395 {
4396 if (++Feptr >= mb->end_subject) break;
4397 if (*Feptr == CHAR_LF) Feptr++;
4398 }
4399 else
4400 {
4401 if (fc != CHAR_LF && (mb->bsr_convention == PCRE2_BSR_ANYCRLF ||
4402 (fc != CHAR_VT && fc != CHAR_FF && fc != CHAR_NEL
4403 #if PCRE2_CODE_UNIT_WIDTH != 8
4404 && fc != 0x2028 && fc != 0x2029
4405 #endif
4406 ))) break;
4407 Feptr++;
4408 }
4409 }
4410 break;
4411
4412 case OP_NOT_HSPACE:
4413 for (i = Lmin; i < Lmax; i++)
4414 {
4415 if (Feptr >= mb->end_subject)
4416 {
4417 SCHECK_PARTIAL();
4418 break;
4419 }
4420 switch(*Feptr)
4421 {
4422 default: Feptr++; break;
4423 HSPACE_BYTE_CASES:
4424 #if PCRE2_CODE_UNIT_WIDTH != 8
4425 HSPACE_MULTIBYTE_CASES:
4426 #endif
4427 goto ENDLOOP00;
4428 }
4429 }
4430 ENDLOOP00:
4431 break;
4432
4433 case OP_HSPACE:
4434 for (i = Lmin; i < Lmax; i++)
4435 {
4436 if (Feptr >= mb->end_subject)
4437 {
4438 SCHECK_PARTIAL();
4439 break;
4440 }
4441 switch(*Feptr)
4442 {
4443 default: goto ENDLOOP01;
4444 HSPACE_BYTE_CASES:
4445 #if PCRE2_CODE_UNIT_WIDTH != 8
4446 HSPACE_MULTIBYTE_CASES:
4447 #endif
4448 Feptr++; break;
4449 }
4450 }
4451 ENDLOOP01:
4452 break;
4453
4454 case OP_NOT_VSPACE:
4455 for (i = Lmin; i < Lmax; i++)
4456 {
4457 if (Feptr >= mb->end_subject)
4458 {
4459 SCHECK_PARTIAL();
4460 break;
4461 }
4462 switch(*Feptr)
4463 {
4464 default: Feptr++; break;
4465 VSPACE_BYTE_CASES:
4466 #if PCRE2_CODE_UNIT_WIDTH != 8
4467 VSPACE_MULTIBYTE_CASES:
4468 #endif
4469 goto ENDLOOP02;
4470 }
4471 }
4472 ENDLOOP02:
4473 break;
4474
4475 case OP_VSPACE:
4476 for (i = Lmin; i < Lmax; i++)
4477 {
4478 if (Feptr >= mb->end_subject)
4479 {
4480 SCHECK_PARTIAL();
4481 break;
4482 }
4483 switch(*Feptr)
4484 {
4485 default: goto ENDLOOP03;
4486 VSPACE_BYTE_CASES:
4487 #if PCRE2_CODE_UNIT_WIDTH != 8
4488 VSPACE_MULTIBYTE_CASES:
4489 #endif
4490 Feptr++; break;
4491 }
4492 }
4493 ENDLOOP03:
4494 break;
4495
4496 case OP_NOT_DIGIT:
4497 for (i = Lmin; i < Lmax; i++)
4498 {
4499 if (Feptr >= mb->end_subject)
4500 {
4501 SCHECK_PARTIAL();
4502 break;
4503 }
4504 if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_digit) != 0)
4505 break;
4506 Feptr++;
4507 }
4508 break;
4509
4510 case OP_DIGIT:
4511 for (i = Lmin; i < Lmax; i++)
4512 {
4513 if (Feptr >= mb->end_subject)
4514 {
4515 SCHECK_PARTIAL();
4516 break;
4517 }
4518 if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_digit) == 0)
4519 break;
4520 Feptr++;
4521 }
4522 break;
4523
4524 case OP_NOT_WHITESPACE:
4525 for (i = Lmin; i < Lmax; i++)
4526 {
4527 if (Feptr >= mb->end_subject)
4528 {
4529 SCHECK_PARTIAL();
4530 break;
4531 }
4532 if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_space) != 0)
4533 break;
4534 Feptr++;
4535 }
4536 break;
4537
4538 case OP_WHITESPACE:
4539 for (i = Lmin; i < Lmax; i++)
4540 {
4541 if (Feptr >= mb->end_subject)
4542 {
4543 SCHECK_PARTIAL();
4544 break;
4545 }
4546 if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_space) == 0)
4547 break;
4548 Feptr++;
4549 }
4550 break;
4551
4552 case OP_NOT_WORDCHAR:
4553 for (i = Lmin; i < Lmax; i++)
4554 {
4555 if (Feptr >= mb->end_subject)
4556 {
4557 SCHECK_PARTIAL();
4558 break;
4559 }
4560 if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_word) != 0)
4561 break;
4562 Feptr++;
4563 }
4564 break;
4565
4566 case OP_WORDCHAR:
4567 for (i = Lmin; i < Lmax; i++)
4568 {
4569 if (Feptr >= mb->end_subject)
4570 {
4571 SCHECK_PARTIAL();
4572 break;
4573 }
4574 if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_word) == 0)
4575 break;
4576 Feptr++;
4577 }
4578 break;
4579
4580 default:
4581 return PCRE2_ERROR_INTERNAL;
4582 }
4583
4584 if (reptype == REPTYPE_POS) continue; /* No backtracking */
4585
4586 for (;;)
4587 {
4588 if (Feptr == Lstart_eptr) break;
4589 RMATCH(Fecode, RM34);
4590 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4591 Feptr--;
4592 if (Lctype == OP_ANYNL && Feptr > Lstart_eptr && *Feptr == CHAR_LF &&
4593 Feptr[-1] == CHAR_CR) Feptr--;
4594 }
4595 }
4596 }
4597 break; /* End of repeat character type processing */
4598
4599 #undef Lstart_eptr
4600 #undef Lmin
4601 #undef Lmax
4602 #undef Lctype
4603 #undef Lpropvalue
4604
4605
4606 /* ===================================================================== */
4607 /* Match a back reference, possibly repeatedly. Look past the end of the
4608 item to see if there is repeat information following. The OP_REF and
4609 OP_REFI opcodes are used for a reference to a numbered group or to a
4610 non-duplicated named group. For a duplicated named group, OP_DNREF and
4611 OP_DNREFI are used. In this case we must scan the list of groups to which
4612 the name refers, and use the first one that is set. */
4613
/* Frame-resident temporaries used while handling a back reference: Lmin and
Lmax are the repeat limits, Lcaseless is non-zero for the caseless opcodes,
Lstart is the subject position at which greedy repetition began, and Loffset
is the index in Fovector of the referenced group's start offset (its end
offset is at Loffset + 1). */
4614 #define Lmin F->temp_32[0]
4615 #define Lmax F->temp_32[1]
4616 #define Lcaseless F->temp_32[2]
4617 #define Lstart F->temp_sptr[0]
4618 #define Loffset F->temp_size
4619
4620 case OP_DNREF:
4621 case OP_DNREFI:
4622 Lcaseless = (Fop == OP_DNREFI);
4623 {
/* The opcode is followed by two IMM2 values: the index of the first name
table entry for this name, then the number of entries that share the name. */
4624 int count = GET2(Fecode, 1+IMM2_SIZE);
4625 PCRE2_SPTR slot = mb->name_table + GET2(Fecode, 1) * mb->name_entry_size;
4626 Fecode += 1 + 2*IMM2_SIZE;
4627
/* Scan the entries and stop at the first group that is set. If none is set,
Loffset is left referring to the last group that was examined. */
4628 while (count-- > 0)
4629 {
/* Group number n occupies Fovector[2n-2] and Fovector[2n-1]. */
4630 Loffset = (GET2(slot, 0) << 1) - 2;
4631 if (Loffset < Foffset_top && Fovector[Loffset] != PCRE2_UNSET) break;
4632 slot += mb->name_entry_size;
4633 }
4634 }
4635 goto REF_REPEAT;
4636
4637 case OP_REF:
4638 case OP_REFI:
4639 Lcaseless = (Fop == OP_REFI);
/* Convert the group number that follows the opcode into an Fovector index. */
4640 Loffset = (GET2(Fecode, 1) << 1) - 2;
4641 Fecode += 1 + IMM2_SIZE;
4642
4643 /* Set up for repetition, or handle the non-repeated case. The maximum and
4644 minimum must be in the heap frame, but as they are short-term values, we
4645 use temporary fields. */
4646
4647 REF_REPEAT:
4648 switch (*Fecode)
4649 {
4650 case OP_CRSTAR:
4651 case OP_CRMINSTAR:
4652 case OP_CRPLUS:
4653 case OP_CRMINPLUS:
4654 case OP_CRQUERY:
4655 case OP_CRMINQUERY:
/* The distance of the opcode from OP_CRSTAR indexes the rep_min, rep_max and
rep_typ tables; the repeat opcode itself is consumed here. */
4656 fc = *Fecode++ - OP_CRSTAR;
4657 Lmin = rep_min[fc];
4658 Lmax = rep_max[fc];
4659 reptype = rep_typ[fc];
4660 break;
4661
4662 case OP_CRRANGE:
4663 case OP_CRMINRANGE:
/* Explicit {min,max}: both limits follow the opcode as IMM2 values. */
4664 Lmin = GET2(Fecode, 1);
4665 Lmax = GET2(Fecode, 1 + IMM2_SIZE);
4666 reptype = rep_typ[*Fecode - OP_CRSTAR];
4667 if (Lmax == 0) Lmax = UINT32_MAX; /* Max 0 => infinity */
4668 Fecode += 1 + 2 * IMM2_SIZE;
4669 break;
4670
4671 default: /* No repeat follows */
4672 {
/* A zero result from match_ref() is treated as success, with the matched
length left in 'length'. Any non-zero result fails this path; a positive one
is additionally treated as having run into the end of the subject, so Feptr
is moved there before the partial-match check. */
4673 rrc = match_ref(Loffset, Lcaseless, F, mb, &length);
4674 if (rrc != 0)
4675 {
4676 if (rrc > 0) Feptr = mb->end_subject; /* Partial match */
4677 CHECK_PARTIAL();
4678 RRETURN(MATCH_NOMATCH);
4679 }
4680 }
4681 Feptr += length;
4682 continue; /* With the main loop */
4683 }
4684
4685 /* Handle repeated back references. If a set group has length zero, just
4686 continue with the main loop, because it matches however many times. For an
4687 unset reference, if the minimum is zero, we can also just continue. We can
4688 also continue if PCRE2_MATCH_UNSET_BACKREF is set, because this makes an unset
4689 group behave as a zero-length group. For any other unset cases, carrying
4690 on will result in NOMATCH. */
4691
4692 if (Loffset < Foffset_top && Fovector[Loffset] != PCRE2_UNSET)
4693 {
/* Equal start and end offsets mean the group captured an empty string. */
4694 if (Fovector[Loffset] == Fovector[Loffset + 1]) continue;
4695 }
4696 else /* Group is not set */
4697 {
4698 if (Lmin == 0 || (mb->poptions & PCRE2_MATCH_UNSET_BACKREF) != 0)
4699 continue;
4700 }
4701
4702 /* First, ensure the minimum number of matches are present. */
4703
4704 for (i = 1; i <= Lmin; i++)
4705 {
4706 PCRE2_SIZE slength;
4707 rrc = match_ref(Loffset, Lcaseless, F, mb, &slength);
4708 if (rrc != 0)
4709 {
4710 if (rrc > 0) Feptr = mb->end_subject; /* Partial match */
4711 CHECK_PARTIAL();
4712 RRETURN(MATCH_NOMATCH);
4713 }
4714 Feptr += slength;
4715 }
4716
4717 /* If min = max, we are done. They are not both allowed to be zero. */
4718
4719 if (Lmin == Lmax) continue;
4720
4721 /* If minimizing, keep trying and advancing the pointer. */
4722
4723 if (reptype == REPTYPE_MIN)
4724 {
4725 for (;;)
4726 {
4727 PCRE2_SIZE slength;
/* Try the rest of the pattern first; only if that fails is one more copy of
the referenced text consumed, up to Lmax copies in total. */
4728 RMATCH(Fecode, RM20);
4729 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4730 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
4731 rrc = match_ref(Loffset, Lcaseless, F, mb, &slength);
4732 if (rrc != 0)
4733 {
4734 if (rrc > 0) Feptr = mb->end_subject; /* Partial match */
4735 CHECK_PARTIAL();
4736 RRETURN(MATCH_NOMATCH);
4737 }
4738 Feptr += slength;
4739 }
4740 /* Control never gets here */
4741 }
4742
4743 /* If maximizing, find the longest string and work backwards, as long as
4744 the matched lengths for each iteration are the same. */
4745
4746 else
4747 {
4748 BOOL samelengths = TRUE;
4749 Lstart = Feptr; /* Starting position */
/* Length of the captured text, taken from the group's offset pair. It is
non-zero here because empty and unset groups were dealt with above. */
4750 Flength = Fovector[Loffset+1] - Fovector[Loffset];
4751
4752 for (i = Lmin; i < Lmax; i++)
4753 {
4754 PCRE2_SIZE slength;
4755 rrc = match_ref(Loffset, Lcaseless, F, mb, &slength);
4756 if (rrc != 0)
4757 {
4758 /* Can't use CHECK_PARTIAL because we don't want to update Feptr in
4759 the soft partial matching case. */
4760
4761 if (rrc > 0 && mb->partial != 0 &&
4762 mb->end_subject > mb->start_used_ptr)
4763 {
4764 mb->hitend = TRUE;
4765 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
4766 }
4767 break;
4768 }
4769
/* Remember whether any repetition consumed a different number of code units
from the captured text; this selects the backtracking method below. */
4770 if (slength != Flength) samelengths = FALSE;
4771 Feptr += slength;
4772 }
4773
4774 /* If the length matched for each repetition is the same as the length of
4775 the captured group, we can easily work backwards. This is the normal
4776 case. However, in caseless UTF-8 mode there are pairs of case-equivalent
4777 characters whose lengths (in terms of code units) differ. Because this
4778 is very rare, we handle it by re-matching fewer and fewer times. */
4779
4780 if (samelengths)
4781 {
/* NOTE(review): the last decrement leaves Feptr one group length below
Lstart; that value is only compared with Lstart in the loop condition before
the loop exits. */
4782 while (Feptr >= Lstart)
4783 {
4784 RMATCH(Fecode, RM21);
4785 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4786 Feptr -= Flength;
4787 }
4788 }
4789
4790 /* The rare case of non-matching lengths. Re-scan the repetition for each
4791 iteration. We know that match_ref() will succeed every time. */
4792
4793 else
4794 {
/* i is the loop counter value at which the maximizing scan above stopped;
Lmax is reused as the upper bound for each shorter re-scan. */
4795 Lmax = i;
4796 for (;;)
4797 {
4798 RMATCH(Fecode, RM22);
4799 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4800 if (Feptr == Lstart) break; /* Failed after minimal repetition */
/* Go back to the start and re-match one repetition fewer than last time. */
4801 Feptr = Lstart;
4802 Lmax--;
4803 for (i = Lmin; i < Lmax; i++)
4804 {
4805 PCRE2_SIZE slength;
4806 (void)match_ref(Loffset, Lcaseless, F, mb, &slength);
4807 Feptr += slength;
4808 }
4809 }
4810 }
4811
4812 RRETURN(MATCH_NOMATCH);
4813 }
4814 /* Control never gets here */
4815
4816 #undef Lcaseless
4817 #undef Lmin
4818 #undef Lmax
4819 #undef Lstart
4820 #undef Loffset
4821
4822
4823
4824 /* ========================================================================= */
4825 /* Opcodes for the start of various parenthesized items */
4826 /* ========================================================================= */
4827
4828 /* In all cases, if the result of RMATCH() is MATCH_THEN, check whether the
4829 (*THEN) is within the current branch by comparing the address of OP_THEN
4830 that is passed back with the end of the branch. If (*THEN) is within the
4831 current branch, and the branch is one of two or more alternatives (it
4832 either starts or ends with OP_ALT), we have reached the limit of THEN's
4833 action, so convert the return code to NOMATCH, which will cause normal
4834 backtracking to happen from now on. Otherwise, THEN is passed back to an
4835 outer alternative. This implements Perl's treatment of parenthesized
4836 groups, where a group not containing | does not affect the current
4837 alternative, that is, (X) is NOT the same as (X|(*F)). */
4838
4839
4840 /* ===================================================================== */
4841 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a non-possessive
4842 bracket group, indicating that it may occur zero times. It may repeat
4843 infinitely, or not at all - i.e. it could be ()* or ()? or even (){0} in
4844 the pattern. Brackets with fixed upper repeat limits are compiled as a
4845 number of copies, with the optional ones preceded by BRAZERO or BRAMINZERO.
4846 Possessive groups with possible zero repeats are preceded by BRAPOSZERO. */
4847
/* Lnext_ecode is a code pointer held in the frame's first temporary pointer
field while the group following OP_BRAZERO or OP_BRAMINZERO is located. */
4848 #define Lnext_ecode F->temp_sptr[0]
4849
/* Greedy form: first try matching from the group itself, which starts at the
opcode after OP_BRAZERO. Only when that yields MATCH_NOMATCH is the group
stepped over and matching resumed with whatever follows it. */
4850 case OP_BRAZERO:
4851 Lnext_ecode = Fecode + 1;
4852 RMATCH(Lnext_ecode, RM9);
4853 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
/* Follow the link stored at each branch start until the opcode reached is no
longer OP_ALT, then step past that final opcode and its link field. */
4854 do Lnext_ecode += GET(Lnext_ecode, 1); while (*Lnext_ecode == OP_ALT);
4855 Fecode = Lnext_ecode + 1 + LINK_SIZE;
4856 break;
4857
/* Non-greedy form: locate the end of the group first and try matching the
code that follows it, i.e. zero repetitions. Only when that yields
MATCH_NOMATCH is the group itself entered, by moving on to the next opcode. */
4858 case OP_BRAMINZERO:
4859 Lnext_ecode = Fecode + 1;
4860 do Lnext_ecode += GET(Lnext_ecode, 1); while (*Lnext_ecode == OP_ALT);
4861 RMATCH(Lnext_ecode + 1 + LINK_SIZE, RM10);
4862 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4863 Fecode++;
4864 break;
4865
4866 #undef Lnext_ecode
4867
/* OP_SKIPZERO steps over the group that follows it without attempting a
match: move to the group's first opcode, follow the branch links until the
opcode reached is no longer OP_ALT, then step past that final opcode and its
link field. No backtracking point is recorded. */
4868 case OP_SKIPZERO:
4869 Fecode++;
4870 do Fecode += GET(Fecode,1); while (*Fecode == OP_ALT);
4871 Fecode += 1 + LINK_SIZE;
4872 break;
4873
4874
4875 /* ===================================================================== */
4876 /* Handle possessive brackets with an unlimited repeat. The end of these
4877 brackets will always be OP_KETRPOS, which returns MATCH_KETRPOS without
4878 going further in the pattern. */
4879
/* Frame-resident temporaries for a possessive group: Lframe_type is the
group frame type to use for every iteration, Lmatched_once records whether
any iteration has succeeded, Lzero_allowed whether zero iterations are
acceptable, Lstart_eptr the subject position before the current iteration,
and Lstart_group the code address of the start of the group. */
4880 #define Lframe_type F->temp_32[0]
4881 #define Lmatched_once F->temp_32[1]
4882 #define Lzero_allowed F->temp_32[2]
4883 #define Lstart_eptr F->temp_sptr[0]
4884 #define Lstart_group F->temp_sptr[1]
4885
/* OP_BRAPOSZERO is a prefix: note that zero repeats are acceptable, step on
to the bracket opcode itself, and join the capturing or non-capturing path
after the point where that path would clear Lzero_allowed. */
4886 case OP_BRAPOSZERO:
4887 Lzero_allowed = TRUE; /* Zero repeat is allowed */
4888 Fecode += 1;
4889 if (*Fecode == OP_CBRAPOS || *Fecode == OP_SCBRAPOS)
4890 goto POSSESSIVE_CAPTURE;
4891 goto POSSESSIVE_NON_CAPTURE;
4892
4893 case OP_BRAPOS:
4894 case OP_SBRAPOS:
4895 Lzero_allowed = FALSE; /* Zero repeat not allowed */
4896
4897 POSSESSIVE_NON_CAPTURE:
4898 Lframe_type = GF_NOCAPTURE; /* Remembered frame type */
4899 goto POSSESSIVE_GROUP;
4900
4901 case OP_CBRAPOS:
4902 case OP_SCBRAPOS:
4903 Lzero_allowed = FALSE; /* Zero repeat not allowed */
4904
4905 POSSESSIVE_CAPTURE:
/* The capture group number is stored after the opcode's link field; it is
combined with GF_CAPTURE to form the frame type. */
4906 number = GET2(Fecode, 1+LINK_SIZE);
4907 Lframe_type = GF_CAPTURE | number; /* Remembered frame type */
4908
4909 POSSESSIVE_GROUP:
4910 Lmatched_once = FALSE; /* Never matched */
4911 Lstart_group = Fecode; /* Start of this group */
4912
/* Each pass tries one branch of the group. A MATCH_KETRPOS result means the
branch matched through to the end of the group, so the whole group is tried
again from its first branch; otherwise the next alternative, if any, is
tried. */
4913 for (;;)
4914 {
4915 Lstart_eptr = Feptr; /* Position at group start */
4916 group_frame_type = Lframe_type;
/* Start matching at the first opcode after the bracket (or OP_ALT) opcode
and its arguments. */
4917 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM8);
4918 if (rrc == MATCH_KETRPOS)
4919 {
4920 Lmatched_once = TRUE; /* Matched at least once */
/* NOTE(review): Feptr is compared with the position saved before the
attempt, so it has presumably been advanced by the OP_KETRPOS handling -
confirm there. An iteration that consumed nothing ends the repetition. */
4921 if (Feptr == Lstart_eptr) /* Empty match; skip to end */
4922 {
4923 do Fecode += GET(Fecode, 1); while (*Fecode == OP_ALT);
4924 break;
4925 }
4926
4927 Fecode = Lstart_group;
4928 continue;
4929 }
4930
4931 /* See comment above about handling THEN. */
4932
4933 if (rrc == MATCH_THEN)
4934 {
/* A THEN that lies before the end of this branch is converted to NOMATCH
when the branch is one of several alternatives. */
4935 PCRE2_SPTR next_ecode = Fecode + GET(Fecode,1);
4936 if (mb->verb_ecode_ptr < next_ecode &&
4937 (*Fecode == OP_ALT || *next_ecode == OP_ALT))
4938 rrc = MATCH_NOMATCH;
4939 }
4940
4941 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
/* This branch failed: move to the next one, or leave the loop if the opcode
reached is not OP_ALT (no more alternatives). */
4942 Fecode += GET(Fecode, 1);
4943 if (*Fecode != OP_ALT) break;
4944 }
4945
4946 /* Success if matched something or zero repeat allowed */
4947
4948 if (Lmatched_once || Lzero_allowed)
4949 {
/* Fecode is at the group's closing opcode here; step past it and its link
field and carry on with the rest of the pattern. */
4950 Fecode += 1 + LINK_SIZE;
4951 break;
4952 }
4953
4954 RRETURN(MATCH_NOMATCH);
4955
4956 #undef Lmatched_once
4957 #undef Lzero_allowed
4958 #undef Lframe_type
4959 #undef Lstart_eptr
4960 #undef Lstart_group
4961
4962
4963 /* ===================================================================== */
4964 /* Handle non-capturing brackets that cannot match an empty string. When we
4965 get to the final alternative within the brackets, as long as there are no
4966 THEN's in the pattern, we can optimize by not recording a new backtracking
4967 point. (Ideally we should test for a THEN within this group, but we don't
4968 have that information.) Don't do this if we are at the very top level,
4969 however, because that would make handling assertions and once-only brackets
4970 messier when there is nothing to go back to. */
4971
4972 #define Lframe_type F->temp_32[0] /* Set for all that use GROUPLOOP */
4973 #define Lnext_branch F->temp_sptr[0] /* Used only in OP_BRA handling */
4974
4975 case OP_BRA:
4976 if (mb->hasthen || Frdepth == 0)
4977 {
4978 Lframe_type = 0;
4979 goto GROUPLOOP;
4980 }
4981
4982 for (;;)
4983 {
4984 Lnext_branch = Fecode + GET(Fecode, 1);
4985 if (*Lnext_branch != OP_ALT) break;
4986
4987 /* This is never the final branch. We do not need to test for MATCH_THEN
4988 here because this code is not used when there is a THEN in the pattern. */
4989
4990 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM1);
4991 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4992 Fecode = Lnext_branch;
4993 }
4994
4995 /* Hit the start of the final branch. Continue at this level. */
4996
4997 Fecode += PRIV(OP_lengths)[*Fecode];
4998 break;
4999
5000 #undef Lnext_branch
5001
5002
5003 /* ===================================================================== */
5004 /* Handle a capturing bracket, other than those that are possessive with an
5005 unlimited repeat. */
5006
5007 case OP_CBRA:
5008 case OP_SCBRA:
5009 Lframe_type = GF_CAPTURE | GET2(Fecode, 1+LINK_SIZE);
5010 goto GROUPLOOP;
5011
5012
5013 /* ===================================================================== */
5014 /* Atomic groups and non-capturing brackets that can match an empty string
5015 must record a backtracking point and also set up a chained frame. */
5016
5017 case OP_ONCE:
5018 case OP_SCRIPT_RUN:
5019 case OP_SBRA:
5020 Lframe_type = GF_NOCAPTURE | Fop;
5021
5022 GROUPLOOP:
5023 for (;;)
5024 {
5025 group_frame_type = Lframe_type;
5026 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM2);
5027 if (rrc == MATCH_THEN)
5028 {
5029 PCRE2_SPTR next_ecode = Fecode + GET(Fecode,1);
5030 if (mb->verb_ecode_ptr < next_ecode &&
5031 (*Fecode == OP_ALT || *next_ecode == OP_ALT))
5032 rrc = MATCH_NOMATCH;
5033 }
5034 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5035 Fecode += GET(Fecode, 1);
5036 if (*Fecode != OP_ALT) RRETURN(MATCH_NOMATCH);
5037 }
5038 /* Control never reaches here. */
5039
5040 #undef Lframe_type
5041
5042
5043 /* ===================================================================== */
5044 /* Recursion either matches the current regex, or some subexpression. The
5045 offset data is the offset to the starting bracket from the start of the
5046 whole pattern. (This is so that it works from duplicated subpatterns.) */
5047
5048 #define Lframe_type F->temp_32[0]
5049 #define Lstart_branch F->temp_sptr[0]
5050
5051 case OP_RECURSE:
5052 bracode = mb->start_code + GET(Fecode, 1);
5053 number = (bracode == mb->start_code)? 0 : GET2(bracode, 1 + LINK_SIZE);
5054
5055 /* If we are already in a recursion, check for repeating the same one
5056 without advancing the subject pointer. This should catch convoluted mutual
5057 recursions. (Some simple cases are caught at compile time.) */
5058
5059 if (Fcurrent_recurse != RECURSE_UNSET)
5060 {
5061 offset = Flast_group_offset;
5062 while (offset != PCRE2_UNSET)
5063 {
5064 N = (heapframe *)((char *)mb->match_frames + offset);
5065 P = (heapframe *)((char *)N - frame_size);
5066 if (N->group_frame_type == (GF_RECURSE | number))
5067 {
5068 if (Feptr == P->eptr) return PCRE2_ERROR_RECURSELOOP;
5069 break;
5070 }
5071 offset = P->last_group_offset;
5072 }
5073 }
5074
5075 /* Now run the recursion, branch by branch. */
5076
5077 Lstart_branch = bracode;
5078 Lframe_type = GF_RECURSE | number;
5079
5080 for (;;)
5081 {
5082 PCRE2_SPTR next_ecode;
5083
5084 group_frame_type = Lframe_type;
5085 RMATCH(Lstart_branch + PRIV(OP_lengths)[*Lstart_branch], RM11);
5086 next_ecode = Lstart_branch + GET(Lstart_branch,1);
5087
5088 /* Handle backtracking verbs, which are defined in a range that can
5089 easily be tested for. PCRE does not allow THEN, SKIP, PRUNE or COMMIT to
5090 escape beyond a recursion; they cause a NOMATCH for the entire recursion.
5091
5092 When one of these verbs triggers, the current recursion group number is
5093 recorded. If it matches the recursion we are processing, the verb
5094 happened within the recursion and we must deal with it. Otherwise it must
5095 have happened after the recursion completed, and so has to be passed
5096 back. See comment above about handling THEN. */
5097
5098 if (rrc >= MATCH_BACKTRACK_MIN && rrc <= MATCH_BACKTRACK_MAX &&
5099 mb->verb_current_recurse == (Lframe_type ^ GF_RECURSE))
5100 {
5101 if (rrc == MATCH_THEN && mb->verb_ecode_ptr < next_ecode &&
5102 (*Lstart_branch == OP_ALT || *next_ecode == OP_ALT))
5103 rrc = MATCH_NOMATCH;
5104 else RRETURN(MATCH_NOMATCH);
5105 }
5106
5107 /* Note that carrying on after (*ACCEPT) in a recursion is handled in the
5108 OP_ACCEPT code. Nothing needs to be done here. */
5109
5110 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5111 Lstart_branch = next_ecode;
5112 if (*Lstart_branch != OP_ALT) RRETURN(MATCH_NOMATCH);
5113 }
5114 /* Control never reaches here. */
5115
5116 #undef Lframe_type
5117 #undef Lstart_branch
5118
5119
5120 /* ===================================================================== */
5121 /* Positive assertions are like other groups except that PCRE doesn't allow
5122 the effect of (*THEN) to escape beyond an assertion; it is therefore
5123 treated as NOMATCH. (*ACCEPT) is treated as a successful assertion, with its
5124 captures and mark retained. Any other return is an error. */
5125
5126 #define Lframe_type F->temp_32[0]
5127
5128 case OP_ASSERT:
5129 case OP_ASSERTBACK:
5130 Lframe_type = GF_NOCAPTURE | Fop;
5131 for (;;)
5132 {
5133 group_frame_type = Lframe_type;
5134 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM3);
5135 if (rrc == MATCH_ACCEPT)
5136 {
5137 memcpy(Fovector,
5138 (char *)assert_accept_frame + offsetof(heapframe, ovector),
5139 assert_accept_frame->offset_top * sizeof(PCRE2_SIZE));
5140 Foffset_top = assert_accept_frame->offset_top;
5141 Fmark = assert_accept_frame->mark;
5142 break;
5143 }
5144 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
5145 Fecode += GET(Fecode, 1);
5146 if (*Fecode != OP_ALT) RRETURN(MATCH_NOMATCH);
5147 }
5148
5149 do Fecode += GET(Fecode, 1); while (*Fecode == OP_ALT);
5150 Fecode += 1 + LINK_SIZE;
5151 break;
5152
5153 #undef Lframe_type
5154
5155
5156 /* ===================================================================== */
5157 /* Handle negative assertions. Loop for each non-matching branch as for
5158 positive assertions. */
5159
5160 #define Lframe_type F->temp_32[0]
5161
5162 case OP_ASSERT_NOT:
5163 case OP_ASSERTBACK_NOT:
5164 Lframe_type = GF_NOCAPTURE | Fop;
5165
5166 for (;;)
5167 {
5168 group_frame_type = Lframe_type;
5169 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM4);
5170 switch(rrc)
5171 {
5172 case MATCH_ACCEPT: /* Assertion matched, therefore it fails. */
5173 case MATCH_MATCH:
5174 RRETURN (MATCH_NOMATCH);
5175
5176 case MATCH_NOMATCH: /* Branch failed, try next if present. */
5177 case MATCH_THEN:
5178 Fecode += GET(Fecode, 1);
5179 if (*Fecode != OP_ALT) goto ASSERT_NOT_FAILED;
5180 break;
5181
5182 case MATCH_COMMIT: /* Assertion forced to fail, therefore continue. */
5183 case MATCH_SKIP:
5184 case MATCH_PRUNE:
5185 do Fecode += GET(Fecode, 1); while (*Fecode == OP_ALT);
5186 goto ASSERT_NOT_FAILED;
5187
5188 default: /* Pass back any other return */
5189 RRETURN(rrc);
5190 }
5191 }
5192
5193 /* None of the branches have matched or there was a backtrack to (*COMMIT),
5194 (*SKIP), (*PRUNE), or (*THEN) in the last branch. This is success for a
5195 negative assertion, so carry on. */
5196
5197 ASSERT_NOT_FAILED:
5198 Fecode += 1 + LINK_SIZE;
5199 break;
5200
5201 #undef Lframe_type
5202
5203
5204 /* ===================================================================== */
5205 /* The callout item calls an external function, if one is provided, passing
5206 details of the match so far. This is mainly for debugging, though the
5207 function is able to force a failure. */
5208
5209 case OP_CALLOUT:
5210 case OP_CALLOUT_STR:
5211 rrc = do_callout(F, mb, &length);
5212 if (rrc > 0) RRETURN(MATCH_NOMATCH);
5213 if (rrc < 0) RRETURN(rrc);
5214 Fecode += length;
5215 break;
5216
5217
5218 /* ===================================================================== */
5219 /* Conditional group: compilation checked that there are no more than two
5220 branches. If the condition is false, skipping the first branch takes us
5221 past the end of the item if there is only one branch, but that's exactly
5222 what we want. */
5223
5224 case OP_COND:
5225 case OP_SCOND:
5226
5227 /* The variable Flength will be added to Fecode when the condition is
5228 false, to get to the second branch. Setting it to the offset to the ALT or
5229 KET, then incrementing Fecode achieves this effect. However, if the second
5230 branch is non-existent, we must point to the KET so that the end of the
5231 group is correctly processed. We now have Fecode pointing to the condition
5232 or callout. */
5233
5234 Flength = GET(Fecode, 1); /* Offset to the second branch */
5235 if (Fecode[Flength] != OP_ALT) Flength -= 1 + LINK_SIZE;
5236 Fecode += 1 + LINK_SIZE; /* From this opcode */
5237
5238 /* Because of the way auto-callout works during compile, a callout item is
5239 inserted between OP_COND and an assertion condition. Such a callout can
5240 also be inserted manually. */
5241
5242 if (*Fecode == OP_CALLOUT || *Fecode == OP_CALLOUT_STR)
5243 {
5244 rrc = do_callout(F, mb, &length);
5245 if (rrc > 0) RRETURN(MATCH_NOMATCH);
5246 if (rrc < 0) RRETURN(rrc);
5247
5248 /* Advance Fecode past the callout, so it now points to the condition. We
5249 must adjust Flength so that the value of Fecode+Flength is unchanged. */
5250
5251 Fecode += length;
5252 Flength -= length;
5253 }
5254
5255 /* Test the various possible conditions */
5256
5257 condition = FALSE;
5258 switch(*Fecode)
5259 {
5260 case OP_RREF: /* Group recursion test */
5261 if (Fcurrent_recurse != RECURSE_UNSET)
5262 {
5263 number = GET2(Fecode, 1);
5264 condition = (number == RREF_ANY || number == Fcurrent_recurse);
5265 }
5266 break;
5267
5268 case OP_DNRREF: /* Duplicate named group recursion test */
5269 if (Fcurrent_recurse != RECURSE_UNSET)
5270 {
5271 int count = GET2(Fecode, 1 + IMM2_SIZE);
5272 PCRE2_SPTR slot = mb->name_table + GET2(Fecode, 1) * mb->name_entry_size;
5273 while (count-- > 0)
5274 {
5275 number = GET2(slot, 0);
5276 condition = number == Fcurrent_recurse;
5277 if (condition) break;
5278 slot += mb->name_entry_size;
5279 }
5280 }
5281 break;
5282
5283 case OP_CREF: /* Numbered group used test */
5284 offset = (GET2(Fecode, 1) << 1) - 2; /* Doubled ref number */
5285 condition = offset < Foffset_top && Fovector[offset] != PCRE2_UNSET;
5286 break;
5287
5288 case OP_DNCREF: /* Duplicate named group used test */
5289 {
5290 int count = GET2(Fecode, 1 + IMM2_SIZE);
5291 PCRE2_SPTR slot = mb->name_table + GET2(Fecode, 1) * mb->name_entry_size;
5292 while (count-- > 0)
5293 {
5294 offset = (GET2(slot, 0) << 1) - 2;
5295 condition = offset < Foffset_top && Fovector[offset] != PCRE2_UNSET;
5296 if (condition) break;
5297 slot += mb->name_entry_size;
5298 }
5299 }
5300 break;
5301
5302 case OP_FALSE:
5303 case OP_FAIL: /* The assertion (?!) becomes OP_FAIL */
5304 break;
5305
5306 case OP_TRUE:
5307 condition = TRUE;
5308 break;
5309
5310 /* The condition is an assertion. Run code similar to the assertion code
5311 above. */
5312
5313 #define Lpositive F->temp_32[0]
5314 #define Lstart_branch F->temp_sptr[0]
5315
5316 default:
5317 Lpositive = (*Fecode == OP_ASSERT || *Fecode == OP_ASSERTBACK);
5318 Lstart_branch = Fecode;
5319
5320 for (;;)
5321 {
5322 group_frame_type = GF_CONDASSERT | *Fecode;
5323 RMATCH(Lstart_branch + PRIV(OP_lengths)[*Lstart_branch], RM5);
5324
5325 switch(rrc)
5326 {
5327 case MATCH_ACCEPT: /* Save captures */
5328 memcpy(Fovector,
5329 (char *)assert_accept_frame + offsetof(heapframe, ovector),
5330 assert_accept_frame->offset_top * sizeof(PCRE2_SIZE));
5331 Foffset_top = assert_accept_frame->offset_top;
5332
5333 /* Fall through */
5334 /* In the case of a match, the captures have already been put into
5335 the current frame. */
5336
5337 case MATCH_MATCH:
5338 condition = Lpositive; /* TRUE for positive assertion */
5339 break;
5340
5341 /* PCRE doesn't allow the effect of (*THEN) to escape beyond an
5342 assertion; it is therefore always treated as NOMATCH. */
5343
5344 case MATCH_NOMATCH:
5345 case MATCH_THEN:
5346 Lstart_branch += GET(Lstart_branch, 1);
5347 if (*Lstart_branch == OP_ALT) continue; /* Try next branch */
5348 condition = !Lpositive; /* TRUE for negative assertion */
5349 break;
5350
5351 /* These force no match without checking other branches. */
5352
5353 case MATCH_COMMIT:
5354 case MATCH_SKIP:
5355 case MATCH_PRUNE:
5356 condition = !Lpositive;
5357 break;
5358
5359 default:
5360 RRETURN(rrc);
5361 }
5362 break; /* Out of the branch loop */
5363 }
5364
5365 /* If the condition is true, find the end of the assertion so that
5366 advancing past it gets us to the start of the first branch. */
5367
5368 if (condition)
5369 {
5370 do Fecode += GET(Fecode, 1); while (*Fecode == OP_ALT);
5371 }
5372 break; /* End of assertion condition */
5373 }
5374
5375 #undef Lpositive
5376 #undef Lstart_branch
5377
5378 /* Choose branch according to the condition. */
5379
5380 Fecode += condition? PRIV(OP_lengths)[*Fecode] : Flength;
5381
5382 /* If the opcode is OP_SCOND it means we are at a repeated conditional
5383 group that might match an empty string. We must therefore descend a level
5384 so that the start is remembered for checking. For OP_COND we can just
5385 continue at this level. */
5386
5387 if (Fop == OP_SCOND)
5388 {
5389 group_frame_type = GF_NOCAPTURE | Fop;
5390 RMATCH(Fecode, RM35);
5391 RRETURN(rrc);
5392 }
5393 break;
5394
5395
5396
5397 /* ========================================================================= */
5398 /* End of start of parenthesis opcodes */
5399 /* ========================================================================= */
5400
5401
5402 /* ===================================================================== */
5403 /* Move the subject pointer back. This occurs only at the start of each
5404 branch of a lookbehind assertion. If we are too close to the start to move
5405 back, fail. When working with UTF-8 we move back a number of characters,
5406 not bytes. */
5407
5408 case OP_REVERSE:
5409 number = GET(Fecode, 1);
5410 #ifdef SUPPORT_UNICODE
5411 if (utf)
5412 {
5413 while (number-- > 0)
5414 {
5415 if (Feptr <= mb->start_subject) RRETURN(MATCH_NOMATCH);
5416 Feptr--;
5417 BACKCHAR(Feptr);
5418 }
5419 }
5420 else
5421 #endif
5422
5423 /* No UTF support, or not in UTF mode: count is code unit count */
5424
5425 {
5426 if ((ptrdiff_t)number > Feptr - mb->start_subject) RRETURN(MATCH_NOMATCH);
5427 Feptr -= number;
5428 }
5429
5430 /* Save the earliest consulted character, then skip to next opcode */
5431
5432 if (Feptr < mb->start_used_ptr) mb->start_used_ptr = Feptr;
5433 Fecode += 1 + LINK_SIZE;
5434 break;
5435
5436
5437 /* ===================================================================== */
5438 /* An alternation is the end of a branch; scan along to find the end of the
5439 bracketed group. */
5440
5441 case OP_ALT:
5442 do Fecode += GET(Fecode,1); while (*Fecode == OP_ALT);
5443 break;
5444
5445
5446 /* ===================================================================== */
5447 /* The end of a parenthesized group. For all but OP_BRA and OP_COND, the
5448 starting frame was added to the chained frames in order to remember the
5449 starting subject position for the group. */
5450
5451 case OP_KET:
5452 case OP_KETRMIN:
5453 case OP_KETRMAX:
5454 case OP_KETRPOS:
5455
5456 bracode = Fecode - GET(Fecode, 1);
5457
5458 /* Point N to the frame at the start of the most recent group.
5459 Remember the subject pointer at the start of the group. */
5460
5461 if (*bracode != OP_BRA && *bracode != OP_COND)
5462 {
5463 N = (heapframe *)((char *)mb->match_frames + Flast_group_offset);
5464 P = (heapframe *)((char *)N - frame_size);
5465 Flast_group_offset = P->last_group_offset;
5466
5467 #ifdef DEBUG_SHOW_RMATCH
5468 fprintf(stderr, "++ KET for frame=%d type=%x prev char offset=%lu\n",
5469 N->rdepth, N->group_frame_type,
5470 (char *)P->eptr - (char *)mb->start_subject);
5471 #endif
5472
5473 /* If we are at the end of an assertion that is a condition, return a
5474 match, discarding any intermediate backtracking points. Copy back the
5475 captures into the frame before N so that they are set on return. Doing
5476 this for all assertions, both positive and negative, seems to match what
5477 Perl does. */
5478
5479 if (GF_IDMASK(N->group_frame_type) == GF_CONDASSERT)
5480 {
5481 memcpy((char *)P + offsetof(heapframe, ovector), Fovector,
5482 Foffset_top * sizeof(PCRE2_SIZE));
5483 P->offset_top = Foffset_top;
5484 Fback_frame = (char *)F - (char *)P;
5485 RRETURN(MATCH_MATCH);
5486 }
5487 }
5488 else P = NULL; /* Indicates starting frame not recorded */
5489
5490 /* The group was not a conditional assertion. */
5491
5492 switch (*bracode)
5493 {
5494 case OP_BRA: /* No need to do anything for these */
5495 case OP_COND:
5496 case OP_SCOND:
5497 break;
5498
5499 /* Positive assertions are like OP_ONCE, except that in addition the
5500 subject pointer must be put back to where it was at the start of the
5501 assertion. */
5502
5503 case OP_ASSERT:
5504 case OP_ASSERTBACK:
5505 if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr;
5506 Feptr = P->eptr;
5507 /* Fall through */
5508
5509 /* For an atomic group, discard internal backtracking points. We must
5510 also ensure that any remaining branches within the top-level of the group
5511 are not tried. Do this by adjusting the code pointer within the backtrack
5512 frame so that it points to the final branch. */
5513
5514 case OP_ONCE:
5515 Fback_frame = ((char *)F - (char *)P);
5516 for (;;)
5517 {
5518 uint32_t y = GET(P->ecode,1);
5519 if ((P->ecode)[y] != OP_ALT) break;
5520 P->ecode += y;
5521 }
5522 break;
5523
5524 /* A matching negative assertion returns MATCH, which is turned into
5525 NOMATCH at the assertion level. */
5526
5527 case OP_ASSERT_NOT:
5528 case OP_ASSERTBACK_NOT:
5529 RRETURN(MATCH_MATCH);
5530
5531 /* At the end of a script run, apply the script-checking rules. This code
5532 will never be exercised if Unicode support is not compiled, because in
5533 that environment script runs cause an error at compile time. */
5534
5535 case OP_SCRIPT_RUN:
5536 if (!PRIV(script_run)(P->eptr, Feptr, utf)) RRETURN(MATCH_NOMATCH);
5537 break;
5538
5539 /* Whole-pattern recursion is coded as a recurse into group 0, so it
5540 won't be picked up here. Instead, we catch it when the OP_END is reached.
5541 Other recursion is handled here. */
5542
5543 case OP_CBRA:
5544 case OP_CBRAPOS:
5545 case OP_SCBRA:
5546 case OP_SCBRAPOS:
5547 number = GET2(bracode, 1+LINK_SIZE);
5548
5549 /* Handle a recursively called group. We reinstate the previous set of
5550 captures and then carry on after the recursion call. */
5551
5552 if (Fcurrent_recurse == number)
5553 {
5554 P = (heapframe *)((char *)N - frame_size);
5555 memcpy((char *)F + offsetof(heapframe, ovector), P->ovector,
5556 P->offset_top * sizeof(PCRE2_SIZE));
5557 Foffset_top = P->offset_top;
5558 Fcapture_last = P->capture_last;
5559 Fcurrent_recurse = P->current_recurse;
5560 Fecode = P->ecode + 1 + LINK_SIZE;
5561 continue; /* With next opcode */
5562 }
5563
5564 /* Deal with actual capturing. */
5565
5566 offset = (number << 1) - 2;
5567 Fcapture_last = number;
5568 Fovector[offset] = P->eptr - mb->start_subject;
5569 Fovector[offset+1] = Feptr - mb->start_subject;
5570 if (offset >= Foffset_top) Foffset_top = offset + 2;
5571 break;
5572 } /* End actions relating to the starting opcode */
5573
5574 /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
5575 and return MATCH_KETRPOS. This makes it possible to do the repeats one
5576 at a time from the outer level. This must precede the empty string test -
5577 in this case that test is done at the outer level. */
5578
5579 if (*Fecode == OP_KETRPOS)
5580 {
5581 memcpy((char *)P + offsetof(heapframe, eptr),
5582 (char *)F + offsetof(heapframe, eptr),
5583 frame_copy_size);
5584 RRETURN(MATCH_KETRPOS);
5585 }
5586
5587 /* Handle the different kinds of closing brackets. A non-repeating ket
5588 needs no special action, just continuing at this level. This also happens
5589 for the repeating kets if the group matched no characters, in order to
5590 forcibly break infinite loops. Otherwise, the repeating kets try the rest
5591 of the pattern or restart from the preceding bracket, in the appropriate
5592 order. */
5593
5594 if (Fop != OP_KET && (P == NULL || Feptr != P->eptr))
5595 {
5596 if (Fop == OP_KETRMIN)
5597 {
5598 RMATCH(Fecode + 1 + LINK_SIZE, RM6);
5599 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5600 Fecode -= GET(Fecode, 1);
5601 break; /* End of ket processing */
5602 }
5603
5604 /* Repeat the maximum number of times (KETRMAX) */
5605
5606 RMATCH(bracode, RM7);
5607 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5608 }
5609
5610 /* Carry on at this level for a non-repeating ket, or after matching an
5611 empty string, or after repeating for a maximum number of times. */
5612
5613 Fecode += 1 + LINK_SIZE;
5614 break;
5615
5616
5617 /* ===================================================================== */
5618 /* Start and end of line assertions, not multiline mode. */
5619
5620 case OP_CIRC: /* Start of line, unless PCRE2_NOTBOL is set. */
5621 if (Feptr != mb->start_subject || (mb->moptions & PCRE2_NOTBOL) != 0)
5622 RRETURN(MATCH_NOMATCH);
5623 Fecode++;
5624 break;
5625
5626 case OP_SOD: /* Unconditional start of subject */
5627 if (Feptr != mb->start_subject) RRETURN(MATCH_NOMATCH);
5628 Fecode++;
5629 break;
5630
5631 /* When PCRE2_NOTEOL is unset, assert before the subject end, or a
5632 terminating newline unless PCRE2_DOLLAR_ENDONLY is set. */
5633
5634 case OP_DOLL:
5635 if ((mb->moptions & PCRE2_NOTEOL) != 0) RRETURN(MATCH_NOMATCH);
5636 if ((mb->poptions & PCRE2_DOLLAR_ENDONLY) == 0) goto ASSERT_NL_OR_EOS;
5637
5638 /* Fall through */
5639 /* Unconditional end of subject assertion (\z) */
5640
5641 case OP_EOD:
5642 if (Feptr < mb->end_subject) RRETURN(MATCH_NOMATCH);
5643 SCHECK_PARTIAL();
5644 Fecode++;
5645 break;
5646
5647 /* End of subject or ending \n assertion (\Z) */
5648
5649 case OP_EODN:
5650 ASSERT_NL_OR_EOS:
5651 if (Feptr < mb->end_subject &&
5652 (!IS_NEWLINE(Feptr) || Feptr != mb->end_subject - mb->nllen))
5653 {
5654 if (mb->partial != 0 &&
5655 Feptr + 1 >= mb->end_subject &&
5656 NLBLOCK->nltype == NLTYPE_FIXED &&
5657 NLBLOCK->nllen == 2 &&
5658 UCHAR21TEST(Feptr) == NLBLOCK->nl[0])
5659 {
5660 mb->hitend = TRUE;
5661 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
5662 }
5663 RRETURN(MATCH_NOMATCH);
5664 }
5665
5666 /* Either at end of string or \n before end. */
5667
5668 SCHECK_PARTIAL();
5669 Fecode++;
5670 break;
5671
5672
5673 /* ===================================================================== */
5674 /* Start and end of line assertions, multiline mode. */
5675
5676 /* Start of subject unless notbol, or after any newline except for one at
5677 the very end, unless PCRE2_ALT_CIRCUMFLEX is set. */
5678
5679 case OP_CIRCM:
5680 if ((mb->moptions & PCRE2_NOTBOL) != 0 && Feptr == mb->start_subject)
5681 RRETURN(MATCH_NOMATCH);
5682 if (Feptr != mb->start_subject &&
5683 ((Feptr == mb->end_subject &&
5684 (mb->poptions & PCRE2_ALT_CIRCUMFLEX) == 0) ||
5685 !WAS_NEWLINE(Feptr)))
5686 RRETURN(MATCH_NOMATCH);
5687 Fecode++;
5688 break;
5689
5690 /* Assert before any newline, or before end of subject unless noteol is
5691 set. */
5692
5693 case OP_DOLLM:
5694 if (Feptr < mb->end_subject)
5695 {
5696 if (!IS_NEWLINE(Feptr))
5697 {
5698 if (mb->partial != 0 &&
5699 Feptr + 1 >= mb->end_subject &&
5700 NLBLOCK->nltype == NLTYPE_FIXED &&
5701 NLBLOCK->nllen == 2 &&
5702 UCHAR21TEST(Feptr) == NLBLOCK->nl[0])
5703 {
5704 mb->hitend = TRUE;
5705 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
5706 }
5707 RRETURN(MATCH_NOMATCH);
5708 }
5709 }
5710 else
5711 {
5712 if ((mb->moptions & PCRE2_NOTEOL) != 0) RRETURN(MATCH_NOMATCH);
5713 SCHECK_PARTIAL();
5714 }
5715 Fecode++;
5716 break;
5717
5718
5719 /* ===================================================================== */
5720 /* Start of match assertion */
5721
5722 case OP_SOM:
5723 if (Feptr != mb->start_subject + mb->start_offset) RRETURN(MATCH_NOMATCH);
5724 Fecode++;
5725 break;
5726
5727
5728 /* ===================================================================== */
5729 /* Reset the start of match point */
5730
5731 case OP_SET_SOM:
5732 Fstart_match = Feptr;
5733 Fecode++;
5734 break;
5735
5736
5737 /* ===================================================================== */
5738 /* Word boundary assertions. Find out if the previous and current
5739 characters are "word" characters. It takes a bit more work in UTF mode.
5740 Characters > 255 are assumed to be "non-word" characters when PCRE2_UCP is
5741 not set. When it is set, use Unicode properties if available, even when not
5742 in UTF mode. Remember the earliest and latest consulted characters. */
5743
5744 case OP_NOT_WORD_BOUNDARY:
5745 case OP_WORD_BOUNDARY:
5746 if (Feptr == mb->start_subject) prev_is_word = FALSE; else
5747 {
5748 PCRE2_SPTR lastptr = Feptr - 1;
5749 #ifdef SUPPORT_UNICODE
5750 if (utf)
5751 {
5752 BACKCHAR(lastptr);
5753 GETCHAR(fc, lastptr);
5754 }
5755 else
5756 #endif /* SUPPORT_UNICODE */
5757 fc = *lastptr;
5758 if (lastptr < mb->start_used_ptr) mb->start_used_ptr = lastptr;
5759 #ifdef SUPPORT_UNICODE
5760 if ((mb->poptions & PCRE2_UCP) != 0)
5761 {
5762 if (fc == '_') prev_is_word = TRUE; else
5763 {
5764 int cat = UCD_CATEGORY(fc);
5765 prev_is_word = (cat == ucp_L || cat == ucp_N);
5766 }
5767 }
5768 else
5769 #endif /* SUPPORT_UNICODE */
5770 prev_is_word = CHMAX_255(fc) && (mb->ctypes[fc] & ctype_word) != 0;
5771 }
5772
5773 /* Get status of next character */
5774
5775 if (Feptr >= mb->end_subject)
5776 {
5777 SCHECK_PARTIAL();
5778 cur_is_word = FALSE;
5779 }
5780 else
5781 {
5782 PCRE2_SPTR nextptr = Feptr + 1;
5783 #ifdef SUPPORT_UNICODE
5784 if (utf)
5785 {
5786 FORWARDCHARTEST(nextptr, mb->end_subject);
5787 GETCHAR(fc, Feptr);
5788 }
5789 else
5790 #endif /* SUPPORT_UNICODE */
5791 fc = *Feptr;
5792 if (nextptr > mb->last_used_ptr) mb->last_used_ptr = nextptr;
5793 #ifdef SUPPORT_UNICODE
5794 if ((mb->poptions & PCRE2_UCP) != 0)
5795 {
5796 if (fc == '_') cur_is_word = TRUE; else
5797 {
5798 int cat = UCD_CATEGORY(fc);
5799 cur_is_word = (cat == ucp_L || cat == ucp_N);
5800 }
5801 }
5802 else
5803 #endif /* SUPPORT_UNICODE */
5804 cur_is_word = CHMAX_255(fc) && (mb->ctypes[fc] & ctype_word) != 0;
5805 }
5806
5807 /* Now see if the situation is what we want */
5808
5809 if ((*Fecode++ == OP_WORD_BOUNDARY)?
5810 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
5811 RRETURN(MATCH_NOMATCH);
5812 break;
5813
5814
5815 /* ===================================================================== */
5816 /* Backtracking (*VERB)s, with and without arguments. Note that if the
5817 pattern is successfully matched, we do not come back from RMATCH. */
5818
5819 case OP_MARK:
5820 Fmark = mb->nomatch_mark = Fecode + 2;
5821 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM12);
5822
5823 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
5824 argument, and we must check whether that argument matches this MARK's
5825 argument. It is passed back in mb->verb_skip_ptr. If it does match, we
5826 return MATCH_SKIP with mb->verb_skip_ptr now pointing to the subject
5827 position that corresponds to this mark. Otherwise, pass back the return
5828 code unaltered. */
5829
5830 if (rrc == MATCH_SKIP_ARG &&
5831 PRIV(strcmp)(Fecode + 2, mb->verb_skip_ptr) == 0)
5832 {
5833 mb->verb_skip_ptr = Feptr; /* Pass back current position */
5834 RRETURN(MATCH_SKIP);
5835 }
5836 RRETURN(rrc);
5837
5838 case OP_FAIL:
5839 RRETURN(MATCH_NOMATCH);
5840
5841 /* Record the current recursing group number in mb->verb_current_recurse
5842 when a backtracking return such as MATCH_COMMIT is given. This enables the
5843 recurse processing to catch verbs from within the recursion. */
5844
5845 case OP_COMMIT:
5846 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM13);
5847 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5848 mb->verb_current_recurse = Fcurrent_recurse;
5849 RRETURN(MATCH_COMMIT);
5850
5851 case OP_COMMIT_ARG:
5852 Fmark = mb->nomatch_mark = Fecode + 2;
5853 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM36);
5854 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5855 mb->verb_current_recurse = Fcurrent_recurse;
5856 RRETURN(MATCH_COMMIT);
5857
5858 case OP_PRUNE:
5859 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM14);
5860 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5861 mb->verb_current_recurse = Fcurrent_recurse;
5862 RRETURN(MATCH_PRUNE);
5863
5864 case OP_PRUNE_ARG:
5865 Fmark = mb->nomatch_mark = Fecode + 2;
5866 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM15);
5867 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5868 mb->verb_current_recurse = Fcurrent_recurse;
5869 RRETURN(MATCH_PRUNE);
5870
5871 case OP_SKIP:
5872 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM16);
5873 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5874 mb->verb_skip_ptr = Feptr; /* Pass back current position */
5875 mb->verb_current_recurse = Fcurrent_recurse;
5876 RRETURN(MATCH_SKIP);
5877
5878 /* Note that, for Perl compatibility, SKIP with an argument does NOT set
5879 nomatch_mark. When a pattern match ends with a SKIP_ARG for which there was
5880 not a matching mark, we have to re-run the match, ignoring the SKIP_ARG
5881 that failed and any that precede it (either they also failed, or were not
5882 triggered). To do this, we maintain a count of executed SKIP_ARGs. If a
5883 SKIP_ARG gets to top level, the match is re-run with mb->ignore_skip_arg
5884 set to the count of the one that failed. */
5885
5886 case OP_SKIP_ARG:
5887 mb->skip_arg_count++;
5888 if (mb->skip_arg_count <= mb->ignore_skip_arg)
5889 {
5890 Fecode += PRIV(OP_lengths)[*Fecode] + Fecode[1];
5891 break;
5892 }
5893 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM17);
5894 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5895
5896 /* Pass back the current skip name and return the special MATCH_SKIP_ARG
5897 return code. This will either be caught by a matching MARK, or get to the
5898 top, where it causes a rematch with mb->ignore_skip_arg set to the value of
5899 mb->skip_arg_count. */
5900
5901 mb->verb_skip_ptr = Fecode + 2;
5902 mb->verb_current_recurse = Fcurrent_recurse;
5903 RRETURN(MATCH_SKIP_ARG);
5904
5905 /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that
5906 the branch in which it occurs can be determined. */
5907
5908 case OP_THEN:
5909 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM18);
5910 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5911 mb->verb_ecode_ptr = Fecode;
5912 mb->verb_current_recurse = Fcurrent_recurse;
5913 RRETURN(MATCH_THEN);
5914
5915 case OP_THEN_ARG:
5916 Fmark = mb->nomatch_mark = Fecode + 2;
5917 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM19);
5918 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5919 mb->verb_ecode_ptr = Fecode;
5920 mb->verb_current_recurse = Fcurrent_recurse;
5921 RRETURN(MATCH_THEN);
5922
5923
5924 /* ===================================================================== */
5925 /* There's been some horrible disaster. Arrival here can only mean there is
5926 something seriously wrong in the code above or the OP_xxx definitions. */
5927
5928 default:
5929 return PCRE2_ERROR_INTERNAL;
5930 }
5931
5932 /* Do not insert any code in here without much thought; it is assumed
5933 that "continue" in the code above comes out to here to repeat the main
5934 loop. */
5935
5936 } /* End of main loop */
5937 /* Control never reaches here */
5938
5939
5940 /* ========================================================================= */
5941 /* The RRETURN() macro jumps here. The number that is saved in Freturn_id
5942 indicates which label we actually want to return to. The value in Frdepth is
5943 the index number of the frame in the vector. The return value has been placed
5944 in rrc. */
5945
5946 #define LBL(val) case val: goto L_RM##val;
5947
5948 RETURN_SWITCH:
5949 if (Frdepth == 0) return rrc; /* Exit from the top level */
5950 F = (heapframe *)((char *)F - Fback_frame); /* Backtrack */
5951 mb->cb->callout_flags |= PCRE2_CALLOUT_BACKTRACK; /* Note for callouts */
5952
5953 #ifdef DEBUG_SHOW_RMATCH
5954 fprintf(stderr, "++ RETURN %d to %d\n", rrc, Freturn_id);
5955 #endif
5956
5957 switch (Freturn_id)
5958 {
5959 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
5960 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(16)
5961 LBL(17) LBL(18) LBL(19) LBL(20) LBL(21) LBL(22) LBL(23) LBL(24)
5962 LBL(25) LBL(26) LBL(27) LBL(28) LBL(29) LBL(30) LBL(31) LBL(32)
5963 LBL(33) LBL(34) LBL(35) LBL(36)
5964
5965 #ifdef SUPPORT_WIDE_CHARS
5966 LBL(100) LBL(101)
5967 #endif
5968
5969 #ifdef SUPPORT_UNICODE
5970 LBL(200) LBL(201) LBL(202) LBL(203) LBL(204) LBL(205) LBL(206)
5971 LBL(207) LBL(208) LBL(209) LBL(210) LBL(211) LBL(212) LBL(213)
5972 LBL(214) LBL(215) LBL(216) LBL(217) LBL(218) LBL(219) LBL(220)
5973 LBL(221) LBL(222)
5974 #endif
5975
5976 default:
5977 return PCRE2_ERROR_INTERNAL;
5978 }
5979 #undef LBL
5980 }
5981
5982
5983 /*************************************************
5984 * Match a Regular Expression *
5985 *************************************************/
5986
5987 /* This function applies a compiled pattern to a subject string and picks out
5988 portions of the string if it matches. Two elements in the vector are set for
5989 each substring: the offsets to the start and end of the substring.
5990
5991 Arguments:
5992 code points to the compiled expression
5993 subject points to the subject string
5994 length length of subject string (may contain binary zeros)
5995 start_offset where to start in the subject string
5996 options option bits
5997 match_data points to a match_data block
5998 mcontext points to a PCRE2 match context
5999
6000 Returns: > 0 => success; value is the number of ovector pairs filled
6001 = 0 => success, but ovector is not big enough
6002 -1 => failed to match (PCRE2_ERROR_NOMATCH)
6003 -2 => partial match (PCRE2_ERROR_PARTIAL)
6004 < -2 => some kind of unexpected problem
6005 */
6006
6007 PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
pcre2_match(const pcre2_code * code,PCRE2_SPTR subject,PCRE2_SIZE length,PCRE2_SIZE start_offset,uint32_t options,pcre2_match_data * match_data,pcre2_match_context * mcontext)6008 pcre2_match(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length,
6009 PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data,
6010 pcre2_match_context *mcontext)
6011 {
6012 int rc;
6013 int was_zero_terminated = 0;
6014 const uint8_t *start_bits = NULL;
6015 const pcre2_real_code *re = (const pcre2_real_code *)code;
6016
6017
6018 BOOL anchored;
6019 BOOL firstline;
6020 BOOL has_first_cu = FALSE;
6021 BOOL has_req_cu = FALSE;
6022 BOOL startline;
6023 BOOL utf;
6024
6025 PCRE2_UCHAR first_cu = 0;
6026 PCRE2_UCHAR first_cu2 = 0;
6027 PCRE2_UCHAR req_cu = 0;
6028 PCRE2_UCHAR req_cu2 = 0;
6029
6030 PCRE2_SPTR bumpalong_limit;
6031 PCRE2_SPTR end_subject;
6032 PCRE2_SPTR start_match = subject + start_offset;
6033 PCRE2_SPTR req_cu_ptr = start_match - 1;
6034 PCRE2_SPTR start_partial = NULL;
6035 PCRE2_SPTR match_partial = NULL;
6036
6037 PCRE2_SIZE frame_size;
6038
6039 /* We need to have mb as a pointer to a match block, because the IS_NEWLINE
6040 macro is used below, and it expects NLBLOCK to be defined as a pointer. */
6041
6042 pcre2_callout_block cb;
6043 match_block actual_match_block;
6044 match_block *mb = &actual_match_block;
6045
6046 /* Allocate an initial vector of backtracking frames on the stack. If this
6047 proves to be too small, it is replaced by a larger one on the heap. To get a
6048 vector of the size required that is aligned for pointers, allocate it as a
6049 vector of pointers. */
6050
6051 PCRE2_SPTR stack_frames_vector[START_FRAMES_SIZE/sizeof(PCRE2_SPTR)]
6052 PCRE2_KEEP_UNINITIALIZED;
6053 mb->stack_frames = (heapframe *)stack_frames_vector;
6054
6055 /* A length equal to PCRE2_ZERO_TERMINATED implies a zero-terminated
6056 subject string. */
6057
6058 if (length == PCRE2_ZERO_TERMINATED)
6059 {
6060 length = PRIV(strlen)(subject);
6061 was_zero_terminated = 1;
6062 }
6063 end_subject = subject + length;
6064
6065 /* Plausibility checks */
6066
6067 if ((options & ~PUBLIC_MATCH_OPTIONS) != 0) return PCRE2_ERROR_BADOPTION;
6068 if (code == NULL || subject == NULL || match_data == NULL)
6069 return PCRE2_ERROR_NULL;
6070 if (start_offset > length) return PCRE2_ERROR_BADOFFSET;
6071
6072 /* Check that the first field in the block is the magic number. */
6073
6074 if (re->magic_number != MAGIC_NUMBER) return PCRE2_ERROR_BADMAGIC;
6075
6076 /* Check the code unit width. */
6077
6078 if ((re->flags & PCRE2_MODE_MASK) != PCRE2_CODE_UNIT_WIDTH/8)
6079 return PCRE2_ERROR_BADMODE;
6080
6081 /* PCRE2_NOTEMPTY and PCRE2_NOTEMPTY_ATSTART are match-time flags in the
6082 options variable for this function. Users of PCRE2 who are not calling the
6083 function directly would like to have a way of setting these flags, in the same
6084 way that they can set pcre2_compile() flags like PCRE2_NO_AUTOPOSSESS with
6085 constructions like (*NO_AUTOPOSSESS). To enable this, (*NOTEMPTY) and
6086 (*NOTEMPTY_ATSTART) set bits in the pattern's "flags" field which we now
6087 transfer to the options for this function. The bits are guaranteed to be
6088 adjacent, but do not have the same values. This bit of Boolean trickery assumes
6089 that the match-time bits are not more significant than the flag bits. If by
6090 accident this is not the case, a compile-time division by zero error will
6091 occur. */
6092
6093 #define FF (PCRE2_NOTEMPTY_SET|PCRE2_NE_ATST_SET)
6094 #define OO (PCRE2_NOTEMPTY|PCRE2_NOTEMPTY_ATSTART)
6095 options |= (re->flags & FF) / ((FF & (~FF+1)) / (OO & (~OO+1)));
6096 #undef FF
6097 #undef OO
6098
6099 /* These two settings are used in the code for checking a UTF string that
6100 follows immediately afterwards. Other values in the mb block are used only
6101 during interpretive processing, not when the JIT support is in use, so they are
6102 set up later. */
6103
6104 utf = (re->overall_options & PCRE2_UTF) != 0;
6105 mb->partial = ((options & PCRE2_PARTIAL_HARD) != 0)? 2 :
6106 ((options & PCRE2_PARTIAL_SOFT) != 0)? 1 : 0;
6107
6108 /* Partial matching and PCRE2_ENDANCHORED are currently not allowed at the same
6109 time. */
6110
6111 if (mb->partial != 0 &&
6112 ((re->overall_options | options) & PCRE2_ENDANCHORED) != 0)
6113 return PCRE2_ERROR_BADOPTION;
6114
6115 /* Check a UTF string for validity if required. For 8-bit and 16-bit strings,
6116 we must also check that a starting offset does not point into the middle of a
6117 multiunit character. We check only the portion of the subject that is going to
6118 be inspected during matching - from the offset minus the maximum lookbehind
6119 to the given length. This saves time when a small part of a large subject is
6120 being matched by the use of a starting offset. Note that the maximum lookbehind
6121 is a number of characters, not code units. */
6122
6123 #ifdef SUPPORT_UNICODE
6124 if (utf && (options & PCRE2_NO_UTF_CHECK) == 0)
6125 {
6126 PCRE2_SPTR check_subject = start_match; /* start_match includes offset */
6127
6128 if (start_offset > 0)
6129 {
6130 #if PCRE2_CODE_UNIT_WIDTH != 32
6131 unsigned int i;
6132 if (start_match < end_subject && NOT_FIRSTCU(*start_match))
6133 return PCRE2_ERROR_BADUTFOFFSET;
6134 for (i = re->max_lookbehind; i > 0 && check_subject > subject; i--)
6135 {
6136 check_subject--;
6137 while (check_subject > subject &&
6138 #if PCRE2_CODE_UNIT_WIDTH == 8
6139 (*check_subject & 0xc0) == 0x80)
6140 #else /* 16-bit */
6141 (*check_subject & 0xfc00) == 0xdc00)
6142 #endif /* PCRE2_CODE_UNIT_WIDTH == 8 */
6143 check_subject--;
6144 }
6145 #else
6146 /* In the 32-bit library, one code unit equals one character. However,
6147 we cannot just subtract the lookbehind and then compare pointers, because
6148 a very large lookbehind could create an invalid pointer. */
6149
6150 if (start_offset >= re->max_lookbehind)
6151 check_subject -= re->max_lookbehind;
6152 else
6153 check_subject = subject;
6154 #endif /* PCRE2_CODE_UNIT_WIDTH != 32 */
6155 }
6156
6157 /* Validate the relevant portion of the subject. After an error, adjust the
6158 offset to be an absolute offset in the whole string. */
6159
6160 match_data->rc = PRIV(valid_utf)(check_subject,
6161 length - (check_subject - subject), &(match_data->startchar));
6162 if (match_data->rc != 0)
6163 {
6164 match_data->startchar += check_subject - subject;
6165 return match_data->rc;
6166 }
6167 }
6168 #endif /* SUPPORT_UNICODE */
6169
6170 /* It is an error to set an offset limit without setting the flag at compile
6171 time. */
6172
6173 if (mcontext != NULL && mcontext->offset_limit != PCRE2_UNSET &&
6174 (re->overall_options & PCRE2_USE_OFFSET_LIMIT) == 0)
6175 return PCRE2_ERROR_BADOFFSETLIMIT;
6176
6177 /* If the match data block was previously used with PCRE2_COPY_MATCHED_SUBJECT,
6178 free the memory that was obtained. Set the field to NULL for no match cases. */
6179
6180 if ((match_data->flags & PCRE2_MD_COPIED_SUBJECT) != 0)
6181 {
6182 match_data->memctl.free((void *)match_data->subject,
6183 match_data->memctl.memory_data);
6184 match_data->flags &= ~PCRE2_MD_COPIED_SUBJECT;
6185 }
6186 match_data->subject = NULL;
6187
6188 /* If the pattern was successfully studied with JIT support, run the JIT
6189 executable instead of the rest of this function. Most options must be set at
6190 compile time for the JIT code to be usable. Fallback to the normal code path if
6191 an unsupported option is set or if JIT returns BADOPTION (which means that the
6192 selected normal or partial matching mode was not compiled). */
6193
6194 #ifdef SUPPORT_JIT
6195 if (re->executable_jit != NULL && (options & ~PUBLIC_JIT_MATCH_OPTIONS) == 0)
6196 {
6197 rc = pcre2_jit_match(code, subject, length, start_offset, options,
6198 match_data, mcontext);
6199 if (rc != PCRE2_ERROR_JIT_BADOPTION)
6200 {
6201 if (rc >= 0 && (options & PCRE2_COPY_MATCHED_SUBJECT) != 0)
6202 {
6203 length = CU2BYTES(length + was_zero_terminated);
6204 match_data->subject = match_data->memctl.malloc(length,
6205 match_data->memctl.memory_data);
6206 if (match_data->subject == NULL) return PCRE2_ERROR_NOMEMORY;
6207 memcpy((void *)match_data->subject, subject, length);
6208 match_data->flags |= PCRE2_MD_COPIED_SUBJECT;
6209 }
6210 return rc;
6211 }
6212 }
6213 #endif
6214
6215 /* Carry on with non-JIT matching. A NULL match context means "use a default
6216 context", but we take the memory control functions from the pattern. */
6217
6218 if (mcontext == NULL)
6219 {
6220 mcontext = (pcre2_match_context *)(&PRIV(default_match_context));
6221 mb->memctl = re->memctl;
6222 }
6223 else mb->memctl = mcontext->memctl;
6224
6225 anchored = ((re->overall_options | options) & PCRE2_ANCHORED) != 0;
6226 firstline = (re->overall_options & PCRE2_FIRSTLINE) != 0;
6227 startline = (re->flags & PCRE2_STARTLINE) != 0;
6228 bumpalong_limit = (mcontext->offset_limit == PCRE2_UNSET)?
6229 end_subject : subject + mcontext->offset_limit;
6230
6231 /* Initialize and set up the fixed fields in the callout block, with a pointer
6232 in the match block. */
6233
6234 mb->cb = &cb;
6235 cb.version = 2;
6236 cb.subject = subject;
6237 cb.subject_length = (PCRE2_SIZE)(end_subject - subject);
6238 cb.callout_flags = 0;
6239
6240 /* Fill in the remaining fields in the match block. */
6241
6242 mb->callout = mcontext->callout;
6243 mb->callout_data = mcontext->callout_data;
6244
6245 mb->start_subject = subject;
6246 mb->start_offset = start_offset;
6247 mb->end_subject = end_subject;
6248 mb->hasthen = (re->flags & PCRE2_HASTHEN) != 0;
6249
6250 mb->moptions = options; /* Match options */
6251 mb->poptions = re->overall_options; /* Pattern options */
6252
6253 mb->ignore_skip_arg = 0;
6254 mb->mark = mb->nomatch_mark = NULL; /* In case never set */
6255 mb->hitend = FALSE;
6256
6257 /* The name table is needed for finding all the numbers associated with a
6258 given name, for condition testing. The code follows the name table. */
6259
6260 mb->name_table = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code));
6261 mb->name_count = re->name_count;
6262 mb->name_entry_size = re->name_entry_size;
6263 mb->start_code = mb->name_table + re->name_count * re->name_entry_size;
6264
6265 /* Process the \R and newline settings. */
6266
6267 mb->bsr_convention = re->bsr_convention;
6268 mb->nltype = NLTYPE_FIXED;
6269 switch(re->newline_convention)
6270 {
6271 case PCRE2_NEWLINE_CR:
6272 mb->nllen = 1;
6273 mb->nl[0] = CHAR_CR;
6274 break;
6275
6276 case PCRE2_NEWLINE_LF:
6277 mb->nllen = 1;
6278 mb->nl[0] = CHAR_NL;
6279 break;
6280
6281 case PCRE2_NEWLINE_NUL:
6282 mb->nllen = 1;
6283 mb->nl[0] = CHAR_NUL;
6284 break;
6285
6286 case PCRE2_NEWLINE_CRLF:
6287 mb->nllen = 2;
6288 mb->nl[0] = CHAR_CR;
6289 mb->nl[1] = CHAR_NL;
6290 break;
6291
6292 case PCRE2_NEWLINE_ANY:
6293 mb->nltype = NLTYPE_ANY;
6294 break;
6295
6296 case PCRE2_NEWLINE_ANYCRLF:
6297 mb->nltype = NLTYPE_ANYCRLF;
6298 break;
6299
6300 default: return PCRE2_ERROR_INTERNAL;
6301 }
6302
6303 /* The backtracking frames have fixed data at the front, and a PCRE2_SIZE
6304 vector at the end, whose size depends on the number of capturing parentheses in
6305 the pattern. It is not used at all if there are no capturing parentheses.
6306
6307 frame_size is the total size of each frame
6308 mb->frame_vector_size is the total usable size of the vector (rounded down
6309 to a whole number of frames)
6310
6311 The last of these is changed within the match() function if the frame vector
6312 has to be expanded. We therefore put it into the match block so that it is
6313 correct when calling match() more than once for non-anchored patterns. */
6314
6315 frame_size = offsetof(heapframe, ovector) +
6316 re->top_bracket * 2 * sizeof(PCRE2_SIZE);
6317
6318 /* Limits set in the pattern override the match context only if they are
6319 smaller. */
6320
6321 mb->heap_limit = (mcontext->heap_limit < re->limit_heap)?
6322 mcontext->heap_limit : re->limit_heap;
6323
6324 mb->match_limit = (mcontext->match_limit < re->limit_match)?
6325 mcontext->match_limit : re->limit_match;
6326
6327 mb->match_limit_depth = (mcontext->depth_limit < re->limit_depth)?
6328 mcontext->depth_limit : re->limit_depth;
6329
6330 /* If a pattern has very many capturing parentheses, the frame size may be very
6331 large. Ensure that there are at least 10 available frames by getting an initial
6332 vector on the heap if necessary, except when the heap limit prevents this. Get
6333 fewer if possible. (The heap limit is in kibibytes.) */
6334
6335 if (frame_size <= START_FRAMES_SIZE/10)
6336 {
6337 mb->match_frames = mb->stack_frames; /* Initial frame vector on the stack */
6338 mb->frame_vector_size = ((START_FRAMES_SIZE/frame_size) * frame_size);
6339 }
6340 else
6341 {
6342 mb->frame_vector_size = frame_size * 10;
6343 if ((mb->frame_vector_size / 1024) > mb->heap_limit)
6344 {
6345 if (frame_size > mb->heap_limit * 1024) return PCRE2_ERROR_HEAPLIMIT;
6346 mb->frame_vector_size = ((mb->heap_limit * 1024)/frame_size) * frame_size;
6347 }
6348 mb->match_frames = mb->memctl.malloc(mb->frame_vector_size,
6349 mb->memctl.memory_data);
6350 if (mb->match_frames == NULL) return PCRE2_ERROR_NOMEMORY;
6351 }
6352
6353 mb->match_frames_top =
6354 (heapframe *)((char *)mb->match_frames + mb->frame_vector_size);
6355
6356 /* Write to the ovector within the first frame to mark every capture unset and
6357 to avoid uninitialized memory read errors when it is copied to a new frame. */
6358
6359 memset((char *)(mb->match_frames) + offsetof(heapframe, ovector), 0xff,
6360 re->top_bracket * 2 * sizeof(PCRE2_SIZE));
6361
6362 /* Pointers to the individual character tables */
6363
6364 mb->lcc = re->tables + lcc_offset;
6365 mb->fcc = re->tables + fcc_offset;
6366 mb->ctypes = re->tables + ctypes_offset;
6367
6368 /* Set up the first code unit to match, if available. If there's no first code
6369 unit there may be a bitmap of possible first characters. */
6370
6371 if ((re->flags & PCRE2_FIRSTSET) != 0)
6372 {
6373 has_first_cu = TRUE;
6374 first_cu = first_cu2 = (PCRE2_UCHAR)(re->first_codeunit);
6375 if ((re->flags & PCRE2_FIRSTCASELESS) != 0)
6376 {
6377 first_cu2 = TABLE_GET(first_cu, mb->fcc, first_cu);
6378 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
6379 if (utf && first_cu > 127) first_cu2 = UCD_OTHERCASE(first_cu);
6380 #endif
6381 }
6382 }
6383 else
6384 if (!startline && (re->flags & PCRE2_FIRSTMAPSET) != 0)
6385 start_bits = re->start_bitmap;
6386
6387 /* There may also be a "last known required character" set. */
6388
6389 if ((re->flags & PCRE2_LASTSET) != 0)
6390 {
6391 has_req_cu = TRUE;
6392 req_cu = req_cu2 = (PCRE2_UCHAR)(re->last_codeunit);
6393 if ((re->flags & PCRE2_LASTCASELESS) != 0)
6394 {
6395 req_cu2 = TABLE_GET(req_cu, mb->fcc, req_cu);
6396 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
6397 if (utf && req_cu > 127) req_cu2 = UCD_OTHERCASE(req_cu);
6398 #endif
6399 }
6400 }
6401
6402
6403 /* ==========================================================================*/
6404
6405 /* Loop for handling unanchored repeated matching attempts; for anchored regexs
6406 the loop runs just once. */
6407
6408 for(;;)
6409 {
6410 PCRE2_SPTR new_start_match;
6411
6412 /* ----------------- Start of match optimizations ---------------- */
6413
6414 /* There are some optimizations that avoid running the match if a known
6415 starting point is not found, or if a known later code unit is not present.
6416 However, there is an option (settable at compile time) that disables these,
6417 for testing and for ensuring that all callouts do actually occur. */
6418
6419 if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0)
6420 {
6421 /* If firstline is TRUE, the start of the match is constrained to the first
6422 line of a multiline string. That is, the match must be before or at the
6423 first newline following the start of matching. Temporarily adjust
6424 end_subject so that we stop the scans for a first code unit at a newline.
6425 If the match fails at the newline, later code breaks the loop. */
6426
6427 if (firstline)
6428 {
6429 PCRE2_SPTR t = start_match;
6430 #ifdef SUPPORT_UNICODE
6431 if (utf)
6432 {
6433 while (t < end_subject && !IS_NEWLINE(t))
6434 {
6435 t++;
6436 ACROSSCHAR(t < end_subject, t, t++);
6437 }
6438 }
6439 else
6440 #endif
6441 while (t < end_subject && !IS_NEWLINE(t)) t++;
6442 end_subject = t;
6443 }
6444
6445 /* Anchored: check the first code unit if one is recorded. This may seem
6446 pointless but it can help in detecting a no match case without scanning for
6447 the required code unit. */
6448
6449 if (anchored)
6450 {
6451 if (has_first_cu || start_bits != NULL)
6452 {
6453 BOOL ok = start_match < end_subject;
6454 if (ok)
6455 {
6456 PCRE2_UCHAR c = UCHAR21TEST(start_match);
6457 ok = has_first_cu && (c == first_cu || c == first_cu2);
6458 if (!ok && start_bits != NULL)
6459 {
6460 #if PCRE2_CODE_UNIT_WIDTH != 8
6461 if (c > 255) c = 255;
6462 #endif
6463 ok = (start_bits[c/8] & (1u << (c&7))) != 0;
6464 }
6465 }
6466 if (!ok)
6467 {
6468 rc = MATCH_NOMATCH;
6469 break;
6470 }
6471 }
6472 }
6473
6474 /* Not anchored. Advance to a unique first code unit if there is one. In
6475 8-bit mode, the use of memchr() gives a big speed up, even though we have
6476 to call it twice in caseless mode, in order to find the earliest occurrence
6477 of the character in either of its cases. */
6478
6479 else
6480 {
6481 if (has_first_cu)
6482 {
6483 if (first_cu != first_cu2) /* Caseless */
6484 {
6485 #if PCRE2_CODE_UNIT_WIDTH != 8
6486 PCRE2_UCHAR smc;
6487 while (start_match < end_subject &&
6488 (smc = UCHAR21TEST(start_match)) != first_cu &&
6489 smc != first_cu2)
6490 start_match++;
6491 #else /* 8-bit code units */
6492 PCRE2_SPTR pp1 =
6493 memchr(start_match, first_cu, end_subject-start_match);
6494 PCRE2_SPTR pp2 =
6495 memchr(start_match, first_cu2, end_subject-start_match);
6496 if (pp1 == NULL)
6497 start_match = (pp2 == NULL)? end_subject : pp2;
6498 else
6499 start_match = (pp2 == NULL || pp1 < pp2)? pp1 : pp2;
6500 #endif
6501 }
6502
6503 /* The caseful case */
6504
6505 else
6506 {
6507 #if PCRE2_CODE_UNIT_WIDTH != 8
6508 while (start_match < end_subject && UCHAR21TEST(start_match) !=
6509 first_cu)
6510 start_match++;
6511 #else
6512 start_match = memchr(start_match, first_cu, end_subject - start_match);
6513 if (start_match == NULL) start_match = end_subject;
6514 #endif
6515 }
6516
6517 /* If we can't find the required code unit, having reached the true end
6518 of the subject, break the bumpalong loop, to force a match failure,
6519 except when doing partial matching, when we let the next cycle run at
6520 the end of the subject. To see why, consider the pattern /(?<=abc)def/,
6521 which partially matches "abc", even though the string does not contain
6522 the starting character "d". If we have not reached the true end of the
6523 subject (PCRE2_FIRSTLINE caused end_subject to be temporarily modified)
6524 we also let the cycle run, because the matching string is legitimately
6525 allowed to start with the first code unit of a newline. */
6526
6527 if (!mb->partial && start_match >= mb->end_subject)
6528 {
6529 rc = MATCH_NOMATCH;
6530 break;
6531 }
6532 }
6533
6534 /* If there's no first code unit, advance to just after a linebreak for a
6535 multiline match if required. */
6536
6537 else if (startline)
6538 {
6539 if (start_match > mb->start_subject + start_offset)
6540 {
6541 #ifdef SUPPORT_UNICODE
6542 if (utf)
6543 {
6544 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6545 {
6546 start_match++;
6547 ACROSSCHAR(start_match < end_subject, start_match, start_match++);
6548 }
6549 }
6550 else
6551 #endif
6552 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6553 start_match++;
6554
6555 /* If we have just passed a CR and the newline option is ANY or
6556 ANYCRLF, and we are now at a LF, advance the match position by one
6557 more code unit. */
6558
6559 if (start_match[-1] == CHAR_CR &&
6560 (mb->nltype == NLTYPE_ANY || mb->nltype == NLTYPE_ANYCRLF) &&
6561 start_match < end_subject &&
6562 UCHAR21TEST(start_match) == CHAR_NL)
6563 start_match++;
6564 }
6565 }
6566
6567 /* If there's no first code unit or a requirement for a multiline line
6568 start, advance to a non-unique first code unit if any have been
6569 identified. The bitmap contains only 256 bits. When code units are 16 or
6570 32 bits wide, all code units greater than 254 set the 255 bit. */
6571
6572 else if (start_bits != NULL)
6573 {
6574 while (start_match < end_subject)
6575 {
6576 uint32_t c = UCHAR21TEST(start_match);
6577 #if PCRE2_CODE_UNIT_WIDTH != 8
6578 if (c > 255) c = 255;
6579 #endif
6580 if ((start_bits[c/8] & (1u << (c&7))) != 0) break;
6581 start_match++;
6582 }
6583
6584 /* See comment above in first_cu checking about the next few lines. */
6585
6586 if (!mb->partial && start_match >= mb->end_subject)
6587 {
6588 rc = MATCH_NOMATCH;
6589 break;
6590 }
6591 }
6592 } /* End first code unit handling */
6593
6594 /* Restore fudged end_subject */
6595
6596 end_subject = mb->end_subject;
6597
6598 /* The following two optimizations must be disabled for partial matching. */
6599
6600 if (!mb->partial)
6601 {
6602 /* The minimum matching length is a lower bound; no string of that length
6603 may actually match the pattern. Although the value is, strictly, in
6604 characters, we treat it as code units to avoid spending too much time in
6605 this optimization. */
6606
6607 if (end_subject - start_match < re->minlength)
6608 {
6609 rc = MATCH_NOMATCH;
6610 break;
6611 }
6612
6613 /* If req_cu is set, we know that that code unit must appear in the
6614 subject for the (non-partial) match to succeed. If the first code unit is
6615 set, req_cu must be later in the subject; otherwise the test starts at
6616 the match point. This optimization can save a huge amount of backtracking
6617 in patterns with nested unlimited repeats that aren't going to match.
6618 Writing separate code for caseful/caseless versions makes it go faster,
6619 as does using an autoincrement and backing off on a match. As in the case
6620 of the first code unit, using memchr() in the 8-bit library gives a big
6621 speed up. Unlike the first_cu check above, we do not need to call
6622 memchr() twice in the caseless case because we only need to check for the
6623 presence of the character in either case, not find the first occurrence.
6624
6625 HOWEVER: when the subject string is very, very long, searching to its end
6626 can take a long time, and give bad performance on quite ordinary
6627 patterns. This showed up when somebody was matching something like
6628 /^\d+C/ on a 32-megabyte string... so we don't do this when the string is
6629 sufficiently long. */
6630
6631 if (has_req_cu && end_subject - start_match < REQ_CU_MAX)
6632 {
6633 PCRE2_SPTR p = start_match + (has_first_cu? 1:0);
6634
6635 /* We don't need to repeat the search if we haven't yet reached the
6636 place we found it last time round the bumpalong loop. */
6637
6638 if (p > req_cu_ptr)
6639 {
6640 if (p < end_subject)
6641 {
6642 if (req_cu != req_cu2) /* Caseless */
6643 {
6644 #if PCRE2_CODE_UNIT_WIDTH != 8
6645 do
6646 {
6647 uint32_t pp = UCHAR21INCTEST(p);
6648 if (pp == req_cu || pp == req_cu2) { p--; break; }
6649 }
6650 while (p < end_subject);
6651
6652 #else /* 8-bit code units */
6653 PCRE2_SPTR pp = p;
6654 p = memchr(pp, req_cu, end_subject - pp);
6655 if (p == NULL)
6656 {
6657 p = memchr(pp, req_cu2, end_subject - pp);
6658 if (p == NULL) p = end_subject;
6659 }
6660 #endif /* PCRE2_CODE_UNIT_WIDTH != 8 */
6661 }
6662
6663 /* The caseful case */
6664
6665 else
6666 {
6667 #if PCRE2_CODE_UNIT_WIDTH != 8
6668 do
6669 {
6670 if (UCHAR21INCTEST(p) == req_cu) { p--; break; }
6671 }
6672 while (p < end_subject);
6673
6674 #else /* 8-bit code units */
6675 p = memchr(p, req_cu, end_subject - p);
6676 if (p == NULL) p = end_subject;
6677 #endif
6678 }
6679 }
6680
6681 /* If we can't find the required code unit, break the bumpalong loop,
6682 forcing a match failure. */
6683
6684 if (p >= end_subject)
6685 {
6686 rc = MATCH_NOMATCH;
6687 break;
6688 }
6689
6690 /* If we have found the required code unit, save the point where we
6691 found it, so that we don't search again next time round the bumpalong
6692 loop if the start hasn't yet passed this code unit. */
6693
6694 req_cu_ptr = p;
6695 }
6696 }
6697 }
6698 }
6699
6700 /* ------------ End of start of match optimizations ------------ */
6701
6702 /* Give no match if we have passed the bumpalong limit. */
6703
6704 if (start_match > bumpalong_limit)
6705 {
6706 rc = MATCH_NOMATCH;
6707 break;
6708 }
6709
6710 /* OK, we can now run the match. If "hitend" is set afterwards, remember the
6711 first starting point for which a partial match was found. */
6712
6713 cb.start_match = (PCRE2_SIZE)(start_match - subject);
6714 cb.callout_flags |= PCRE2_CALLOUT_STARTMATCH;
6715
6716 mb->start_used_ptr = start_match;
6717 mb->last_used_ptr = start_match;
6718 mb->match_call_count = 0;
6719 mb->end_offset_top = 0;
6720 mb->skip_arg_count = 0;
6721
6722 rc = match(start_match, mb->start_code, match_data->ovector,
6723 match_data->oveccount, re->top_bracket, frame_size, mb);
6724
6725 if (mb->hitend && start_partial == NULL)
6726 {
6727 start_partial = mb->start_used_ptr;
6728 match_partial = start_match;
6729 }
6730
6731 switch(rc)
6732 {
6733 /* If MATCH_SKIP_ARG reaches this level it means that a MARK that matched
6734 the SKIP's arg was not found. In this circumstance, Perl ignores the SKIP
6735 entirely. The only way we can do that is to re-do the match at the same
6736 point, with a flag to force SKIP with an argument to be ignored. Just
6737 treating this case as NOMATCH does not work because it does not check other
6738 alternatives in patterns such as A(*SKIP:A)B|AC when the subject is AC. */
6739
6740 case MATCH_SKIP_ARG:
6741 new_start_match = start_match;
6742 mb->ignore_skip_arg = mb->skip_arg_count;
6743 break;
6744
6745 /* SKIP passes back the next starting point explicitly, but if it is no
6746 greater than the match we have just done, treat it as NOMATCH. */
6747
6748 case MATCH_SKIP:
6749 if (mb->verb_skip_ptr > start_match)
6750 {
6751 new_start_match = mb->verb_skip_ptr;
6752 break;
6753 }
6754 /* Fall through */
6755
6756 /* NOMATCH and PRUNE advance by one character. THEN at this level acts
6757 exactly like PRUNE. Unset ignore SKIP-with-argument. */
6758
6759 case MATCH_NOMATCH:
6760 case MATCH_PRUNE:
6761 case MATCH_THEN:
6762 mb->ignore_skip_arg = 0;
6763 new_start_match = start_match + 1;
6764 #ifdef SUPPORT_UNICODE
6765 if (utf)
6766 ACROSSCHAR(new_start_match < end_subject, new_start_match,
6767 new_start_match++);
6768 #endif
6769 break;
6770
6771 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
6772
6773 case MATCH_COMMIT:
6774 rc = MATCH_NOMATCH;
6775 goto ENDLOOP;
6776
6777 /* Any other return is either a match, or some kind of error. */
6778
6779 default:
6780 goto ENDLOOP;
6781 }
6782
6783 /* Control reaches here for the various types of "no match at this point"
6784 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
6785
6786 rc = MATCH_NOMATCH;
6787
6788 /* If PCRE2_FIRSTLINE is set, the match must happen before or at the first
6789 newline in the subject (though it may continue over the newline). Therefore,
6790 if we have just failed to match, starting at a newline, do not continue. */
6791
6792 if (firstline && IS_NEWLINE(start_match)) break;
6793
6794 /* Advance to new matching position */
6795
6796 start_match = new_start_match;
6797
6798 /* Break the loop if the pattern is anchored or if we have passed the end of
6799 the subject. */
6800
6801 if (anchored || start_match > end_subject) break;
6802
6803 /* If we have just passed a CR and we are now at a LF, and the pattern does
6804 not contain any explicit matches for \r or \n, and the newline option is CRLF
6805 or ANY or ANYCRLF, advance the match position by one more code unit. In
6806 normal matching start_match will always be greater than the first position at
6807 this stage, but a failed *SKIP can cause a return at the same point, which is
6808 why the first test exists. */
6809
6810 if (start_match > subject + start_offset &&
6811 start_match[-1] == CHAR_CR &&
6812 start_match < end_subject &&
6813 *start_match == CHAR_NL &&
6814 (re->flags & PCRE2_HASCRORLF) == 0 &&
6815 (mb->nltype == NLTYPE_ANY ||
6816 mb->nltype == NLTYPE_ANYCRLF ||
6817 mb->nllen == 2))
6818 start_match++;
6819
6820 mb->mark = NULL; /* Reset for start of next match attempt */
6821 } /* End of for(;;) "bumpalong" loop */
6822
6823 /* ==========================================================================*/
6824
6825 /* When we reach here, one of the following stopping conditions is true:
6826
6827 (1) The match succeeded, either completely, or partially;
6828
6829 (2) The pattern is anchored or the match was failed after (*COMMIT);
6830
6831 (3) We are past the end of the subject or the bumpalong limit;
6832
6833 (4) PCRE2_FIRSTLINE is set and we have failed to match at a newline, because
6834 this option requests that a match occur at or before the first newline in
6835 the subject.
6836
6837 (5) Some kind of error occurred.
6838
6839 */
6840
6841 ENDLOOP:
6842
6843 /* Release an enlarged frame vector that is on the heap. */
6844
6845 if (mb->match_frames != mb->stack_frames)
6846 mb->memctl.free(mb->match_frames, mb->memctl.memory_data);
6847
6848 /* Fill in fields that are always returned in the match data. */
6849
6850 match_data->code = re;
6851 match_data->mark = mb->mark;
6852 match_data->matchedby = PCRE2_MATCHEDBY_INTERPRETER;
6853
6854 /* Handle a fully successful match. Set the return code to the number of
6855 captured strings, or 0 if there were too many to fit into the ovector, and then
6856 set the remaining returned values before returning. Make a copy of the subject
6857 string if requested. */
6858
6859 if (rc == MATCH_MATCH)
6860 {
6861 match_data->rc = ((int)mb->end_offset_top >= 2 * match_data->oveccount)?
6862 0 : (int)mb->end_offset_top/2 + 1;
6863 match_data->startchar = start_match - subject;
6864 match_data->leftchar = mb->start_used_ptr - subject;
6865 match_data->rightchar = ((mb->last_used_ptr > mb->end_match_ptr)?
6866 mb->last_used_ptr : mb->end_match_ptr) - subject;
6867 if ((options & PCRE2_COPY_MATCHED_SUBJECT) != 0)
6868 {
6869 length = CU2BYTES(length + was_zero_terminated);
6870 match_data->subject = match_data->memctl.malloc(length,
6871 match_data->memctl.memory_data);
6872 if (match_data->subject == NULL) return PCRE2_ERROR_NOMEMORY;
6873 memcpy((void *)match_data->subject, subject, length);
6874 match_data->flags |= PCRE2_MD_COPIED_SUBJECT;
6875 }
6876 else match_data->subject = subject;
6877 return match_data->rc;
6878 }
6879
6880 /* Control gets here if there has been a partial match, an error, or if the
6881 overall match attempt has failed at all permitted starting positions. Any mark
6882 data is in the nomatch_mark field. */
6883
6884 match_data->mark = mb->nomatch_mark;
6885
6886 /* For anything other than nomatch or partial match, just return the code. */
6887
6888 if (rc != MATCH_NOMATCH && rc != PCRE2_ERROR_PARTIAL) match_data->rc = rc;
6889
6890 /* Handle a partial match. If a "soft" partial match was requested, searching
6891 for a complete match will have continued, and the value of rc at this point
6892 will be MATCH_NOMATCH. For a "hard" partial match, it will already be
6893 PCRE2_ERROR_PARTIAL. */
6894
6895 else if (match_partial != NULL)
6896 {
6897 match_data->subject = subject;
6898 match_data->ovector[0] = match_partial - subject;
6899 match_data->ovector[1] = end_subject - subject;
6900 match_data->startchar = match_partial - subject;
6901 match_data->leftchar = start_partial - subject;
6902 match_data->rightchar = end_subject - subject;
6903 match_data->rc = PCRE2_ERROR_PARTIAL;
6904 }
6905
6906 /* Else this is the classic nomatch case. */
6907
6908 else match_data->rc = PCRE2_ERROR_NOMATCH;
6909
6910 return match_data->rc;
6911 }
6912
6913 /* End of pcre2_match.c */
6914