1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Original API code Copyright (c) 1997-2012 University of Cambridge
10 New API code Copyright (c) 2015-2020 University of Cambridge
11
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
15
16 * Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
18
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
22
23 * Neither the name of the University of Cambridge nor the names of its
24 contributors may be used to endorse or promote products derived from
25 this software without specific prior written permission.
26
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
39 */
40
41
42 #ifdef HAVE_CONFIG_H
43 #include "config.h"
44 #endif
45
46 /* These defines enable debugging code */
47
48 /* #define DEBUG_FRAMES_DISPLAY */
49 /* #define DEBUG_SHOW_OPS */
50 /* #define DEBUG_SHOW_RMATCH */
51
/* <stdarg.h> is needed only by display_frames(), which is variadic and is
compiled only when DEBUG_FRAMES_DISPLAY is defined. The macro tested here must
be spelled exactly like the one that guards that function. */

#ifdef DEBUG_FRAMES_DISPLAY
#include <stdarg.h>
#endif
55
/* These defines identify the name of the block containing "static"
information, and fields within it. NLBLOCK is also used directly in this file
(for example, NLBLOCK->nltype in the OP_ANY code), so it must name a variable
that is in scope wherever it is used; here that is the match block pointer mb.
NOTE(review): these are defined before pcre2_internal.h is included,
presumably because macros in that header (such as IS_NEWLINE) expand to
references to these names - confirm against the header. */

#define NLBLOCK mb             /* Block containing newline information */
#define PSSTART start_subject  /* Field containing processed string start */
#define PSEND   end_subject    /* Field containing processed string end */
62
63 #include "pcre2_internal.h"
64
/* Value of a frame's current_recurse field when no pattern recursion is in
progress. A real recursion value is taken from the lower 16 bits of a group
frame type, so it can never be as large as this. */

#define RECURSE_UNSET 0xffffffffu  /* Bigger than max group number */
66
/* Masks for identifying the public options that are permitted at match time.
PUBLIC_MATCH_OPTIONS is the full set. PUBLIC_JIT_MATCH_OPTIONS contains the
same options except for PCRE2_ANCHORED, PCRE2_ENDANCHORED, and PCRE2_NO_JIT.
NOTE(review): presumably the second mask is the set that can be passed through
to JIT-compiled code - confirm at the point of use. */

#define PUBLIC_MATCH_OPTIONS \
  (PCRE2_ANCHORED|PCRE2_ENDANCHORED|PCRE2_NOTBOL|PCRE2_NOTEOL|PCRE2_NOTEMPTY| \
   PCRE2_NOTEMPTY_ATSTART|PCRE2_NO_UTF_CHECK|PCRE2_PARTIAL_HARD| \
   PCRE2_PARTIAL_SOFT|PCRE2_NO_JIT|PCRE2_COPY_MATCHED_SUBJECT)

#define PUBLIC_JIT_MATCH_OPTIONS \
   (PCRE2_NO_UTF_CHECK|PCRE2_NOTBOL|PCRE2_NOTEOL|PCRE2_NOTEMPTY|\
    PCRE2_NOTEMPTY_ATSTART|PCRE2_PARTIAL_SOFT|PCRE2_PARTIAL_HARD|\
    PCRE2_COPY_MATCHED_SUBJECT)
78
/* Non-error returns from and within the match() function. Error returns are
externally defined PCRE2_ERROR_xxx codes, which are all negative. */

#define MATCH_MATCH        1
#define MATCH_NOMATCH      0

/* Special internal returns used in the match() function. Make them
sufficiently negative to avoid the external error codes. MATCH_ACCEPT is what
the OP_ASSERT_ACCEPT code in match() passes to RRETURN(). */

#define MATCH_ACCEPT       (-999)
#define MATCH_KETRPOS      (-998)
/* The next 5 must be kept together and in sequence so that a test that checks
for any one of them can use a range. MATCH_BACKTRACK_MIN and
MATCH_BACKTRACK_MAX name the two ends of that range, so they must be updated
if a value is added at either end. */
#define MATCH_COMMIT       (-997)
#define MATCH_PRUNE        (-996)
#define MATCH_SKIP         (-995)
#define MATCH_SKIP_ARG     (-994)
#define MATCH_THEN         (-993)
#define MATCH_BACKTRACK_MAX MATCH_THEN
#define MATCH_BACKTRACK_MIN MATCH_COMMIT
99
/* Group frame type values. Zero means the frame is not a group frame. The
lower 16 bits are used for data (e.g. the capture number). Group frames are
used for most groups so that information about the start is easily available at
the end without having to scan back through intermediate frames (backtrack
points). In match(), the data part of a GF_RECURSE frame becomes the frame's
current_recurse value, and the OP_CLOSE code looks for a frame whose type is
exactly (GF_CAPTURE | number). */

#define GF_CAPTURE     0x00010000u
#define GF_NOCAPTURE   0x00020000u
#define GF_CONDASSERT  0x00030000u
#define GF_RECURSE     0x00040000u

/* Masks for the identity and data parts of the group frame type. GF_IDMASK
keeps the upper 16 bits, which hold one of the GF_xxx values above; GF_DATAMASK
keeps the lower 16 bits, which hold the associated data. */

#define GF_IDMASK(a)   ((a) & 0xffff0000u)
#define GF_DATAMASK(a) ((a) & 0x0000ffffu)
115
/* Repetition types. In rep_typ below, REPTYPE_MAX is used for the plain
quantifiers, REPTYPE_MIN for the forms with a trailing '?', and REPTYPE_POS
for the OP_CRPOSxxx opcodes. */

enum {
  REPTYPE_MIN,
  REPTYPE_MAX,
  REPTYPE_POS
};

/* Lower and upper limits for the common repeats, one entry per repeat opcode
in the order shown. An upper limit of UINT32_MAX means "no limit". The two
range opcodes have placeholder entries here, and OP_CRPOSRANGE has no entry at
all. */

static const uint32_t rep_min[] = {
  0,             /* *  */
  0,             /* *? */
  1,             /* +  */
  1,             /* +? */
  0,             /* ?  */
  0,             /* ?? */
  0,             /* OP_CRRANGE (dummy placefiller) */
  0,             /* OP_CRMINRANGE (dummy placefiller) */
  0,             /* OP_CRPOSSTAR */
  1,             /* OP_CRPOSPLUS */
  0              /* OP_CRPOSQUERY */
};

static const uint32_t rep_max[] = {
  UINT32_MAX,    /* *  */
  UINT32_MAX,    /* *? */
  UINT32_MAX,    /* +  */
  UINT32_MAX,    /* +? */
  1,             /* ?  */
  1,             /* ?? */
  0,             /* OP_CRRANGE (dummy placefiller) */
  0,             /* OP_CRMINRANGE (dummy placefiller) */
  UINT32_MAX,    /* OP_CRPOSSTAR */
  UINT32_MAX,    /* OP_CRPOSPLUS */
  1              /* OP_CRPOSQUERY */
};

/* The type of each repeat. Unlike the two tables above, this one must also
include an entry for OP_CRPOSRANGE. */

static const uint32_t rep_typ[] = {
  REPTYPE_MAX,   /* *  */
  REPTYPE_MIN,   /* *? */
  REPTYPE_MAX,   /* +  */
  REPTYPE_MIN,   /* +? */
  REPTYPE_MAX,   /* ?  */
  REPTYPE_MIN,   /* ?? */
  REPTYPE_MAX,   /* OP_CRRANGE */
  REPTYPE_MIN,   /* OP_CRMINRANGE */
  REPTYPE_POS,   /* OP_CRPOSSTAR */
  REPTYPE_POS,   /* OP_CRPOSPLUS */
  REPTYPE_POS,   /* OP_CRPOSQUERY */
  REPTYPE_POS    /* OP_CRPOSRANGE */
};
146
/* Numbers for RMATCH calls at backtracking points. When these lists are
changed, the code at RETURN_SWITCH below must be updated in sync. Each name is
given as the second argument of RMATCH(), which stores its value in the current
frame's return_id field and also pastes the name into a label (L_RMxx) that
marks where processing resumes; a given name can therefore appear in only one
RMATCH() call. The RM1xx list exists only when SUPPORT_WIDE_CHARS is defined
and the RM2xx list only when SUPPORT_UNICODE is defined. */

enum { RM1=1, RM2,  RM3,  RM4,  RM5,  RM6,  RM7,  RM8,  RM9,  RM10,
       RM11,  RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
       RM21,  RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
       RM31,  RM32, RM33, RM34, RM35, RM36 };

#ifdef SUPPORT_WIDE_CHARS
enum { RM100=100, RM101 };
#endif

#ifdef SUPPORT_UNICODE
enum { RM200=200, RM201, RM202, RM203, RM204, RM205, RM206, RM207,
       RM208,     RM209, RM210, RM211, RM212, RM213, RM214, RM215,
       RM216,     RM217, RM218, RM219, RM220, RM221, RM222 };
#endif
164
/* Define short names for general fields in the current backtrack frame, which
is always pointed to by the F variable. Occasional references to fields in
other frames are written out explicitly. There are also some fields in the
current frame whose names start with "temp" that are used for short-term,
localised backtracking memory. These are #defined with Lxxx names at the point
of use and undefined afterwards.

Each macro below expands to a member access through F, so it can be used only
where a frame pointer with exactly that name is in scope (a parameter in
do_callout() and match_ref(), a local variable in match()). Because F is
re-evaluated at every use, the macros follow F whenever it is changed to point
to a different frame. */

#define Fback_frame        F->back_frame
#define Fcapture_last      F->capture_last
#define Fcurrent_recurse   F->current_recurse
#define Fecode             F->ecode
#define Feptr              F->eptr
#define Fgroup_frame_type  F->group_frame_type
#define Flast_group_offset F->last_group_offset
#define Flength            F->length
#define Fmark              F->mark
#define Frdepth            F->rdepth
#define Fstart_match       F->start_match
#define Foffset_top        F->offset_top
#define Foccu              F->occu
#define Fop                F->op
#define Fovector           F->ovector
#define Freturn_id         F->return_id
188
189
#ifdef DEBUG_FRAMES_DISPLAY
/*************************************************
*      Display current frames and contents       *
*************************************************/

/* This debugging function displays the current set of frames and their
contents. It is not called automatically from anywhere, the intention being
that calls can be inserted where necessary when debugging frame-related
problems. It is compiled only when DEBUG_FRAMES_DISPLAY is defined, and it
needs <stdarg.h> because the identification text is a printf-style format
followed by its arguments.

Every value passed to fprintf() is cast to the exact type that its conversion
specifier requires. PCRE2_SIZE values and pointer differences are not
necessarily the same type as unsigned long, and passing an argument of the
wrong type to a conversion is undefined behaviour.

Arguments:
  f           the file to write to
  F           the current top frame
  P           a previous frame of interest, or NULL if there is none
  frame_size  the frame size
  mb          points to the match block
  s           identification text (a format string for the arguments after it)

Returns:      nothing
*/

static void
display_frames(FILE *f, heapframe *F, heapframe *P, PCRE2_SIZE frame_size,
  match_block *mb, const char *s, ...)
{
uint32_t i;
heapframe *Q;
va_list ap;
va_start(ap, s);

fprintf(f, "FRAMES ");
vfprintf(f, s, ap);
va_end(ap);

/* P is shown as a frame index, that is, its byte offset from the start of the
frame vector divided by the frame size. */

if (P != NULL) fprintf(f, " P=%lu",
  (unsigned long)(((char *)P - (char *)(mb->match_frames))/frame_size));
fprintf(f, "\n");

/* Frames are frame_size bytes apart, so step through the vector using byte
arithmetic, up to and including the current frame. */

for (i = 0, Q = mb->match_frames;
     Q <= F;
     i++, Q = (heapframe *)((char *)Q + frame_size))
  {
  fprintf(f, "Frame %d type=%x subj=%lu code=%d back=%lu id=%d",
    (int)i,
    (unsigned int)Q->group_frame_type,
    (unsigned long)(Q->eptr - mb->start_subject),
    (int)*(Q->ecode),
    (unsigned long)Q->back_frame,
    (int)Q->return_id);

  if (Q->last_group_offset == PCRE2_UNSET)
    fprintf(f, " lgoffset=unset\n");
  else
    fprintf(f, " lgoffset=%lu\n",
      (unsigned long)(Q->last_group_offset/frame_size));
  }
}

#endif
244
245
246
247 /*************************************************
248 * Process a callout *
249 *************************************************/
250
251 /* This function is called for all callouts, whether "standalone" or at the
252 start of a conditional group. Feptr will be pointing to either OP_CALLOUT or
253 OP_CALLOUT_STR. A callout block is allocated in pcre2_match() and initialized
254 with fixed values.
255
256 Arguments:
257 F points to the current backtracking frame
258 mb points to the match block
259 lengthptr where to return the length of the callout item
260
261 Returns: the return from the callout
262 or 0 if no callout function exists
263 */
264
265 static int
do_callout(heapframe * F,match_block * mb,PCRE2_SIZE * lengthptr)266 do_callout(heapframe *F, match_block *mb, PCRE2_SIZE *lengthptr)
267 {
268 int rc;
269 PCRE2_SIZE save0, save1;
270 PCRE2_SIZE *callout_ovector;
271 pcre2_callout_block *cb;
272
273 *lengthptr = (*Fecode == OP_CALLOUT)?
274 PRIV(OP_lengths)[OP_CALLOUT] : GET(Fecode, 1 + 2*LINK_SIZE);
275
276 if (mb->callout == NULL) return 0; /* No callout function provided */
277
278 /* The original matching code (pre 10.30) worked directly with the ovector
279 passed by the user, and this was passed to callouts. Now that the working
280 ovector is in the backtracking frame, it no longer needs to reserve space for
281 the overall match offsets (which would waste space in the frame). For backward
282 compatibility, however, we pass capture_top and offset_vector to the callout as
283 if for the extended ovector, and we ensure that the first two slots are unset
284 by preserving and restoring their current contents. Picky compilers complain if
285 references such as Fovector[-2] are use directly, so we set up a separate
286 pointer. */
287
288 callout_ovector = (PCRE2_SIZE *)(Fovector) - 2;
289
290 /* The cb->version, cb->subject, cb->subject_length, and cb->start_match fields
291 are set externally. The first 3 never change; the last is updated for each
292 bumpalong. */
293
294 cb = mb->cb;
295 cb->capture_top = (uint32_t)Foffset_top/2 + 1;
296 cb->capture_last = Fcapture_last;
297 cb->offset_vector = callout_ovector;
298 cb->mark = mb->nomatch_mark;
299 cb->current_position = (PCRE2_SIZE)(Feptr - mb->start_subject);
300 cb->pattern_position = GET(Fecode, 1);
301 cb->next_item_length = GET(Fecode, 1 + LINK_SIZE);
302
303 if (*Fecode == OP_CALLOUT) /* Numerical callout */
304 {
305 cb->callout_number = Fecode[1 + 2*LINK_SIZE];
306 cb->callout_string_offset = 0;
307 cb->callout_string = NULL;
308 cb->callout_string_length = 0;
309 }
310 else /* String callout */
311 {
312 cb->callout_number = 0;
313 cb->callout_string_offset = GET(Fecode, 1 + 3*LINK_SIZE);
314 cb->callout_string = Fecode + (1 + 4*LINK_SIZE) + 1;
315 cb->callout_string_length =
316 *lengthptr - (1 + 4*LINK_SIZE) - 2;
317 }
318
319 save0 = callout_ovector[0];
320 save1 = callout_ovector[1];
321 callout_ovector[0] = callout_ovector[1] = PCRE2_UNSET;
322 rc = mb->callout(cb, mb->callout_data);
323 callout_ovector[0] = save0;
324 callout_ovector[1] = save1;
325 cb->callout_flags = 0;
326 return rc;
327 }
328
329
330
331 /*************************************************
332 * Match a back-reference *
333 *************************************************/
334
335 /* This function is called only when it is known that the offset lies within
336 the offsets that have so far been used in the match. Note that in caseless
337 UTF-8 mode, the number of subject bytes matched may be different to the number
338 of reference bytes. (In theory this could also happen in UTF-16 mode, but it
339 seems unlikely.)
340
341 Arguments:
342 offset index into the offset vector
343 caseless TRUE if caseless
344 F the current backtracking frame pointer
345 mb points to match block
346 lengthptr pointer for returning the length matched
347
348 Returns: = 0 sucessful match; number of code units matched is set
349 < 0 no match
350 > 0 partial match
351 */
352
353 static int
match_ref(PCRE2_SIZE offset,BOOL caseless,heapframe * F,match_block * mb,PCRE2_SIZE * lengthptr)354 match_ref(PCRE2_SIZE offset, BOOL caseless, heapframe *F, match_block *mb,
355 PCRE2_SIZE *lengthptr)
356 {
357 PCRE2_SPTR p;
358 PCRE2_SIZE length;
359 PCRE2_SPTR eptr;
360 PCRE2_SPTR eptr_start;
361
362 /* Deal with an unset group. The default is no match, but there is an option to
363 match an empty string. */
364
365 if (offset >= Foffset_top || Fovector[offset] == PCRE2_UNSET)
366 {
367 if ((mb->poptions & PCRE2_MATCH_UNSET_BACKREF) != 0)
368 {
369 *lengthptr = 0;
370 return 0; /* Match */
371 }
372 else return -1; /* No match */
373 }
374
375 /* Separate the caseless and UTF cases for speed. */
376
377 eptr = eptr_start = Feptr;
378 p = mb->start_subject + Fovector[offset];
379 length = Fovector[offset+1] - Fovector[offset];
380
381 if (caseless)
382 {
383 #if defined SUPPORT_UNICODE
384 BOOL utf = (mb->poptions & PCRE2_UTF) != 0;
385
386 if (utf || (mb->poptions & PCRE2_UCP) != 0)
387 {
388 PCRE2_SPTR endptr = p + length;
389
390 /* Match characters up to the end of the reference. NOTE: the number of
391 code units matched may differ, because in UTF-8 there are some characters
392 whose upper and lower case codes have different numbers of bytes. For
393 example, U+023A (2 bytes in UTF-8) is the upper case version of U+2C65 (3
394 bytes in UTF-8); a sequence of 3 of the former uses 6 bytes, as does a
395 sequence of two of the latter. It is important, therefore, to check the
396 length along the reference, not along the subject (earlier code did this
397 wrong). UCP without uses Unicode properties but without UTF encoding. */
398
399 while (p < endptr)
400 {
401 uint32_t c, d;
402 const ucd_record *ur;
403 if (eptr >= mb->end_subject) return 1; /* Partial match */
404
405 if (utf)
406 {
407 GETCHARINC(c, eptr);
408 GETCHARINC(d, p);
409 }
410 else
411 {
412 c = *eptr++;
413 d = *p++;
414 }
415
416 ur = GET_UCD(d);
417 if (c != d && c != (uint32_t)((int)d + ur->other_case))
418 {
419 const uint32_t *pp = PRIV(ucd_caseless_sets) + ur->caseset;
420 for (;;)
421 {
422 if (c < *pp) return -1; /* No match */
423 if (c == *pp++) break;
424 }
425 }
426 }
427 }
428 else
429 #endif
430
431 /* Not in UTF or UCP mode */
432 {
433 for (; length > 0; length--)
434 {
435 uint32_t cc, cp;
436 if (eptr >= mb->end_subject) return 1; /* Partial match */
437 cc = UCHAR21TEST(eptr);
438 cp = UCHAR21TEST(p);
439 if (TABLE_GET(cp, mb->lcc, cp) != TABLE_GET(cc, mb->lcc, cc))
440 return -1; /* No match */
441 p++;
442 eptr++;
443 }
444 }
445 }
446
447 /* In the caseful case, we can just compare the code units, whether or not we
448 are in UTF and/or UCP mode. When partial matching, we have to do this unit by
449 unit. */
450
451 else
452 {
453 if (mb->partial != 0)
454 {
455 for (; length > 0; length--)
456 {
457 if (eptr >= mb->end_subject) return 1; /* Partial match */
458 if (UCHAR21INCTEST(p) != UCHAR21INCTEST(eptr)) return -1; /* No match */
459 }
460 }
461
462 /* Not partial matching */
463
464 else
465 {
466 if ((PCRE2_SIZE)(mb->end_subject - eptr) < length) return 1; /* Partial */
467 if (memcmp(p, eptr, CU2BYTES(length)) != 0) return -1; /* No match */
468 eptr += length;
469 }
470 }
471
472 *lengthptr = eptr - eptr_start;
473 return 0; /* Match */
474 }
475
476
477
478 /******************************************************************************
479 *******************************************************************************
480 "Recursion" in the match() function
481
482 The original match() function was highly recursive, but this proved to be the
483 source of a number of problems over the years, mostly because of the relatively
484 small system stacks that are commonly found. As new features were added to
485 patterns, various kludges were invented to reduce the amount of stack used,
486 making the code hard to understand in places.
487
488 A version did exist that used individual frames on the heap instead of calling
489 match() recursively, but this ran substantially slower. The current version is
490 a refactoring that uses a vector of frames to remember backtracking points.
491 This runs no slower, and possibly even a bit faster than the original recursive
492 implementation. An initial vector of size START_FRAMES_SIZE (enough for maybe
493 50 frames) is allocated on the system stack. If this is not big enough, the
494 heap is used for a larger vector.
495
496 *******************************************************************************
497 ******************************************************************************/
498
499
500
501
502 /*************************************************
503 * Macros for the match() function *
504 *************************************************/
505
/* These macros pack up tests that are used for partial matching several times
in the code. The second one is used when we already know we are past the end of
the subject. We set the "hit end" flag if the pointer is at the end of the
subject and either (a) the pointer is past the earliest inspected character
(i.e. something has been matched, even if not part of the actual matched
string), or (b) the pattern contains a lookbehind. These are the conditions for
which adding more characters may allow the current match to continue.

For hard partial matching, we immediately return a partial match. Otherwise,
carrying on means that a complete match on the current subject will be sought.
A partial match is returned only if no complete match can be found.

Both macros refer to the names Feptr (and therefore F) and mb, which must be in
scope where they are used, and both can execute a "return" from the enclosing
function. Each expands to a bare "if" statement with no "else", so neither
should be used as the unbraced body of an "if" that has its own "else".
NOTE(review): condition (b) is tested by means of mb->allowemptypartial;
presumably that flag is set when the pattern contains a lookbehind - confirm
where it is set. */

/* Do the partial matching check only if the subject pointer has reached the
end of the subject. */

#define CHECK_PARTIAL()\
  if (Feptr >= mb->end_subject) \
    { \
    SCHECK_PARTIAL(); \
    }

/* The check itself: record that the end was hit, and return at once with
PCRE2_ERROR_PARTIAL when mb->partial is greater than 1. */

#define SCHECK_PARTIAL()\
  if (mb->partial != 0 && \
      (Feptr > mb->start_used_ptr || mb->allowemptypartial)) \
    { \
    mb->hitend = TRUE; \
    if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; \
    }
531
532
/* These macros are used to implement backtracking. They simulate a recursive
call to the match() function by means of a local vector of frames which
remember the backtracking points. */

/* RMATCH(ra,rb) starts a simulated recursive call: ra is the code pointer at
which the new frame is to start and rb is one of the RMxx numbers. The number
is saved in the current frame's return_id field before jumping to
MATCH_RECURSE, and the label L_<rb> that is defined here marks the place where
processing continues afterwards. Because it defines a label, a given rb can be
used in only one RMATCH() call within the function. */

#define RMATCH(ra,rb)\
  {\
  start_ecode = ra;\
  Freturn_id = rb;\
  goto MATCH_RECURSE;\
  L_##rb:;\
  }

/* RRETURN(ra) ends a simulated call: the value is put into rrc and control
passes to RETURN_SWITCH. */

#define RRETURN(ra)\
  {\
  rrc = ra;\
  goto RETURN_SWITCH;\
  }
550
551
552
553 /*************************************************
554 * Match from current position *
555 *************************************************/
556
557 /* This function is called to run one match attempt at a single starting point
558 in the subject.
559
560 Performance note: It might be tempting to extract commonly used fields from the
561 mb structure (e.g. end_subject) into individual variables to improve
562 performance. Tests using gcc on a SPARC disproved this; in the first case, it
563 made performance worse.
564
565 Arguments:
566 start_eptr starting character in subject
567 start_ecode starting position in compiled code
568 ovector pointer to the final output vector
569 oveccount number of pairs in ovector
570 top_bracket number of capturing parentheses in the pattern
571 frame_size size of each backtracking frame
572 mb pointer to "static" variables block
573
574 Returns: MATCH_MATCH if matched ) these values are >= 0
575 MATCH_NOMATCH if failed to match )
576 negative MATCH_xxx value for PRUNE, SKIP, etc
577 negative PCRE2_ERROR_xxx value if aborted by an error condition
578 (e.g. stopped by repeated call or depth limit)
579 */
580
581 static int
match(PCRE2_SPTR start_eptr,PCRE2_SPTR start_ecode,PCRE2_SIZE * ovector,uint16_t oveccount,uint16_t top_bracket,PCRE2_SIZE frame_size,match_block * mb)582 match(PCRE2_SPTR start_eptr, PCRE2_SPTR start_ecode, PCRE2_SIZE *ovector,
583 uint16_t oveccount, uint16_t top_bracket, PCRE2_SIZE frame_size,
584 match_block *mb)
585 {
586 /* Frame-handling variables */
587
588 heapframe *F; /* Current frame pointer */
589 heapframe *N = NULL; /* Temporary frame pointers */
590 heapframe *P = NULL;
591 heapframe *assert_accept_frame = NULL; /* For passing back a frame with captures */
592 PCRE2_SIZE frame_copy_size; /* Amount to copy when creating a new frame */
593
/* Local variables that do not need to be preserved over calls to RMATCH(). */
595
596 PCRE2_SPTR bracode; /* Temp pointer to start of group */
597 PCRE2_SIZE offset; /* Used for group offsets */
598 PCRE2_SIZE length; /* Used for various length calculations */
599
600 int rrc; /* Return from functions & backtracking "recursions" */
601 #ifdef SUPPORT_UNICODE
602 int proptype; /* Type of character property */
603 #endif
604
605 uint32_t i; /* Used for local loops */
606 uint32_t fc; /* Character values */
607 uint32_t number; /* Used for group and other numbers */
608 uint32_t reptype = 0; /* Type of repetition (0 to avoid compiler warning) */
609 uint32_t group_frame_type; /* Specifies type for new group frames */
610
611 BOOL condition; /* Used in conditional groups */
612 BOOL cur_is_word; /* Used in "word" tests */
613 BOOL prev_is_word; /* Used in "word" tests */
614
615 /* UTF and UCP flags */
616
617 #ifdef SUPPORT_UNICODE
618 BOOL utf = (mb->poptions & PCRE2_UTF) != 0;
619 BOOL ucp = (mb->poptions & PCRE2_UCP) != 0;
620 #else
621 BOOL utf = FALSE; /* Required for convenience even when no Unicode support */
622 #endif
623
624 /* This is the length of the last part of a backtracking frame that must be
625 copied when a new frame is created. */
626
627 frame_copy_size = frame_size - offsetof(heapframe, eptr);
628
629 /* Set up the first current frame at the start of the vector, and initialize
630 fields that are not reset for new frames. */
631
632 F = mb->match_frames;
633 Frdepth = 0; /* "Recursion" depth */
634 Fcapture_last = 0; /* Number of most recent capture */
635 Fcurrent_recurse = RECURSE_UNSET; /* Not pattern recursing. */
636 Fstart_match = Feptr = start_eptr; /* Current data pointer and start match */
637 Fmark = NULL; /* Most recent mark */
638 Foffset_top = 0; /* End of captures within the frame */
639 Flast_group_offset = PCRE2_UNSET; /* Saved frame of most recent group */
640 group_frame_type = 0; /* Not a start of group frame */
641 goto NEW_FRAME; /* Start processing with this frame */
642
643 /* Come back here when we want to create a new frame for remembering a
644 backtracking point. */
645
646 MATCH_RECURSE:
647
648 /* Set up a new backtracking frame. If the vector is full, get a new one
649 on the heap, doubling the size, but constrained by the heap limit. */
650
651 N = (heapframe *)((char *)F + frame_size);
652 if (N >= mb->match_frames_top)
653 {
654 PCRE2_SIZE newsize = mb->frame_vector_size * 2;
655 heapframe *new;
656
657 if ((newsize / 1024) > mb->heap_limit)
658 {
659 PCRE2_SIZE maxsize = ((mb->heap_limit * 1024)/frame_size) * frame_size;
660 if (mb->frame_vector_size >= maxsize) return PCRE2_ERROR_HEAPLIMIT;
661 newsize = maxsize;
662 }
663
664 new = mb->memctl.malloc(newsize, mb->memctl.memory_data);
665 if (new == NULL) return PCRE2_ERROR_NOMEMORY;
666 memcpy(new, mb->match_frames, mb->frame_vector_size);
667
668 F = (heapframe *)((char *)new + ((char *)F - (char *)mb->match_frames));
669 N = (heapframe *)((char *)F + frame_size);
670
671 if (mb->match_frames != mb->stack_frames)
672 mb->memctl.free(mb->match_frames, mb->memctl.memory_data);
673 mb->match_frames = new;
674 mb->match_frames_top = (heapframe *)((char *)mb->match_frames + newsize);
675 mb->frame_vector_size = newsize;
676 }
677
678 #ifdef DEBUG_SHOW_RMATCH
679 fprintf(stderr, "++ RMATCH %2d frame=%d", Freturn_id, Frdepth + 1);
680 if (group_frame_type != 0)
681 {
682 fprintf(stderr, " type=%x ", group_frame_type);
683 switch (GF_IDMASK(group_frame_type))
684 {
685 case GF_CAPTURE:
686 fprintf(stderr, "capture=%d", GF_DATAMASK(group_frame_type));
687 break;
688
689 case GF_NOCAPTURE:
690 fprintf(stderr, "nocapture op=%d", GF_DATAMASK(group_frame_type));
691 break;
692
693 case GF_CONDASSERT:
694 fprintf(stderr, "condassert op=%d", GF_DATAMASK(group_frame_type));
695 break;
696
697 case GF_RECURSE:
698 fprintf(stderr, "recurse=%d", GF_DATAMASK(group_frame_type));
699 break;
700
701 default:
702 fprintf(stderr, "*** unknown ***");
703 break;
704 }
705 }
706 fprintf(stderr, "\n");
707 #endif
708
709 /* Copy those fields that must be copied into the new frame, increase the
710 "recursion" depth (i.e. the new frame's index) and then make the new frame
711 current. */
712
713 memcpy((char *)N + offsetof(heapframe, eptr),
714 (char *)F + offsetof(heapframe, eptr),
715 frame_copy_size);
716
717 N->rdepth = Frdepth + 1;
718 F = N;
719
720 /* Carry on processing with a new frame. */
721
722 NEW_FRAME:
723 Fgroup_frame_type = group_frame_type;
724 Fecode = start_ecode; /* Starting code pointer */
725 Fback_frame = frame_size; /* Default is go back one frame */
726
727 /* If this is a special type of group frame, remember its offset for quick
728 access at the end of the group. If this is a recursion, set a new current
729 recursion value. */
730
731 if (group_frame_type != 0)
732 {
733 Flast_group_offset = (char *)F - (char *)mb->match_frames;
734 if (GF_IDMASK(group_frame_type) == GF_RECURSE)
735 Fcurrent_recurse = GF_DATAMASK(group_frame_type);
736 group_frame_type = 0;
737 }
738
739
740 /* ========================================================================= */
741 /* This is the main processing loop. First check that we haven't recorded too
742 many backtracks (search tree is too large), or that we haven't exceeded the
743 recursive depth limit (used too many backtracking frames). If not, process the
744 opcodes. */
745
746 if (mb->match_call_count++ >= mb->match_limit) return PCRE2_ERROR_MATCHLIMIT;
747 if (Frdepth >= mb->match_limit_depth) return PCRE2_ERROR_DEPTHLIMIT;
748
749 for (;;)
750 {
751 #ifdef DEBUG_SHOW_OPS
752 fprintf(stderr, "++ op=%d\n", *Fecode);
753 #endif
754
755 Fop = (uint8_t)(*Fecode); /* Cast needed for 16-bit and 32-bit modes */
756 switch(Fop)
757 {
758 /* ===================================================================== */
759 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes, to close
760 any currently open capturing brackets. Unlike reaching the end of a group,
761 where we know the starting frame is at the top of the chained frames, in
762 this case we have to search back for the relevant frame in case other types
763 of group that use chained frames have intervened. Multiple OP_CLOSEs always
764 come innermost first, which matches the chain order. We can ignore this in
765 a recursion, because captures are not passed out of recursions. */
766
767 case OP_CLOSE:
768 if (Fcurrent_recurse == RECURSE_UNSET)
769 {
770 number = GET2(Fecode, 1);
771 offset = Flast_group_offset;
772 for(;;)
773 {
774 if (offset == PCRE2_UNSET) return PCRE2_ERROR_INTERNAL;
775 N = (heapframe *)((char *)mb->match_frames + offset);
776 P = (heapframe *)((char *)N - frame_size);
777 if (N->group_frame_type == (GF_CAPTURE | number)) break;
778 offset = P->last_group_offset;
779 }
780 offset = (number << 1) - 2;
781 Fcapture_last = number;
782 Fovector[offset] = P->eptr - mb->start_subject;
783 Fovector[offset+1] = Feptr - mb->start_subject;
784 if (offset >= Foffset_top) Foffset_top = offset + 2;
785 }
786 Fecode += PRIV(OP_lengths)[*Fecode];
787 break;
788
789
790 /* ===================================================================== */
791 /* Real or forced end of the pattern, assertion, or recursion. In an
792 assertion ACCEPT, update the last used pointer and remember the current
793 frame so that the captures and mark can be fished out of it. */
794
795 case OP_ASSERT_ACCEPT:
796 if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr;
797 assert_accept_frame = F;
798 RRETURN(MATCH_ACCEPT);
799
800 /* If recursing, we have to find the most recent recursion. */
801
802 case OP_ACCEPT:
803 case OP_END:
804
805 /* Handle end of a recursion. */
806
807 if (Fcurrent_recurse != RECURSE_UNSET)
808 {
809 offset = Flast_group_offset;
810 for(;;)
811 {
812 if (offset == PCRE2_UNSET) return PCRE2_ERROR_INTERNAL;
813 N = (heapframe *)((char *)mb->match_frames + offset);
814 P = (heapframe *)((char *)N - frame_size);
815 if (GF_IDMASK(N->group_frame_type) == GF_RECURSE) break;
816 offset = P->last_group_offset;
817 }
818
819 /* N is now the frame of the recursion; the previous frame is at the
820 OP_RECURSE position. Go back there, copying the current subject position
821 and mark, and move on past the OP_RECURSE. */
822
823 P->eptr = Feptr;
824 P->mark = Fmark;
825 F = P;
826 Fecode += 1 + LINK_SIZE;
827 continue;
828 }
829
830 /* Not a recursion. Fail for an empty string match if either PCRE2_NOTEMPTY
831 is set, or if PCRE2_NOTEMPTY_ATSTART is set and we have matched at the
832 start of the subject. In both cases, backtracking will then try other
833 alternatives, if any. */
834
835 if (Feptr == Fstart_match &&
836 ((mb->moptions & PCRE2_NOTEMPTY) != 0 ||
837 ((mb->moptions & PCRE2_NOTEMPTY_ATSTART) != 0 &&
838 Fstart_match == mb->start_subject + mb->start_offset)))
839 RRETURN(MATCH_NOMATCH);
840
841 /* Also fail if PCRE2_ENDANCHORED is set and the end of the match is not
842 the end of the subject. After (*ACCEPT) we fail the entire match (at this
843 position) but backtrack on reaching the end of the pattern. */
844
845 if (Feptr < mb->end_subject &&
846 ((mb->moptions | mb->poptions) & PCRE2_ENDANCHORED) != 0)
847 {
848 if (Fop == OP_END) RRETURN(MATCH_NOMATCH);
849 return MATCH_NOMATCH;
850 }
851
852 /* We have a successful match of the whole pattern. Record the result and
853 then do a direct return from the function. If there is space in the offset
854 vector, set any pairs that follow the highest-numbered captured string but
855 are less than the number of capturing groups in the pattern to PCRE2_UNSET.
856 It is documented that this happens. "Gaps" are set to PCRE2_UNSET
857 dynamically. It is only those at the end that need setting here. */
858
859 mb->end_match_ptr = Feptr; /* Record where we ended */
860 mb->end_offset_top = Foffset_top; /* and how many extracts were taken */
861 mb->mark = Fmark; /* and the last success mark */
862 if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr;
863
864 ovector[0] = Fstart_match - mb->start_subject;
865 ovector[1] = Feptr - mb->start_subject;
866
867 /* Set i to the smaller of the sizes of the external and frame ovectors. */
868
869 i = 2 * ((top_bracket + 1 > oveccount)? oveccount : top_bracket + 1);
870 memcpy(ovector + 2, Fovector, (i - 2) * sizeof(PCRE2_SIZE));
871 while (--i >= Foffset_top + 2) ovector[i] = PCRE2_UNSET;
872 return MATCH_MATCH; /* Note: NOT RRETURN */
873
874
875 /*===================================================================== */
876 /* Match any single character type except newline; have to take care with
877 CRLF newlines and partial matching. */
878
879 case OP_ANY:
880 if (IS_NEWLINE(Feptr)) RRETURN(MATCH_NOMATCH);
881 if (mb->partial != 0 &&
882 Feptr == mb->end_subject - 1 &&
883 NLBLOCK->nltype == NLTYPE_FIXED &&
884 NLBLOCK->nllen == 2 &&
885 UCHAR21TEST(Feptr) == NLBLOCK->nl[0])
886 {
887 mb->hitend = TRUE;
888 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
889 }
890 /* Fall through */
891
892 /* Match any single character whatsoever. */
893
894 case OP_ALLANY:
895 if (Feptr >= mb->end_subject) /* DO NOT merge the Feptr++ here; it must */
896 { /* not be updated before SCHECK_PARTIAL. */
897 SCHECK_PARTIAL();
898 RRETURN(MATCH_NOMATCH);
899 }
900 Feptr++;
901 #ifdef SUPPORT_UNICODE
902 if (utf) ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++);
903 #endif
904 Fecode++;
905 break;
906
907
908 /* ===================================================================== */
909 /* Match a single code unit, even in UTF mode. This opcode really does
910 match any code unit, even newline. (It really should be called ANYCODEUNIT,
911 of course - the byte name is from pre-16 bit days.) */
912
913 case OP_ANYBYTE:
914 if (Feptr >= mb->end_subject) /* DO NOT merge the Feptr++ here; it must */
915 { /* not be updated before SCHECK_PARTIAL. */
916 SCHECK_PARTIAL();
917 RRETURN(MATCH_NOMATCH);
918 }
919 Feptr++;
920 Fecode++;
921 break;
922
923
924 /* ===================================================================== */
925 /* Match a single character, casefully */
926
927 case OP_CHAR:
928 #ifdef SUPPORT_UNICODE
929 if (utf)
930 {
931 Flength = 1;
932 Fecode++;
933 GETCHARLEN(fc, Fecode, Flength);
934 if (Flength > (PCRE2_SIZE)(mb->end_subject - Feptr))
935 {
936 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
937 RRETURN(MATCH_NOMATCH);
938 }
939 for (; Flength > 0; Flength--)
940 {
941 if (*Fecode++ != UCHAR21INC(Feptr)) RRETURN(MATCH_NOMATCH);
942 }
943 }
944 else
945 #endif
946
947 /* Not UTF mode */
948 {
949 if (mb->end_subject - Feptr < 1)
950 {
951 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
952 RRETURN(MATCH_NOMATCH);
953 }
954 if (Fecode[1] != *Feptr++) RRETURN(MATCH_NOMATCH);
955 Fecode += 2;
956 }
957 break;
958
959
960 /* ===================================================================== */
961 /* Match a single character, caselessly. If we are at the end of the
962 subject, give up immediately. We get here only when the pattern character
963 has at most one other case. Characters with more than two cases are coded
964 as OP_PROP with the pseudo-property PT_CLIST. */
965
966 case OP_CHARI:
967 if (Feptr >= mb->end_subject)
968 {
969 SCHECK_PARTIAL();
970 RRETURN(MATCH_NOMATCH);
971 }
972
973 #ifdef SUPPORT_UNICODE
974 if (utf)
975 {
976 Flength = 1;
977 Fecode++;
978 GETCHARLEN(fc, Fecode, Flength);
979
980 /* If the pattern character's value is < 128, we know that its other case
981 (if any) is also < 128 (and therefore only one code unit long in all
982 code-unit widths), so we can use the fast lookup table. We checked above
983 that there is at least one character left in the subject. */
984
985 if (fc < 128)
986 {
987 uint32_t cc = UCHAR21(Feptr);
988 if (mb->lcc[fc] != TABLE_GET(cc, mb->lcc, cc)) RRETURN(MATCH_NOMATCH);
989 Fecode++;
990 Feptr++;
991 }
992
993 /* Otherwise we must pick up the subject character and use Unicode
994 property support to test its other case. Note that we cannot use the
995 value of "Flength" to check for sufficient bytes left, because the other
996 case of the character may have more or fewer code units. */
997
998 else
999 {
1000 uint32_t dc;
1001 GETCHARINC(dc, Feptr);
1002 Fecode += Flength;
1003 if (dc != fc && dc != UCD_OTHERCASE(fc)) RRETURN(MATCH_NOMATCH);
1004 }
1005 }
1006
1007 /* If UCP is set without UTF we must do the same as above, but with one
1008 character per code unit. */
1009
1010 else if (ucp)
1011 {
1012 uint32_t cc = UCHAR21(Feptr);
1013 fc = Fecode[1];
1014 if (fc < 128)
1015 {
1016 if (mb->lcc[fc] != TABLE_GET(cc, mb->lcc, cc)) RRETURN(MATCH_NOMATCH);
1017 }
1018 else
1019 {
1020 if (cc != fc && cc != UCD_OTHERCASE(fc)) RRETURN(MATCH_NOMATCH);
1021 }
1022 Feptr++;
1023 Fecode += 2;
1024 }
1025
1026 else
1027 #endif /* SUPPORT_UNICODE */
1028
1029 /* Not UTF or UCP mode; use the table for characters < 256. */
1030 {
1031 if (TABLE_GET(Fecode[1], mb->lcc, Fecode[1])
1032 != TABLE_GET(*Feptr, mb->lcc, *Feptr)) RRETURN(MATCH_NOMATCH);
1033 Feptr++;
1034 Fecode += 2;
1035 }
1036 break;
1037
1038
1039 /* ===================================================================== */
1040 /* Match not a single character. */
1041
1042 case OP_NOT:
1043 case OP_NOTI:
1044 if (Feptr >= mb->end_subject)
1045 {
1046 SCHECK_PARTIAL();
1047 RRETURN(MATCH_NOMATCH);
1048 }
1049
1050 #ifdef SUPPORT_UNICODE
1051 if (utf)
1052 {
1053 uint32_t ch;
1054 Fecode++;
1055 GETCHARINC(ch, Fecode);
1056 GETCHARINC(fc, Feptr);
1057 if (ch == fc)
1058 {
1059 RRETURN(MATCH_NOMATCH); /* Caseful match */
1060 }
1061 else if (Fop == OP_NOTI) /* If caseless */
1062 {
1063 if (ch > 127)
1064 ch = UCD_OTHERCASE(ch);
1065 else
1066 ch = (mb->fcc)[ch];
1067 if (ch == fc) RRETURN(MATCH_NOMATCH);
1068 }
1069 }
1070
1071 /* UCP without UTF is as above, but with one character per code unit. */
1072
1073 else if (ucp)
1074 {
1075 uint32_t ch;
1076 fc = UCHAR21INC(Feptr);
1077 ch = Fecode[1];
1078 Fecode += 2;
1079
1080 if (ch == fc)
1081 {
1082 RRETURN(MATCH_NOMATCH); /* Caseful match */
1083 }
1084 else if (Fop == OP_NOTI) /* If caseless */
1085 {
1086 if (ch > 127)
1087 ch = UCD_OTHERCASE(ch);
1088 else
1089 ch = (mb->fcc)[ch];
1090 if (ch == fc) RRETURN(MATCH_NOMATCH);
1091 }
1092 }
1093
1094 else
1095 #endif /* SUPPORT_UNICODE */
1096
1097 /* Neither UTF nor UCP is set */
1098
1099 {
1100 uint32_t ch = Fecode[1];
1101 fc = UCHAR21INC(Feptr);
1102 if (ch == fc || (Fop == OP_NOTI && TABLE_GET(ch, mb->fcc, ch) == fc))
1103 RRETURN(MATCH_NOMATCH);
1104 Fecode += 2;
1105 }
1106 break;
1107
1108
1109 /* ===================================================================== */
1110 /* Match a single character repeatedly. */
1111
1112 #define Loclength F->temp_size
1113 #define Lstart_eptr F->temp_sptr[0]
1114 #define Lcharptr F->temp_sptr[1]
1115 #define Lmin F->temp_32[0]
1116 #define Lmax F->temp_32[1]
1117 #define Lc F->temp_32[2]
1118 #define Loc F->temp_32[3]
1119
1120 case OP_EXACT:
1121 case OP_EXACTI:
1122 Lmin = Lmax = GET2(Fecode, 1);
1123 Fecode += 1 + IMM2_SIZE;
1124 goto REPEATCHAR;
1125
1126 case OP_POSUPTO:
1127 case OP_POSUPTOI:
1128 reptype = REPTYPE_POS;
1129 Lmin = 0;
1130 Lmax = GET2(Fecode, 1);
1131 Fecode += 1 + IMM2_SIZE;
1132 goto REPEATCHAR;
1133
1134 case OP_UPTO:
1135 case OP_UPTOI:
1136 reptype = REPTYPE_MAX;
1137 Lmin = 0;
1138 Lmax = GET2(Fecode, 1);
1139 Fecode += 1 + IMM2_SIZE;
1140 goto REPEATCHAR;
1141
1142 case OP_MINUPTO:
1143 case OP_MINUPTOI:
1144 reptype = REPTYPE_MIN;
1145 Lmin = 0;
1146 Lmax = GET2(Fecode, 1);
1147 Fecode += 1 + IMM2_SIZE;
1148 goto REPEATCHAR;
1149
1150 case OP_POSSTAR:
1151 case OP_POSSTARI:
1152 reptype = REPTYPE_POS;
1153 Lmin = 0;
1154 Lmax = UINT32_MAX;
1155 Fecode++;
1156 goto REPEATCHAR;
1157
1158 case OP_POSPLUS:
1159 case OP_POSPLUSI:
1160 reptype = REPTYPE_POS;
1161 Lmin = 1;
1162 Lmax = UINT32_MAX;
1163 Fecode++;
1164 goto REPEATCHAR;
1165
1166 case OP_POSQUERY:
1167 case OP_POSQUERYI:
1168 reptype = REPTYPE_POS;
1169 Lmin = 0;
1170 Lmax = 1;
1171 Fecode++;
1172 goto REPEATCHAR;
1173
1174 case OP_STAR:
1175 case OP_STARI:
1176 case OP_MINSTAR:
1177 case OP_MINSTARI:
1178 case OP_PLUS:
1179 case OP_PLUSI:
1180 case OP_MINPLUS:
1181 case OP_MINPLUSI:
1182 case OP_QUERY:
1183 case OP_QUERYI:
1184 case OP_MINQUERY:
1185 case OP_MINQUERYI:
1186 fc = *Fecode++ - ((Fop < OP_STARI)? OP_STAR : OP_STARI);
1187 Lmin = rep_min[fc];
1188 Lmax = rep_max[fc];
1189 reptype = rep_typ[fc];
1190
1191 /* Common code for all repeated single-character matches. We first check
1192 for the minimum number of characters. If the minimum equals the maximum, we
1193 are done. Otherwise, if minimizing, check the rest of the pattern for a
1194 match; if there isn't one, advance up to the maximum, one character at a
1195 time.
1196
1197 If maximizing, advance up to the maximum number of matching characters,
1198 until Feptr is past the end of the maximum run. If possessive, we are
1199 then done (no backing up). Otherwise, match at this position; anything
1200 other than no match is immediately returned. For nomatch, back up one
1201 character, unless we are matching \R and the last thing matched was
1202 \r\n, in which case, back up two code units until we reach the first
1203 optional character position.
1204
1205 The various UTF/non-UTF and caseful/caseless cases are handled separately,
1206 for speed. */
1207
1208 REPEATCHAR:
1209 #ifdef SUPPORT_UNICODE
1210 if (utf)
1211 {
1212 Flength = 1;
1213 Lcharptr = Fecode;
1214 GETCHARLEN(fc, Fecode, Flength);
1215 Fecode += Flength;
1216
1217 /* Handle multi-code-unit character matching, caseful and caseless. */
1218
1219 if (Flength > 1)
1220 {
1221 uint32_t othercase;
1222
1223 if (Fop >= OP_STARI && /* Caseless */
1224 (othercase = UCD_OTHERCASE(fc)) != fc)
1225 Loclength = PRIV(ord2utf)(othercase, Foccu);
1226 else Loclength = 0;
1227
1228 for (i = 1; i <= Lmin; i++)
1229 {
1230 if (Feptr <= mb->end_subject - Flength &&
1231 memcmp(Feptr, Lcharptr, CU2BYTES(Flength)) == 0) Feptr += Flength;
1232 else if (Loclength > 0 &&
1233 Feptr <= mb->end_subject - Loclength &&
1234 memcmp(Feptr, Foccu, CU2BYTES(Loclength)) == 0)
1235 Feptr += Loclength;
1236 else
1237 {
1238 CHECK_PARTIAL();
1239 RRETURN(MATCH_NOMATCH);
1240 }
1241 }
1242
1243 if (Lmin == Lmax) continue;
1244
1245 if (reptype == REPTYPE_MIN)
1246 {
1247 for (;;)
1248 {
1249 RMATCH(Fecode, RM202);
1250 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1251 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
1252 if (Feptr <= mb->end_subject - Flength &&
1253 memcmp(Feptr, Lcharptr, CU2BYTES(Flength)) == 0) Feptr += Flength;
1254 else if (Loclength > 0 &&
1255 Feptr <= mb->end_subject - Loclength &&
1256 memcmp(Feptr, Foccu, CU2BYTES(Loclength)) == 0)
1257 Feptr += Loclength;
1258 else
1259 {
1260 CHECK_PARTIAL();
1261 RRETURN(MATCH_NOMATCH);
1262 }
1263 }
1264 /* Control never gets here */
1265 }
1266
1267 else /* Maximize */
1268 {
1269 Lstart_eptr = Feptr;
1270 for (i = Lmin; i < Lmax; i++)
1271 {
1272 if (Feptr <= mb->end_subject - Flength &&
1273 memcmp(Feptr, Lcharptr, CU2BYTES(Flength)) == 0)
1274 Feptr += Flength;
1275 else if (Loclength > 0 &&
1276 Feptr <= mb->end_subject - Loclength &&
1277 memcmp(Feptr, Foccu, CU2BYTES(Loclength)) == 0)
1278 Feptr += Loclength;
1279 else
1280 {
1281 CHECK_PARTIAL();
1282 break;
1283 }
1284 }
1285
1286 /* After \C in UTF mode, Lstart_eptr might be in the middle of a
1287 Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't
1288 go too far. */
1289
1290 if (reptype != REPTYPE_POS) for(;;)
1291 {
1292 if (Feptr <= Lstart_eptr) break;
1293 RMATCH(Fecode, RM203);
1294 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1295 Feptr--;
1296 BACKCHAR(Feptr);
1297 }
1298 }
1299 break; /* End of repeated wide character handling */
1300 }
1301
1302 /* Length of UTF character is 1. Put it into the preserved variable and
1303 fall through to the non-UTF code. */
1304
1305 Lc = fc;
1306 }
1307 else
1308 #endif /* SUPPORT_UNICODE */
1309
1310 /* When not in UTF mode, load a single-code-unit character. Then proceed as
1311 above, using Unicode casing if either UTF or UCP is set. */
1312
1313 Lc = *Fecode++;
1314
1315 /* Caseless comparison */
1316
1317 if (Fop >= OP_STARI)
1318 {
1319 #if PCRE2_CODE_UNIT_WIDTH == 8
1320 #ifdef SUPPORT_UNICODE
1321 if (ucp && !utf && Lc > 127) Loc = UCD_OTHERCASE(Lc);
1322 else
1323 #endif /* SUPPORT_UNICODE */
1324 /* Lc will be < 128 in UTF-8 mode. */
1325 Loc = mb->fcc[Lc];
1326 #else /* 16-bit & 32-bit */
1327 #ifdef SUPPORT_UNICODE
1328 if ((utf || ucp) && Lc > 127) Loc = UCD_OTHERCASE(Lc);
1329 else
1330 #endif /* SUPPORT_UNICODE */
1331 Loc = TABLE_GET(Lc, mb->fcc, Lc);
1332 #endif /* PCRE2_CODE_UNIT_WIDTH == 8 */
1333
1334 for (i = 1; i <= Lmin; i++)
1335 {
1336 uint32_t cc; /* Faster than PCRE2_UCHAR */
1337 if (Feptr >= mb->end_subject)
1338 {
1339 SCHECK_PARTIAL();
1340 RRETURN(MATCH_NOMATCH);
1341 }
1342 cc = UCHAR21TEST(Feptr);
1343 if (Lc != cc && Loc != cc) RRETURN(MATCH_NOMATCH);
1344 Feptr++;
1345 }
1346 if (Lmin == Lmax) continue;
1347
1348 if (reptype == REPTYPE_MIN)
1349 {
1350 for (;;)
1351 {
1352 uint32_t cc; /* Faster than PCRE2_UCHAR */
1353 RMATCH(Fecode, RM25);
1354 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1355 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
1356 if (Feptr >= mb->end_subject)
1357 {
1358 SCHECK_PARTIAL();
1359 RRETURN(MATCH_NOMATCH);
1360 }
1361 cc = UCHAR21TEST(Feptr);
1362 if (Lc != cc && Loc != cc) RRETURN(MATCH_NOMATCH);
1363 Feptr++;
1364 }
1365 /* Control never gets here */
1366 }
1367
1368 else /* Maximize */
1369 {
1370 Lstart_eptr = Feptr;
1371 for (i = Lmin; i < Lmax; i++)
1372 {
1373 uint32_t cc; /* Faster than PCRE2_UCHAR */
1374 if (Feptr >= mb->end_subject)
1375 {
1376 SCHECK_PARTIAL();
1377 break;
1378 }
1379 cc = UCHAR21TEST(Feptr);
1380 if (Lc != cc && Loc != cc) break;
1381 Feptr++;
1382 }
1383 if (reptype != REPTYPE_POS) for (;;)
1384 {
1385 if (Feptr == Lstart_eptr) break;
1386 RMATCH(Fecode, RM26);
1387 Feptr--;
1388 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1389 }
1390 }
1391 }
1392
1393 /* Caseful comparisons (includes all multi-byte characters) */
1394
1395 else
1396 {
1397 for (i = 1; i <= Lmin; i++)
1398 {
1399 if (Feptr >= mb->end_subject)
1400 {
1401 SCHECK_PARTIAL();
1402 RRETURN(MATCH_NOMATCH);
1403 }
1404 if (Lc != UCHAR21INCTEST(Feptr)) RRETURN(MATCH_NOMATCH);
1405 }
1406
1407 if (Lmin == Lmax) continue;
1408
1409 if (reptype == REPTYPE_MIN)
1410 {
1411 for (;;)
1412 {
1413 RMATCH(Fecode, RM27);
1414 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1415 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
1416 if (Feptr >= mb->end_subject)
1417 {
1418 SCHECK_PARTIAL();
1419 RRETURN(MATCH_NOMATCH);
1420 }
1421 if (Lc != UCHAR21INCTEST(Feptr)) RRETURN(MATCH_NOMATCH);
1422 }
1423 /* Control never gets here */
1424 }
1425 else /* Maximize */
1426 {
1427 Lstart_eptr = Feptr;
1428 for (i = Lmin; i < Lmax; i++)
1429 {
1430 if (Feptr >= mb->end_subject)
1431 {
1432 SCHECK_PARTIAL();
1433 break;
1434 }
1435
1436 if (Lc != UCHAR21TEST(Feptr)) break;
1437 Feptr++;
1438 }
1439
1440 if (reptype != REPTYPE_POS) for (;;)
1441 {
1442 if (Feptr <= Lstart_eptr) break;
1443 RMATCH(Fecode, RM28);
1444 Feptr--;
1445 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1446 }
1447 }
1448 }
1449 break;
1450
1451 #undef Loclength
1452 #undef Lstart_eptr
1453 #undef Lcharptr
1454 #undef Lmin
1455 #undef Lmax
1456 #undef Lc
1457 #undef Loc
1458
1459
1460 /* ===================================================================== */
1461 /* Match a negated single one-byte character repeatedly. This is almost a
1462 repeat of the code for a repeated single character, but I haven't found a
1463 nice way of commoning these up that doesn't require a test of the
1464 positive/negative option for each character match. Maybe that wouldn't add
1465 very much to the time taken, but character matching *is* what this is all
1466 about... */
1467
1468 #define Lstart_eptr F->temp_sptr[0]
1469 #define Lmin F->temp_32[0]
1470 #define Lmax F->temp_32[1]
1471 #define Lc F->temp_32[2]
1472 #define Loc F->temp_32[3]
1473
1474 case OP_NOTEXACT:
1475 case OP_NOTEXACTI:
1476 Lmin = Lmax = GET2(Fecode, 1);
1477 Fecode += 1 + IMM2_SIZE;
1478 goto REPEATNOTCHAR;
1479
1480 case OP_NOTUPTO:
1481 case OP_NOTUPTOI:
1482 Lmin = 0;
1483 Lmax = GET2(Fecode, 1);
1484 reptype = REPTYPE_MAX;
1485 Fecode += 1 + IMM2_SIZE;
1486 goto REPEATNOTCHAR;
1487
1488 case OP_NOTMINUPTO:
1489 case OP_NOTMINUPTOI:
1490 Lmin = 0;
1491 Lmax = GET2(Fecode, 1);
1492 reptype = REPTYPE_MIN;
1493 Fecode += 1 + IMM2_SIZE;
1494 goto REPEATNOTCHAR;
1495
1496 case OP_NOTPOSSTAR:
1497 case OP_NOTPOSSTARI:
1498 reptype = REPTYPE_POS;
1499 Lmin = 0;
1500 Lmax = UINT32_MAX;
1501 Fecode++;
1502 goto REPEATNOTCHAR;
1503
1504 case OP_NOTPOSPLUS:
1505 case OP_NOTPOSPLUSI:
1506 reptype = REPTYPE_POS;
1507 Lmin = 1;
1508 Lmax = UINT32_MAX;
1509 Fecode++;
1510 goto REPEATNOTCHAR;
1511
1512 case OP_NOTPOSQUERY:
1513 case OP_NOTPOSQUERYI:
1514 reptype = REPTYPE_POS;
1515 Lmin = 0;
1516 Lmax = 1;
1517 Fecode++;
1518 goto REPEATNOTCHAR;
1519
1520 case OP_NOTPOSUPTO:
1521 case OP_NOTPOSUPTOI:
1522 reptype = REPTYPE_POS;
1523 Lmin = 0;
1524 Lmax = GET2(Fecode, 1);
1525 Fecode += 1 + IMM2_SIZE;
1526 goto REPEATNOTCHAR;
1527
1528 case OP_NOTSTAR:
1529 case OP_NOTSTARI:
1530 case OP_NOTMINSTAR:
1531 case OP_NOTMINSTARI:
1532 case OP_NOTPLUS:
1533 case OP_NOTPLUSI:
1534 case OP_NOTMINPLUS:
1535 case OP_NOTMINPLUSI:
1536 case OP_NOTQUERY:
1537 case OP_NOTQUERYI:
1538 case OP_NOTMINQUERY:
1539 case OP_NOTMINQUERYI:
1540 fc = *Fecode++ - ((Fop >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
1541 Lmin = rep_min[fc];
1542 Lmax = rep_max[fc];
1543 reptype = rep_typ[fc];
1544
1545 /* Common code for all repeated single-character non-matches. */
1546
1547 REPEATNOTCHAR:
1548 GETCHARINCTEST(Lc, Fecode);
1549
1550 /* The code is duplicated for the caseless and caseful cases, for speed,
1551 since matching characters is likely to be quite common. First, ensure the
1552 minimum number of matches are present. If Lmin = Lmax, we are done.
1553 Otherwise, if minimizing, keep trying the rest of the expression and
1554 advancing one matching character if failing, up to the maximum.
1555 Alternatively, if maximizing, find the maximum number of characters and
1556 work backwards. */
1557
1558 if (Fop >= OP_NOTSTARI) /* Caseless */
1559 {
1560 #ifdef SUPPORT_UNICODE
1561 if ((utf || ucp) && Lc > 127)
1562 Loc = UCD_OTHERCASE(Lc);
1563 else
1564 #endif /* SUPPORT_UNICODE */
1565
1566 Loc = TABLE_GET(Lc, mb->fcc, Lc); /* Other case from table */
1567
1568 #ifdef SUPPORT_UNICODE
1569 if (utf)
1570 {
1571 uint32_t d;
1572 for (i = 1; i <= Lmin; i++)
1573 {
1574 if (Feptr >= mb->end_subject)
1575 {
1576 SCHECK_PARTIAL();
1577 RRETURN(MATCH_NOMATCH);
1578 }
1579 GETCHARINC(d, Feptr);
1580 if (Lc == d || Loc == d) RRETURN(MATCH_NOMATCH);
1581 }
1582 }
1583 else
1584 #endif /* SUPPORT_UNICODE */
1585
1586 /* Not UTF mode */
1587 {
1588 for (i = 1; i <= Lmin; i++)
1589 {
1590 if (Feptr >= mb->end_subject)
1591 {
1592 SCHECK_PARTIAL();
1593 RRETURN(MATCH_NOMATCH);
1594 }
1595 if (Lc == *Feptr || Loc == *Feptr) RRETURN(MATCH_NOMATCH);
1596 Feptr++;
1597 }
1598 }
1599
1600 if (Lmin == Lmax) continue; /* Finished for exact count */
1601
1602 if (reptype == REPTYPE_MIN)
1603 {
1604 #ifdef SUPPORT_UNICODE
1605 if (utf)
1606 {
1607 uint32_t d;
1608 for (;;)
1609 {
1610 RMATCH(Fecode, RM204);
1611 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1612 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
1613 if (Feptr >= mb->end_subject)
1614 {
1615 SCHECK_PARTIAL();
1616 RRETURN(MATCH_NOMATCH);
1617 }
1618 GETCHARINC(d, Feptr);
1619 if (Lc == d || Loc == d) RRETURN(MATCH_NOMATCH);
1620 }
1621 }
1622 else
1623 #endif /*SUPPORT_UNICODE */
1624
1625 /* Not UTF mode */
1626 {
1627 for (;;)
1628 {
1629 RMATCH(Fecode, RM29);
1630 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1631 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
1632 if (Feptr >= mb->end_subject)
1633 {
1634 SCHECK_PARTIAL();
1635 RRETURN(MATCH_NOMATCH);
1636 }
1637 if (Lc == *Feptr || Loc == *Feptr) RRETURN(MATCH_NOMATCH);
1638 Feptr++;
1639 }
1640 }
1641 /* Control never gets here */
1642 }
1643
1644 /* Maximize case */
1645
1646 else
1647 {
1648 Lstart_eptr = Feptr;
1649
1650 #ifdef SUPPORT_UNICODE
1651 if (utf)
1652 {
1653 uint32_t d;
1654 for (i = Lmin; i < Lmax; i++)
1655 {
1656 int len = 1;
1657 if (Feptr >= mb->end_subject)
1658 {
1659 SCHECK_PARTIAL();
1660 break;
1661 }
1662 GETCHARLEN(d, Feptr, len);
1663 if (Lc == d || Loc == d) break;
1664 Feptr += len;
1665 }
1666
1667 /* After \C in UTF mode, Lstart_eptr might be in the middle of a
1668 Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't
1669 go too far. */
1670
1671 if (reptype != REPTYPE_POS) for(;;)
1672 {
1673 if (Feptr <= Lstart_eptr) break;
1674 RMATCH(Fecode, RM205);
1675 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1676 Feptr--;
1677 BACKCHAR(Feptr);
1678 }
1679 }
1680 else
1681 #endif /* SUPPORT_UNICODE */
1682
1683 /* Not UTF mode */
1684 {
1685 for (i = Lmin; i < Lmax; i++)
1686 {
1687 if (Feptr >= mb->end_subject)
1688 {
1689 SCHECK_PARTIAL();
1690 break;
1691 }
1692 if (Lc == *Feptr || Loc == *Feptr) break;
1693 Feptr++;
1694 }
1695 if (reptype != REPTYPE_POS) for (;;)
1696 {
1697 if (Feptr == Lstart_eptr) break;
1698 RMATCH(Fecode, RM30);
1699 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1700 Feptr--;
1701 }
1702 }
1703 }
1704 }
1705
1706 /* Caseful comparisons */
1707
1708 else
1709 {
1710 #ifdef SUPPORT_UNICODE
1711 if (utf)
1712 {
1713 uint32_t d;
1714 for (i = 1; i <= Lmin; i++)
1715 {
1716 if (Feptr >= mb->end_subject)
1717 {
1718 SCHECK_PARTIAL();
1719 RRETURN(MATCH_NOMATCH);
1720 }
1721 GETCHARINC(d, Feptr);
1722 if (Lc == d) RRETURN(MATCH_NOMATCH);
1723 }
1724 }
1725 else
1726 #endif
1727 /* Not UTF mode */
1728 {
1729 for (i = 1; i <= Lmin; i++)
1730 {
1731 if (Feptr >= mb->end_subject)
1732 {
1733 SCHECK_PARTIAL();
1734 RRETURN(MATCH_NOMATCH);
1735 }
1736 if (Lc == *Feptr++) RRETURN(MATCH_NOMATCH);
1737 }
1738 }
1739
1740 if (Lmin == Lmax) continue;
1741
1742 if (reptype == REPTYPE_MIN)
1743 {
1744 #ifdef SUPPORT_UNICODE
1745 if (utf)
1746 {
1747 uint32_t d;
1748 for (;;)
1749 {
1750 RMATCH(Fecode, RM206);
1751 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1752 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
1753 if (Feptr >= mb->end_subject)
1754 {
1755 SCHECK_PARTIAL();
1756 RRETURN(MATCH_NOMATCH);
1757 }
1758 GETCHARINC(d, Feptr);
1759 if (Lc == d) RRETURN(MATCH_NOMATCH);
1760 }
1761 }
1762 else
1763 #endif
1764 /* Not UTF mode */
1765 {
1766 for (;;)
1767 {
1768 RMATCH(Fecode, RM31);
1769 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1770 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
1771 if (Feptr >= mb->end_subject)
1772 {
1773 SCHECK_PARTIAL();
1774 RRETURN(MATCH_NOMATCH);
1775 }
1776 if (Lc == *Feptr++) RRETURN(MATCH_NOMATCH);
1777 }
1778 }
1779 /* Control never gets here */
1780 }
1781
1782 /* Maximize case */
1783
1784 else
1785 {
1786 Lstart_eptr = Feptr;
1787
1788 #ifdef SUPPORT_UNICODE
1789 if (utf)
1790 {
1791 uint32_t d;
1792 for (i = Lmin; i < Lmax; i++)
1793 {
1794 int len = 1;
1795 if (Feptr >= mb->end_subject)
1796 {
1797 SCHECK_PARTIAL();
1798 break;
1799 }
1800 GETCHARLEN(d, Feptr, len);
1801 if (Lc == d) break;
1802 Feptr += len;
1803 }
1804
1805 /* After \C in UTF mode, Lstart_eptr might be in the middle of a
1806 Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't
1807 go too far. */
1808
1809 if (reptype != REPTYPE_POS) for(;;)
1810 {
1811 if (Feptr <= Lstart_eptr) break;
1812 RMATCH(Fecode, RM207);
1813 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1814 Feptr--;
1815 BACKCHAR(Feptr);
1816 }
1817 }
1818 else
1819 #endif
1820 /* Not UTF mode */
1821 {
1822 for (i = Lmin; i < Lmax; i++)
1823 {
1824 if (Feptr >= mb->end_subject)
1825 {
1826 SCHECK_PARTIAL();
1827 break;
1828 }
1829 if (Lc == *Feptr) break;
1830 Feptr++;
1831 }
1832 if (reptype != REPTYPE_POS) for (;;)
1833 {
1834 if (Feptr == Lstart_eptr) break;
1835 RMATCH(Fecode, RM32);
1836 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1837 Feptr--;
1838 }
1839 }
1840 }
1841 }
1842 break;
1843
1844 #undef Lstart_eptr
1845 #undef Lmin
1846 #undef Lmax
1847 #undef Lc
1848 #undef Loc
1849
1850
1851 /* ===================================================================== */
1852 /* Match a bit-mapped character class, possibly repeatedly. These opcodes
1853 are used when all the characters in the class have values in the range
1854 0-255, and either the matching is caseful, or the characters are in the
1855 range 0-127 when UTF processing is enabled. The only difference between
1856 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
1857 encountered. */
1858
1859 #define Lmin F->temp_32[0]
1860 #define Lmax F->temp_32[1]
1861 #define Lstart_eptr F->temp_sptr[0]
1862 #define Lbyte_map_address F->temp_sptr[1]
1863 #define Lbyte_map ((unsigned char *)Lbyte_map_address)
1864
1865 case OP_NCLASS:
1866 case OP_CLASS:
1867 {
1868 Lbyte_map_address = Fecode + 1; /* Save for matching */
1869 Fecode += 1 + (32 / sizeof(PCRE2_UCHAR)); /* Advance past the item */
1870
1871 /* Look past the end of the item to see if there is repeat information
1872 following. Then obey similar code to character type repeats. */
1873
1874 switch (*Fecode)
1875 {
1876 case OP_CRSTAR:
1877 case OP_CRMINSTAR:
1878 case OP_CRPLUS:
1879 case OP_CRMINPLUS:
1880 case OP_CRQUERY:
1881 case OP_CRMINQUERY:
1882 case OP_CRPOSSTAR:
1883 case OP_CRPOSPLUS:
1884 case OP_CRPOSQUERY:
1885 fc = *Fecode++ - OP_CRSTAR;
1886 Lmin = rep_min[fc];
1887 Lmax = rep_max[fc];
1888 reptype = rep_typ[fc];
1889 break;
1890
1891 case OP_CRRANGE:
1892 case OP_CRMINRANGE:
1893 case OP_CRPOSRANGE:
1894 Lmin = GET2(Fecode, 1);
1895 Lmax = GET2(Fecode, 1 + IMM2_SIZE);
1896 if (Lmax == 0) Lmax = UINT32_MAX; /* Max 0 => infinity */
1897 reptype = rep_typ[*Fecode - OP_CRSTAR];
1898 Fecode += 1 + 2 * IMM2_SIZE;
1899 break;
1900
1901 default: /* No repeat follows */
1902 Lmin = Lmax = 1;
1903 break;
1904 }
1905
1906 /* First, ensure the minimum number of matches are present. */
1907
1908 #ifdef SUPPORT_UNICODE
1909 if (utf)
1910 {
1911 for (i = 1; i <= Lmin; i++)
1912 {
1913 if (Feptr >= mb->end_subject)
1914 {
1915 SCHECK_PARTIAL();
1916 RRETURN(MATCH_NOMATCH);
1917 }
1918 GETCHARINC(fc, Feptr);
1919 if (fc > 255)
1920 {
1921 if (Fop == OP_CLASS) RRETURN(MATCH_NOMATCH);
1922 }
1923 else
1924 if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) RRETURN(MATCH_NOMATCH);
1925 }
1926 }
1927 else
1928 #endif
1929 /* Not UTF mode */
1930 {
1931 for (i = 1; i <= Lmin; i++)
1932 {
1933 if (Feptr >= mb->end_subject)
1934 {
1935 SCHECK_PARTIAL();
1936 RRETURN(MATCH_NOMATCH);
1937 }
1938 fc = *Feptr++;
1939 #if PCRE2_CODE_UNIT_WIDTH != 8
1940 if (fc > 255)
1941 {
1942 if (Fop == OP_CLASS) RRETURN(MATCH_NOMATCH);
1943 }
1944 else
1945 #endif
1946 if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) RRETURN(MATCH_NOMATCH);
1947 }
1948 }
1949
1950 /* If Lmax == Lmin we are done. Continue with main loop. */
1951
1952 if (Lmin == Lmax) continue;
1953
1954 /* If minimizing, keep testing the rest of the expression and advancing
1955 the pointer while it matches the class. */
1956
1957 if (reptype == REPTYPE_MIN)
1958 {
1959 #ifdef SUPPORT_UNICODE
1960 if (utf)
1961 {
1962 for (;;)
1963 {
1964 RMATCH(Fecode, RM200);
1965 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1966 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
1967 if (Feptr >= mb->end_subject)
1968 {
1969 SCHECK_PARTIAL();
1970 RRETURN(MATCH_NOMATCH);
1971 }
1972 GETCHARINC(fc, Feptr);
1973 if (fc > 255)
1974 {
1975 if (Fop == OP_CLASS) RRETURN(MATCH_NOMATCH);
1976 }
1977 else
1978 if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) RRETURN(MATCH_NOMATCH);
1979 }
1980 }
1981 else
1982 #endif
1983 /* Not UTF mode */
1984 {
1985 for (;;)
1986 {
1987 RMATCH(Fecode, RM23);
1988 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1989 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
1990 if (Feptr >= mb->end_subject)
1991 {
1992 SCHECK_PARTIAL();
1993 RRETURN(MATCH_NOMATCH);
1994 }
1995 fc = *Feptr++;
1996 #if PCRE2_CODE_UNIT_WIDTH != 8
1997 if (fc > 255)
1998 {
1999 if (Fop == OP_CLASS) RRETURN(MATCH_NOMATCH);
2000 }
2001 else
2002 #endif
2003 if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) RRETURN(MATCH_NOMATCH);
2004 }
2005 }
2006 /* Control never gets here */
2007 }
2008
2009 /* If maximizing, find the longest possible run, then work backwards. */
2010
2011 else
2012 {
2013 Lstart_eptr = Feptr;
2014
2015 #ifdef SUPPORT_UNICODE
2016 if (utf)
2017 {
2018 for (i = Lmin; i < Lmax; i++)
2019 {
2020 int len = 1;
2021 if (Feptr >= mb->end_subject)
2022 {
2023 SCHECK_PARTIAL();
2024 break;
2025 }
2026 GETCHARLEN(fc, Feptr, len);
2027 if (fc > 255)
2028 {
2029 if (Fop == OP_CLASS) break;
2030 }
2031 else
2032 if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) break;
2033 Feptr += len;
2034 }
2035
2036 if (reptype == REPTYPE_POS) continue; /* No backtracking */
2037
2038 /* After \C in UTF mode, Lstart_eptr might be in the middle of a
2039 Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't
2040 go too far. */
2041
2042 for (;;)
2043 {
2044 RMATCH(Fecode, RM201);
2045 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2046 if (Feptr-- <= Lstart_eptr) break; /* Tried at original position */
2047 BACKCHAR(Feptr);
2048 }
2049 }
2050 else
2051 #endif
2052 /* Not UTF mode */
2053 {
2054 for (i = Lmin; i < Lmax; i++)
2055 {
2056 if (Feptr >= mb->end_subject)
2057 {
2058 SCHECK_PARTIAL();
2059 break;
2060 }
2061 fc = *Feptr;
2062 #if PCRE2_CODE_UNIT_WIDTH != 8
2063 if (fc > 255)
2064 {
2065 if (Fop == OP_CLASS) break;
2066 }
2067 else
2068 #endif
2069 if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) break;
2070 Feptr++;
2071 }
2072
2073 if (reptype == REPTYPE_POS) continue; /* No backtracking */
2074
2075 while (Feptr >= Lstart_eptr)
2076 {
2077 RMATCH(Fecode, RM24);
2078 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2079 Feptr--;
2080 }
2081 }
2082
2083 RRETURN(MATCH_NOMATCH);
2084 }
2085 }
2086 /* Control never gets here */
2087
2088 #undef Lbyte_map_address
2089 #undef Lbyte_map
2090 #undef Lstart_eptr
2091 #undef Lmin
2092 #undef Lmax
2093
2094
2095 /* ===================================================================== */
2096 /* Match an extended character class. In the 8-bit library, this opcode is
2097 encountered only when UTF-8 mode is supported. In the 16-bit and
2098 32-bit libraries, codepoints greater than 255 may be encountered even when
2099 UTF is not supported. */
2100
2101 #define Lstart_eptr F->temp_sptr[0]
2102 #define Lxclass_data F->temp_sptr[1]
2103 #define Lmin F->temp_32[0]
2104 #define Lmax F->temp_32[1]
2105
#ifdef SUPPORT_WIDE_CHARS
/* Match an extended class item that may be followed by a repeat opcode. The
repeat limits are held in Lmin and Lmax (F->temp_32[0] and F->temp_32[1]). */

case OP_XCLASS:
  {
  Lxclass_data = Fecode + 1 + LINK_SIZE;  /* Class data, kept for matching */
  Fecode += GET(Fecode, 1);               /* Step over the whole item */

  /* Work out the repeat limits and repeat type from the opcode that now
  sits at Fecode, consuming it when it is a repeat. */

  switch (*Fecode)
    {
    case OP_CRRANGE:
    case OP_CRMINRANGE:
    case OP_CRPOSRANGE:
      reptype = rep_typ[*Fecode - OP_CRSTAR];
      Lmin = GET2(Fecode, 1);
      Lmax = GET2(Fecode, 1 + IMM2_SIZE);
      if (Lmax == 0)
        {
        Lmax = UINT32_MAX;                /* A stored maximum of 0 means infinity */
        }
      Fecode += 1 + 2 * IMM2_SIZE;
      break;

    case OP_CRSTAR:
    case OP_CRMINSTAR:
    case OP_CRPLUS:
    case OP_CRMINPLUS:
    case OP_CRQUERY:
    case OP_CRMINQUERY:
    case OP_CRPOSSTAR:
    case OP_CRPOSPLUS:
    case OP_CRPOSQUERY:
      fc = *Fecode++ - OP_CRSTAR;         /* Index into the rep_* tables */
      reptype = rep_typ[fc];
      Lmax = rep_max[fc];
      Lmin = rep_min[fc];
      break;

    default:                              /* Nothing consumed: no repeat follows */
      Lmax = 1;
      Lmin = 1;
      break;
    }

  /* The first Lmin characters are mandatory; any failure here is final. */

  i = 1;
  while (i <= Lmin)
    {
    if (Feptr >= mb->end_subject)
      {
      SCHECK_PARTIAL();
      RRETURN(MATCH_NOMATCH);
      }
    GETCHARINCTEST(fc, Feptr);
    if (!PRIV(xclass)(fc, Lxclass_data, utf))
      {
      RRETURN(MATCH_NOMATCH);
      }
    i++;
    }

  /* With equal limits there is nothing optional left, so carry on with the
  main loop. */

  if (Lmax == Lmin) continue;

  /* Minimizing: try the rest of the pattern first, and take one more class
  character only after each failure. Every path out of this loop is an
  RRETURN (or a return from inside SCHECK_PARTIAL), so nothing below it is
  reached on this branch. */

  if (reptype == REPTYPE_MIN)
    {
    for (;;)
      {
      RMATCH(Fecode, RM100);
      if (rrc != MATCH_NOMATCH)
        {
        RRETURN(rrc);
        }
      if (Lmin++ >= Lmax)
        {
        RRETURN(MATCH_NOMATCH);
        }
      if (Feptr >= mb->end_subject)
        {
        SCHECK_PARTIAL();
        RRETURN(MATCH_NOMATCH);
        }
      GETCHARINCTEST(fc, Feptr);
      if (!PRIV(xclass)(fc, Lxclass_data, utf))
        {
        RRETURN(MATCH_NOMATCH);
        }
      }
    }

  /* Maximizing or possessive: first run forward over as many matching
  characters as the limit and the subject allow. */

  Lstart_eptr = Feptr;
  i = Lmin;
  while (i < Lmax)
    {
    int clen = 1;
    if (Feptr >= mb->end_subject)
      {
      SCHECK_PARTIAL();
      break;
      }
#ifdef SUPPORT_UNICODE
    GETCHARLENTEST(fc, Feptr, clen);
#else
    fc = *Feptr;
#endif
    if (!PRIV(xclass)(fc, Lxclass_data, utf))
      {
      break;
      }
    Feptr += clen;
    i++;
    }

  /* A possessive repeat keeps everything it took. */

  if (reptype == REPTYPE_POS) continue;

  /* Otherwise give characters back one at a time, trying the rest of the
  pattern at each position. After \C in UTF mode Lstart_eptr might be in the
  middle of a Unicode character, hence the <= comparison, which stops the
  backtracking from going too far. */

  for (;;)
    {
    RMATCH(Fecode, RM101);
    if (rrc != MATCH_NOMATCH)
      {
      RRETURN(rrc);
      }
    if (Feptr-- <= Lstart_eptr)           /* Already tried the original position */
      {
      RRETURN(MATCH_NOMATCH);
      }
#ifdef SUPPORT_UNICODE
    if (utf)
      {
      BACKCHAR(Feptr);
      }
#endif
    }

  /* Control never gets here */
  }
#endif  /* SUPPORT_WIDE_CHARS: end of XCLASS */
2225
2226 #undef Lstart_eptr
2227 #undef Lxclass_data
2228 #undef Lmin
2229 #undef Lmax
2230
2231
2232 /* ===================================================================== */
2233 /* Match various character types when PCRE2_UCP is not set. These opcodes
2234 are not generated when PCRE2_UCP is set - instead appropriate property
2235 tests are compiled. */
2236
2237 case OP_NOT_DIGIT:
2238 if (Feptr >= mb->end_subject)
2239 {
2240 SCHECK_PARTIAL();
2241 RRETURN(MATCH_NOMATCH);
2242 }
2243 GETCHARINCTEST(fc, Feptr);
2244 if (CHMAX_255(fc) && (mb->ctypes[fc] & ctype_digit) != 0)
2245 RRETURN(MATCH_NOMATCH);
2246 Fecode++;
2247 break;
2248
2249 case OP_DIGIT:
2250 if (Feptr >= mb->end_subject)
2251 {
2252 SCHECK_PARTIAL();
2253 RRETURN(MATCH_NOMATCH);
2254 }
2255 GETCHARINCTEST(fc, Feptr);
2256 if (!CHMAX_255(fc) || (mb->ctypes[fc] & ctype_digit) == 0)
2257 RRETURN(MATCH_NOMATCH);
2258 Fecode++;
2259 break;
2260
2261 case OP_NOT_WHITESPACE:
2262 if (Feptr >= mb->end_subject)
2263 {
2264 SCHECK_PARTIAL();
2265 RRETURN(MATCH_NOMATCH);
2266 }
2267 GETCHARINCTEST(fc, Feptr);
2268 if (CHMAX_255(fc) && (mb->ctypes[fc] & ctype_space) != 0)
2269 RRETURN(MATCH_NOMATCH);
2270 Fecode++;
2271 break;
2272
2273 case OP_WHITESPACE:
2274 if (Feptr >= mb->end_subject)
2275 {
2276 SCHECK_PARTIAL();
2277 RRETURN(MATCH_NOMATCH);
2278 }
2279 GETCHARINCTEST(fc, Feptr);
2280 if (!CHMAX_255(fc) || (mb->ctypes[fc] & ctype_space) == 0)
2281 RRETURN(MATCH_NOMATCH);
2282 Fecode++;
2283 break;
2284
2285 case OP_NOT_WORDCHAR:
2286 if (Feptr >= mb->end_subject)
2287 {
2288 SCHECK_PARTIAL();
2289 RRETURN(MATCH_NOMATCH);
2290 }
2291 GETCHARINCTEST(fc, Feptr);
2292 if (CHMAX_255(fc) && (mb->ctypes[fc] & ctype_word) != 0)
2293 RRETURN(MATCH_NOMATCH);
2294 Fecode++;
2295 break;
2296
2297 case OP_WORDCHAR:
2298 if (Feptr >= mb->end_subject)
2299 {
2300 SCHECK_PARTIAL();
2301 RRETURN(MATCH_NOMATCH);
2302 }
2303 GETCHARINCTEST(fc, Feptr);
2304 if (!CHMAX_255(fc) || (mb->ctypes[fc] & ctype_word) == 0)
2305 RRETURN(MATCH_NOMATCH);
2306 Fecode++;
2307 break;
2308
2309 case OP_ANYNL:
2310 if (Feptr >= mb->end_subject)
2311 {
2312 SCHECK_PARTIAL();
2313 RRETURN(MATCH_NOMATCH);
2314 }
2315 GETCHARINCTEST(fc, Feptr);
2316 switch(fc)
2317 {
2318 default: RRETURN(MATCH_NOMATCH);
2319
2320 case CHAR_CR:
2321 if (Feptr >= mb->end_subject)
2322 {
2323 SCHECK_PARTIAL();
2324 }
2325 else if (UCHAR21TEST(Feptr) == CHAR_LF) Feptr++;
2326 break;
2327
2328 case CHAR_LF:
2329 break;
2330
2331 case CHAR_VT:
2332 case CHAR_FF:
2333 case CHAR_NEL:
2334 #ifndef EBCDIC
2335 case 0x2028:
2336 case 0x2029:
2337 #endif /* Not EBCDIC */
2338 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) RRETURN(MATCH_NOMATCH);
2339 break;
2340 }
2341 Fecode++;
2342 break;
2343
2344 case OP_NOT_HSPACE:
2345 if (Feptr >= mb->end_subject)
2346 {
2347 SCHECK_PARTIAL();
2348 RRETURN(MATCH_NOMATCH);
2349 }
2350 GETCHARINCTEST(fc, Feptr);
2351 switch(fc)
2352 {
2353 HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */
2354 default: break;
2355 }
2356 Fecode++;
2357 break;
2358
2359 case OP_HSPACE:
2360 if (Feptr >= mb->end_subject)
2361 {
2362 SCHECK_PARTIAL();
2363 RRETURN(MATCH_NOMATCH);
2364 }
2365 GETCHARINCTEST(fc, Feptr);
2366 switch(fc)
2367 {
2368 HSPACE_CASES: break; /* Byte and multibyte cases */
2369 default: RRETURN(MATCH_NOMATCH);
2370 }
2371 Fecode++;
2372 break;
2373
2374 case OP_NOT_VSPACE:
2375 if (Feptr >= mb->end_subject)
2376 {
2377 SCHECK_PARTIAL();
2378 RRETURN(MATCH_NOMATCH);
2379 }
2380 GETCHARINCTEST(fc, Feptr);
2381 switch(fc)
2382 {
2383 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
2384 default: break;
2385 }
2386 Fecode++;
2387 break;
2388
2389 case OP_VSPACE:
2390 if (Feptr >= mb->end_subject)
2391 {
2392 SCHECK_PARTIAL();
2393 RRETURN(MATCH_NOMATCH);
2394 }
2395 GETCHARINCTEST(fc, Feptr);
2396 switch(fc)
2397 {
2398 VSPACE_CASES: break;
2399 default: RRETURN(MATCH_NOMATCH);
2400 }
2401 Fecode++;
2402 break;
2403
2404
2405 #ifdef SUPPORT_UNICODE
2406
2407 /* ===================================================================== */
2408 /* Check the next character by Unicode property. We will get here only
2409 if the support is in the binary; otherwise a compile-time error occurs. */
2410
2411 case OP_PROP:
2412 case OP_NOTPROP:
2413 if (Feptr >= mb->end_subject)
2414 {
2415 SCHECK_PARTIAL();
2416 RRETURN(MATCH_NOMATCH);
2417 }
2418 GETCHARINCTEST(fc, Feptr);
2419 {
2420 const uint32_t *cp;
2421 const ucd_record *prop = GET_UCD(fc);
2422
2423 switch(Fecode[1])
2424 {
2425 case PT_ANY:
2426 if (Fop == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2427 break;
2428
2429 case PT_LAMP:
2430 if ((prop->chartype == ucp_Lu ||
2431 prop->chartype == ucp_Ll ||
2432 prop->chartype == ucp_Lt) == (Fop == OP_NOTPROP))
2433 RRETURN(MATCH_NOMATCH);
2434 break;
2435
2436 case PT_GC:
2437 if ((Fecode[2] != PRIV(ucp_gentype)[prop->chartype]) == (Fop == OP_PROP))
2438 RRETURN(MATCH_NOMATCH);
2439 break;
2440
2441 case PT_PC:
2442 if ((Fecode[2] != prop->chartype) == (Fop == OP_PROP))
2443 RRETURN(MATCH_NOMATCH);
2444 break;
2445
2446 case PT_SC:
2447 if ((Fecode[2] != prop->script) == (Fop == OP_PROP))
2448 RRETURN(MATCH_NOMATCH);
2449 break;
2450
2451 /* These are specials */
2452
2453 case PT_ALNUM:
2454 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2455 PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (Fop == OP_NOTPROP))
2456 RRETURN(MATCH_NOMATCH);
2457 break;
2458
2459 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
2460 which means that Perl space and POSIX space are now identical. PCRE
2461 was changed at release 8.34. */
2462
2463 case PT_SPACE: /* Perl space */
2464 case PT_PXSPACE: /* POSIX space */
2465 switch(fc)
2466 {
2467 HSPACE_CASES:
2468 VSPACE_CASES:
2469 if (Fop == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2470 break;
2471
2472 default:
2473 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z) ==
2474 (Fop == OP_NOTPROP)) RRETURN(MATCH_NOMATCH);
2475 break;
2476 }
2477 break;
2478
2479 case PT_WORD:
2480 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2481 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2482 fc == CHAR_UNDERSCORE) == (Fop == OP_NOTPROP))
2483 RRETURN(MATCH_NOMATCH);
2484 break;
2485
2486 case PT_CLIST:
2487 cp = PRIV(ucd_caseless_sets) + Fecode[2];
2488 for (;;)
2489 {
2490 if (fc < *cp)
2491 { if (Fop == OP_PROP) { RRETURN(MATCH_NOMATCH); } else break; }
2492 if (fc == *cp++)
2493 { if (Fop == OP_PROP) break; else { RRETURN(MATCH_NOMATCH); } }
2494 }
2495 break;
2496
2497 case PT_UCNC:
2498 if ((fc == CHAR_DOLLAR_SIGN || fc == CHAR_COMMERCIAL_AT ||
2499 fc == CHAR_GRAVE_ACCENT || (fc >= 0xa0 && fc <= 0xd7ff) ||
2500 fc >= 0xe000) == (Fop == OP_NOTPROP))
2501 RRETURN(MATCH_NOMATCH);
2502 break;
2503
2504 /* This should never occur */
2505
2506 default:
2507 return PCRE2_ERROR_INTERNAL;
2508 }
2509
2510 Fecode += 3;
2511 }
2512 break;
2513
2514
2515 /* ===================================================================== */
2516 /* Match an extended Unicode sequence. We will get here only if the support
2517 is in the binary; otherwise a compile-time error occurs. */
2518
2519 case OP_EXTUNI:
2520 if (Feptr >= mb->end_subject)
2521 {
2522 SCHECK_PARTIAL();
2523 RRETURN(MATCH_NOMATCH);
2524 }
2525 else
2526 {
2527 GETCHARINCTEST(fc, Feptr);
2528 Feptr = PRIV(extuni)(fc, Feptr, mb->start_subject, mb->end_subject, utf,
2529 NULL);
2530 }
2531 CHECK_PARTIAL();
2532 Fecode++;
2533 break;
2534
2535 #endif /* SUPPORT_UNICODE */
2536
2537
2538 /* ===================================================================== */
2539 /* Match a single character type repeatedly. Note that the property type
2540 does not need to be in a stack frame as it is not used within an RMATCH()
2541 loop. */
2542
2543 #define Lstart_eptr F->temp_sptr[0]
2544 #define Lmin F->temp_32[0]
2545 #define Lmax F->temp_32[1]
2546 #define Lctype F->temp_32[2]
2547 #define Lpropvalue F->temp_32[3]
2548
2549 case OP_TYPEEXACT:
2550 Lmin = Lmax = GET2(Fecode, 1);
2551 Fecode += 1 + IMM2_SIZE;
2552 goto REPEATTYPE;
2553
2554 case OP_TYPEUPTO:
2555 case OP_TYPEMINUPTO:
2556 Lmin = 0;
2557 Lmax = GET2(Fecode, 1);
2558 reptype = (*Fecode == OP_TYPEMINUPTO)? REPTYPE_MIN : REPTYPE_MAX;
2559 Fecode += 1 + IMM2_SIZE;
2560 goto REPEATTYPE;
2561
2562 case OP_TYPEPOSSTAR:
2563 reptype = REPTYPE_POS;
2564 Lmin = 0;
2565 Lmax = UINT32_MAX;
2566 Fecode++;
2567 goto REPEATTYPE;
2568
2569 case OP_TYPEPOSPLUS:
2570 reptype = REPTYPE_POS;
2571 Lmin = 1;
2572 Lmax = UINT32_MAX;
2573 Fecode++;
2574 goto REPEATTYPE;
2575
2576 case OP_TYPEPOSQUERY:
2577 reptype = REPTYPE_POS;
2578 Lmin = 0;
2579 Lmax = 1;
2580 Fecode++;
2581 goto REPEATTYPE;
2582
2583 case OP_TYPEPOSUPTO:
2584 reptype = REPTYPE_POS;
2585 Lmin = 0;
2586 Lmax = GET2(Fecode, 1);
2587 Fecode += 1 + IMM2_SIZE;
2588 goto REPEATTYPE;
2589
2590 case OP_TYPESTAR:
2591 case OP_TYPEMINSTAR:
2592 case OP_TYPEPLUS:
2593 case OP_TYPEMINPLUS:
2594 case OP_TYPEQUERY:
2595 case OP_TYPEMINQUERY:
2596 fc = *Fecode++ - OP_TYPESTAR;
2597 Lmin = rep_min[fc];
2598 Lmax = rep_max[fc];
2599 reptype = rep_typ[fc];
2600
2601 /* Common code for all repeated character type matches. */
2602
2603 REPEATTYPE:
2604 Lctype = *Fecode++; /* Code for the character type */
2605
2606 #ifdef SUPPORT_UNICODE
2607 if (Lctype == OP_PROP || Lctype == OP_NOTPROP)
2608 {
2609 proptype = *Fecode++;
2610 Lpropvalue = *Fecode++;
2611 }
2612 else proptype = -1;
2613 #endif
2614
2615 /* First, ensure the minimum number of matches are present. Use inline
2616 code for maximizing the speed, and do the type test once at the start
2617 (i.e. keep it out of the loop). The code for UTF mode is separated out for
2618 tidiness, except for Unicode property tests. */
2619
2620 if (Lmin > 0)
2621 {
2622 #ifdef SUPPORT_UNICODE
2623 if (proptype >= 0) /* Property tests in all modes */
2624 {
2625 switch(proptype)
2626 {
2627 case PT_ANY:
2628 if (Lctype == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2629 for (i = 1; i <= Lmin; i++)
2630 {
2631 if (Feptr >= mb->end_subject)
2632 {
2633 SCHECK_PARTIAL();
2634 RRETURN(MATCH_NOMATCH);
2635 }
2636 GETCHARINCTEST(fc, Feptr);
2637 }
2638 break;
2639
2640 case PT_LAMP:
2641 for (i = 1; i <= Lmin; i++)
2642 {
2643 int chartype;
2644 if (Feptr >= mb->end_subject)
2645 {
2646 SCHECK_PARTIAL();
2647 RRETURN(MATCH_NOMATCH);
2648 }
2649 GETCHARINCTEST(fc, Feptr);
2650 chartype = UCD_CHARTYPE(fc);
2651 if ((chartype == ucp_Lu ||
2652 chartype == ucp_Ll ||
2653 chartype == ucp_Lt) == (Lctype == OP_NOTPROP))
2654 RRETURN(MATCH_NOMATCH);
2655 }
2656 break;
2657
2658 case PT_GC:
2659 for (i = 1; i <= Lmin; i++)
2660 {
2661 if (Feptr >= mb->end_subject)
2662 {
2663 SCHECK_PARTIAL();
2664 RRETURN(MATCH_NOMATCH);
2665 }
2666 GETCHARINCTEST(fc, Feptr);
2667 if ((UCD_CATEGORY(fc) == Lpropvalue) == (Lctype == OP_NOTPROP))
2668 RRETURN(MATCH_NOMATCH);
2669 }
2670 break;
2671
2672 case PT_PC:
2673 for (i = 1; i <= Lmin; i++)
2674 {
2675 if (Feptr >= mb->end_subject)
2676 {
2677 SCHECK_PARTIAL();
2678 RRETURN(MATCH_NOMATCH);
2679 }
2680 GETCHARINCTEST(fc, Feptr);
2681 if ((UCD_CHARTYPE(fc) == Lpropvalue) == (Lctype == OP_NOTPROP))
2682 RRETURN(MATCH_NOMATCH);
2683 }
2684 break;
2685
2686 case PT_SC:
2687 for (i = 1; i <= Lmin; i++)
2688 {
2689 if (Feptr >= mb->end_subject)
2690 {
2691 SCHECK_PARTIAL();
2692 RRETURN(MATCH_NOMATCH);
2693 }
2694 GETCHARINCTEST(fc, Feptr);
2695 if ((UCD_SCRIPT(fc) == Lpropvalue) == (Lctype == OP_NOTPROP))
2696 RRETURN(MATCH_NOMATCH);
2697 }
2698 break;
2699
2700 case PT_ALNUM:
2701 for (i = 1; i <= Lmin; i++)
2702 {
2703 int category;
2704 if (Feptr >= mb->end_subject)
2705 {
2706 SCHECK_PARTIAL();
2707 RRETURN(MATCH_NOMATCH);
2708 }
2709 GETCHARINCTEST(fc, Feptr);
2710 category = UCD_CATEGORY(fc);
2711 if ((category == ucp_L || category == ucp_N) == (Lctype == OP_NOTPROP))
2712 RRETURN(MATCH_NOMATCH);
2713 }
2714 break;
2715
2716 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
2717 which means that Perl space and POSIX space are now identical. PCRE
2718 was changed at release 8.34. */
2719
2720 case PT_SPACE: /* Perl space */
2721 case PT_PXSPACE: /* POSIX space */
2722 for (i = 1; i <= Lmin; i++)
2723 {
2724 if (Feptr >= mb->end_subject)
2725 {
2726 SCHECK_PARTIAL();
2727 RRETURN(MATCH_NOMATCH);
2728 }
2729 GETCHARINCTEST(fc, Feptr);
2730 switch(fc)
2731 {
2732 HSPACE_CASES:
2733 VSPACE_CASES:
2734 if (Lctype == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2735 break;
2736
2737 default:
2738 if ((UCD_CATEGORY(fc) == ucp_Z) == (Lctype == OP_NOTPROP))
2739 RRETURN(MATCH_NOMATCH);
2740 break;
2741 }
2742 }
2743 break;
2744
2745 case PT_WORD:
2746 for (i = 1; i <= Lmin; i++)
2747 {
2748 int category;
2749 if (Feptr >= mb->end_subject)
2750 {
2751 SCHECK_PARTIAL();
2752 RRETURN(MATCH_NOMATCH);
2753 }
2754 GETCHARINCTEST(fc, Feptr);
2755 category = UCD_CATEGORY(fc);
2756 if ((category == ucp_L || category == ucp_N ||
2757 fc == CHAR_UNDERSCORE) == (Lctype == OP_NOTPROP))
2758 RRETURN(MATCH_NOMATCH);
2759 }
2760 break;
2761
2762 case PT_CLIST:
2763 for (i = 1; i <= Lmin; i++)
2764 {
2765 const uint32_t *cp;
2766 if (Feptr >= mb->end_subject)
2767 {
2768 SCHECK_PARTIAL();
2769 RRETURN(MATCH_NOMATCH);
2770 }
2771 GETCHARINCTEST(fc, Feptr);
2772 cp = PRIV(ucd_caseless_sets) + Lpropvalue;
2773 for (;;)
2774 {
2775 if (fc < *cp)
2776 {
2777 if (Lctype == OP_NOTPROP) break;
2778 RRETURN(MATCH_NOMATCH);
2779 }
2780 if (fc == *cp++)
2781 {
2782 if (Lctype == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2783 break;
2784 }
2785 }
2786 }
2787 break;
2788
2789 case PT_UCNC:
2790 for (i = 1; i <= Lmin; i++)
2791 {
2792 if (Feptr >= mb->end_subject)
2793 {
2794 SCHECK_PARTIAL();
2795 RRETURN(MATCH_NOMATCH);
2796 }
2797 GETCHARINCTEST(fc, Feptr);
2798 if ((fc == CHAR_DOLLAR_SIGN || fc == CHAR_COMMERCIAL_AT ||
2799 fc == CHAR_GRAVE_ACCENT || (fc >= 0xa0 && fc <= 0xd7ff) ||
2800 fc >= 0xe000) == (Lctype == OP_NOTPROP))
2801 RRETURN(MATCH_NOMATCH);
2802 }
2803 break;
2804
2805 /* This should not occur */
2806
2807 default:
2808 return PCRE2_ERROR_INTERNAL;
2809 }
2810 }
2811
2812 /* Match extended Unicode sequences. We will get here only if the
2813 support is in the binary; otherwise a compile-time error occurs. */
2814
2815 else if (Lctype == OP_EXTUNI)
2816 {
2817 for (i = 1; i <= Lmin; i++)
2818 {
2819 if (Feptr >= mb->end_subject)
2820 {
2821 SCHECK_PARTIAL();
2822 RRETURN(MATCH_NOMATCH);
2823 }
2824 else
2825 {
2826 GETCHARINCTEST(fc, Feptr);
2827 Feptr = PRIV(extuni)(fc, Feptr, mb->start_subject,
2828 mb->end_subject, utf, NULL);
2829 }
2830 CHECK_PARTIAL();
2831 }
2832 }
2833 else
2834 #endif /* SUPPORT_UNICODE */
2835
2836 /* Handle all other cases in UTF mode */
2837
2838 #ifdef SUPPORT_UNICODE
2839 if (utf) switch(Lctype)
2840 {
2841 case OP_ANY:
2842 for (i = 1; i <= Lmin; i++)
2843 {
2844 if (Feptr >= mb->end_subject)
2845 {
2846 SCHECK_PARTIAL();
2847 RRETURN(MATCH_NOMATCH);
2848 }
2849 if (IS_NEWLINE(Feptr)) RRETURN(MATCH_NOMATCH);
2850 if (mb->partial != 0 &&
2851 Feptr + 1 >= mb->end_subject &&
2852 NLBLOCK->nltype == NLTYPE_FIXED &&
2853 NLBLOCK->nllen == 2 &&
2854 UCHAR21(Feptr) == NLBLOCK->nl[0])
2855 {
2856 mb->hitend = TRUE;
2857 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
2858 }
2859 Feptr++;
2860 ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++);
2861 }
2862 break;
2863
2864 case OP_ALLANY:
2865 for (i = 1; i <= Lmin; i++)
2866 {
2867 if (Feptr >= mb->end_subject)
2868 {
2869 SCHECK_PARTIAL();
2870 RRETURN(MATCH_NOMATCH);
2871 }
2872 Feptr++;
2873 ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++);
2874 }
2875 break;
2876
2877 case OP_ANYBYTE:
2878 if (Feptr > mb->end_subject - Lmin) RRETURN(MATCH_NOMATCH);
2879 Feptr += Lmin;
2880 break;
2881
2882 case OP_ANYNL:
2883 for (i = 1; i <= Lmin; i++)
2884 {
2885 if (Feptr >= mb->end_subject)
2886 {
2887 SCHECK_PARTIAL();
2888 RRETURN(MATCH_NOMATCH);
2889 }
2890 GETCHARINC(fc, Feptr);
2891 switch(fc)
2892 {
2893 default: RRETURN(MATCH_NOMATCH);
2894
2895 case CHAR_CR:
2896 if (Feptr < mb->end_subject && UCHAR21(Feptr) == CHAR_LF) Feptr++;
2897 break;
2898
2899 case CHAR_LF:
2900 break;
2901
2902 case CHAR_VT:
2903 case CHAR_FF:
2904 case CHAR_NEL:
2905 #ifndef EBCDIC
2906 case 0x2028:
2907 case 0x2029:
2908 #endif /* Not EBCDIC */
2909 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) RRETURN(MATCH_NOMATCH);
2910 break;
2911 }
2912 }
2913 break;
2914
2915 case OP_NOT_HSPACE:
2916 for (i = 1; i <= Lmin; i++)
2917 {
2918 if (Feptr >= mb->end_subject)
2919 {
2920 SCHECK_PARTIAL();
2921 RRETURN(MATCH_NOMATCH);
2922 }
2923 GETCHARINC(fc, Feptr);
2924 switch(fc)
2925 {
2926 HSPACE_CASES: RRETURN(MATCH_NOMATCH);
2927 default: break;
2928 }
2929 }
2930 break;
2931
2932 case OP_HSPACE:
2933 for (i = 1; i <= Lmin; i++)
2934 {
2935 if (Feptr >= mb->end_subject)
2936 {
2937 SCHECK_PARTIAL();
2938 RRETURN(MATCH_NOMATCH);
2939 }
2940 GETCHARINC(fc, Feptr);
2941 switch(fc)
2942 {
2943 HSPACE_CASES: break;
2944 default: RRETURN(MATCH_NOMATCH);
2945 }
2946 }
2947 break;
2948
2949 case OP_NOT_VSPACE:
2950 for (i = 1; i <= Lmin; i++)
2951 {
2952 if (Feptr >= mb->end_subject)
2953 {
2954 SCHECK_PARTIAL();
2955 RRETURN(MATCH_NOMATCH);
2956 }
2957 GETCHARINC(fc, Feptr);
2958 switch(fc)
2959 {
2960 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
2961 default: break;
2962 }
2963 }
2964 break;
2965
2966 case OP_VSPACE:
2967 for (i = 1; i <= Lmin; i++)
2968 {
2969 if (Feptr >= mb->end_subject)
2970 {
2971 SCHECK_PARTIAL();
2972 RRETURN(MATCH_NOMATCH);
2973 }
2974 GETCHARINC(fc, Feptr);
2975 switch(fc)
2976 {
2977 VSPACE_CASES: break;
2978 default: RRETURN(MATCH_NOMATCH);
2979 }
2980 }
2981 break;
2982
2983 case OP_NOT_DIGIT:
2984 for (i = 1; i <= Lmin; i++)
2985 {
2986 if (Feptr >= mb->end_subject)
2987 {
2988 SCHECK_PARTIAL();
2989 RRETURN(MATCH_NOMATCH);
2990 }
2991 GETCHARINC(fc, Feptr);
2992 if (fc < 128 && (mb->ctypes[fc] & ctype_digit) != 0)
2993 RRETURN(MATCH_NOMATCH);
2994 }
2995 break;
2996
2997 case OP_DIGIT:
2998 for (i = 1; i <= Lmin; i++)
2999 {
3000 uint32_t cc;
3001 if (Feptr >= mb->end_subject)
3002 {
3003 SCHECK_PARTIAL();
3004 RRETURN(MATCH_NOMATCH);
3005 }
3006 cc = UCHAR21(Feptr);
3007 if (cc >= 128 || (mb->ctypes[cc] & ctype_digit) == 0)
3008 RRETURN(MATCH_NOMATCH);
3009 Feptr++;
3010 /* No need to skip more code units - we know it has only one. */
3011 }
3012 break;
3013
3014 case OP_NOT_WHITESPACE:
3015 for (i = 1; i <= Lmin; i++)
3016 {
3017 uint32_t cc;
3018 if (Feptr >= mb->end_subject)
3019 {
3020 SCHECK_PARTIAL();
3021 RRETURN(MATCH_NOMATCH);
3022 }
3023 cc = UCHAR21(Feptr);
3024 if (cc < 128 && (mb->ctypes[cc] & ctype_space) != 0)
3025 RRETURN(MATCH_NOMATCH);
3026 Feptr++;
3027 ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++);
3028 }
3029 break;
3030
3031 case OP_WHITESPACE:
3032 for (i = 1; i <= Lmin; i++)
3033 {
3034 uint32_t cc;
3035 if (Feptr >= mb->end_subject)
3036 {
3037 SCHECK_PARTIAL();
3038 RRETURN(MATCH_NOMATCH);
3039 }
3040 cc = UCHAR21(Feptr);
3041 if (cc >= 128 || (mb->ctypes[cc] & ctype_space) == 0)
3042 RRETURN(MATCH_NOMATCH);
3043 Feptr++;
3044 /* No need to skip more code units - we know it has only one. */
3045 }
3046 break;
3047
3048 case OP_NOT_WORDCHAR:
3049 for (i = 1; i <= Lmin; i++)
3050 {
3051 uint32_t cc;
3052 if (Feptr >= mb->end_subject)
3053 {
3054 SCHECK_PARTIAL();
3055 RRETURN(MATCH_NOMATCH);
3056 }
3057 cc = UCHAR21(Feptr);
3058 if (cc < 128 && (mb->ctypes[cc] & ctype_word) != 0)
3059 RRETURN(MATCH_NOMATCH);
3060 Feptr++;
3061 ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++);
3062 }
3063 break;
3064
3065 case OP_WORDCHAR:
3066 for (i = 1; i <= Lmin; i++)
3067 {
3068 uint32_t cc;
3069 if (Feptr >= mb->end_subject)
3070 {
3071 SCHECK_PARTIAL();
3072 RRETURN(MATCH_NOMATCH);
3073 }
3074 cc = UCHAR21(Feptr);
3075 if (cc >= 128 || (mb->ctypes[cc] & ctype_word) == 0)
3076 RRETURN(MATCH_NOMATCH);
3077 Feptr++;
3078 /* No need to skip more code units - we know it has only one. */
3079 }
3080 break;
3081
3082 default:
3083 return PCRE2_ERROR_INTERNAL;
3084 } /* End switch(Lctype) */
3085
3086 else
3087 #endif /* SUPPORT_UNICODE */
3088
3089 /* Code for the non-UTF case for minimum matching of operators other
3090 than OP_PROP and OP_NOTPROP. */
3091
3092 switch(Lctype)
3093 {
3094 case OP_ANY:
3095 for (i = 1; i <= Lmin; i++)
3096 {
3097 if (Feptr >= mb->end_subject)
3098 {
3099 SCHECK_PARTIAL();
3100 RRETURN(MATCH_NOMATCH);
3101 }
3102 if (IS_NEWLINE(Feptr)) RRETURN(MATCH_NOMATCH);
3103 if (mb->partial != 0 &&
3104 Feptr + 1 >= mb->end_subject &&
3105 NLBLOCK->nltype == NLTYPE_FIXED &&
3106 NLBLOCK->nllen == 2 &&
3107 *Feptr == NLBLOCK->nl[0])
3108 {
3109 mb->hitend = TRUE;
3110 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
3111 }
3112 Feptr++;
3113 }
3114 break;
3115
3116 case OP_ALLANY:
3117 if (Feptr > mb->end_subject - Lmin)
3118 {
3119 SCHECK_PARTIAL();
3120 RRETURN(MATCH_NOMATCH);
3121 }
3122 Feptr += Lmin;
3123 break;
3124
/* This OP_ANYBYTE case will never be reached because \C gets turned
into OP_ALLANY in non-UTF mode. Cut out the code so that coverage
reports don't complain about its never being used. */
3128
3129 /* case OP_ANYBYTE:
3130 * if (Feptr > mb->end_subject - Lmin)
3131 * {
3132 * SCHECK_PARTIAL();
3133 * RRETURN(MATCH_NOMATCH);
3134 * }
3135 * Feptr += Lmin;
3136 * break;
3137 */
3138 case OP_ANYNL:
3139 for (i = 1; i <= Lmin; i++)
3140 {
3141 if (Feptr >= mb->end_subject)
3142 {
3143 SCHECK_PARTIAL();
3144 RRETURN(MATCH_NOMATCH);
3145 }
3146 switch(*Feptr++)
3147 {
3148 default: RRETURN(MATCH_NOMATCH);
3149
3150 case CHAR_CR:
3151 if (Feptr < mb->end_subject && *Feptr == CHAR_LF) Feptr++;
3152 break;
3153
3154 case CHAR_LF:
3155 break;
3156
3157 case CHAR_VT:
3158 case CHAR_FF:
3159 case CHAR_NEL:
3160 #if PCRE2_CODE_UNIT_WIDTH != 8
3161 case 0x2028:
3162 case 0x2029:
3163 #endif
3164 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) RRETURN(MATCH_NOMATCH);
3165 break;
3166 }
3167 }
3168 break;
3169
3170 case OP_NOT_HSPACE:
3171 for (i = 1; i <= Lmin; i++)
3172 {
3173 if (Feptr >= mb->end_subject)
3174 {
3175 SCHECK_PARTIAL();
3176 RRETURN(MATCH_NOMATCH);
3177 }
3178 switch(*Feptr++)
3179 {
3180 default: break;
3181 HSPACE_BYTE_CASES:
3182 #if PCRE2_CODE_UNIT_WIDTH != 8
3183 HSPACE_MULTIBYTE_CASES:
3184 #endif
3185 RRETURN(MATCH_NOMATCH);
3186 }
3187 }
3188 break;
3189
3190 case OP_HSPACE:
3191 for (i = 1; i <= Lmin; i++)
3192 {
3193 if (Feptr >= mb->end_subject)
3194 {
3195 SCHECK_PARTIAL();
3196 RRETURN(MATCH_NOMATCH);
3197 }
3198 switch(*Feptr++)
3199 {
3200 default: RRETURN(MATCH_NOMATCH);
3201 HSPACE_BYTE_CASES:
3202 #if PCRE2_CODE_UNIT_WIDTH != 8
3203 HSPACE_MULTIBYTE_CASES:
3204 #endif
3205 break;
3206 }
3207 }
3208 break;
3209
3210 case OP_NOT_VSPACE:
3211 for (i = 1; i <= Lmin; i++)
3212 {
3213 if (Feptr >= mb->end_subject)
3214 {
3215 SCHECK_PARTIAL();
3216 RRETURN(MATCH_NOMATCH);
3217 }
3218 switch(*Feptr++)
3219 {
3220 VSPACE_BYTE_CASES:
3221 #if PCRE2_CODE_UNIT_WIDTH != 8
3222 VSPACE_MULTIBYTE_CASES:
3223 #endif
3224 RRETURN(MATCH_NOMATCH);
3225 default: break;
3226 }
3227 }
3228 break;
3229
3230 case OP_VSPACE:
3231 for (i = 1; i <= Lmin; i++)
3232 {
3233 if (Feptr >= mb->end_subject)
3234 {
3235 SCHECK_PARTIAL();
3236 RRETURN(MATCH_NOMATCH);
3237 }
3238 switch(*Feptr++)
3239 {
3240 default: RRETURN(MATCH_NOMATCH);
3241 VSPACE_BYTE_CASES:
3242 #if PCRE2_CODE_UNIT_WIDTH != 8
3243 VSPACE_MULTIBYTE_CASES:
3244 #endif
3245 break;
3246 }
3247 }
3248 break;
3249
3250 case OP_NOT_DIGIT:
3251 for (i = 1; i <= Lmin; i++)
3252 {
3253 if (Feptr >= mb->end_subject)
3254 {
3255 SCHECK_PARTIAL();
3256 RRETURN(MATCH_NOMATCH);
3257 }
3258 if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_digit) != 0)
3259 RRETURN(MATCH_NOMATCH);
3260 Feptr++;
3261 }
3262 break;
3263
3264 case OP_DIGIT:
3265 for (i = 1; i <= Lmin; i++)
3266 {
3267 if (Feptr >= mb->end_subject)
3268 {
3269 SCHECK_PARTIAL();
3270 RRETURN(MATCH_NOMATCH);
3271 }
3272 if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_digit) == 0)
3273 RRETURN(MATCH_NOMATCH);
3274 Feptr++;
3275 }
3276 break;
3277
3278 case OP_NOT_WHITESPACE:
3279 for (i = 1; i <= Lmin; i++)
3280 {
3281 if (Feptr >= mb->end_subject)
3282 {
3283 SCHECK_PARTIAL();
3284 RRETURN(MATCH_NOMATCH);
3285 }
3286 if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_space) != 0)
3287 RRETURN(MATCH_NOMATCH);
3288 Feptr++;
3289 }
3290 break;
3291
3292 case OP_WHITESPACE:
3293 for (i = 1; i <= Lmin; i++)
3294 {
3295 if (Feptr >= mb->end_subject)
3296 {
3297 SCHECK_PARTIAL();
3298 RRETURN(MATCH_NOMATCH);
3299 }
3300 if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_space) == 0)
3301 RRETURN(MATCH_NOMATCH);
3302 Feptr++;
3303 }
3304 break;
3305
3306 case OP_NOT_WORDCHAR:
3307 for (i = 1; i <= Lmin; i++)
3308 {
3309 if (Feptr >= mb->end_subject)
3310 {
3311 SCHECK_PARTIAL();
3312 RRETURN(MATCH_NOMATCH);
3313 }
3314 if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_word) != 0)
3315 RRETURN(MATCH_NOMATCH);
3316 Feptr++;
3317 }
3318 break;
3319
3320 case OP_WORDCHAR:
3321 for (i = 1; i <= Lmin; i++)
3322 {
3323 if (Feptr >= mb->end_subject)
3324 {
3325 SCHECK_PARTIAL();
3326 RRETURN(MATCH_NOMATCH);
3327 }
3328 if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_word) == 0)
3329 RRETURN(MATCH_NOMATCH);
3330 Feptr++;
3331 }
3332 break;
3333
3334 default:
3335 return PCRE2_ERROR_INTERNAL;
3336 }
3337 }
3338
3339 /* If Lmin = Lmax we are done. Continue with the main loop. */
3340
3341 if (Lmin == Lmax) continue;
3342
3343 /* If minimizing, we have to test the rest of the pattern before each
3344 subsequent match. */
3345
3346 if (reptype == REPTYPE_MIN)
3347 {
3348 #ifdef SUPPORT_UNICODE
3349 if (proptype >= 0)
3350 {
3351 switch(proptype)
3352 {
3353 case PT_ANY:
3354 for (;;)
3355 {
3356 RMATCH(Fecode, RM208);
3357 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3358 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3359 if (Feptr >= mb->end_subject)
3360 {
3361 SCHECK_PARTIAL();
3362 RRETURN(MATCH_NOMATCH);
3363 }
3364 GETCHARINCTEST(fc, Feptr);
3365 if (Lctype == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
3366 }
3367 /* Control never gets here */
3368
3369 case PT_LAMP:
3370 for (;;)
3371 {
3372 int chartype;
3373 RMATCH(Fecode, RM209);
3374 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3375 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3376 if (Feptr >= mb->end_subject)
3377 {
3378 SCHECK_PARTIAL();
3379 RRETURN(MATCH_NOMATCH);
3380 }
3381 GETCHARINCTEST(fc, Feptr);
3382 chartype = UCD_CHARTYPE(fc);
3383 if ((chartype == ucp_Lu ||
3384 chartype == ucp_Ll ||
3385 chartype == ucp_Lt) == (Lctype == OP_NOTPROP))
3386 RRETURN(MATCH_NOMATCH);
3387 }
3388 /* Control never gets here */
3389
3390 case PT_GC:
3391 for (;;)
3392 {
3393 RMATCH(Fecode, RM210);
3394 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3395 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3396 if (Feptr >= mb->end_subject)
3397 {
3398 SCHECK_PARTIAL();
3399 RRETURN(MATCH_NOMATCH);
3400 }
3401 GETCHARINCTEST(fc, Feptr);
3402 if ((UCD_CATEGORY(fc) == Lpropvalue) == (Lctype == OP_NOTPROP))
3403 RRETURN(MATCH_NOMATCH);
3404 }
3405 /* Control never gets here */
3406
3407 case PT_PC:
3408 for (;;)
3409 {
3410 RMATCH(Fecode, RM211);
3411 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3412 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3413 if (Feptr >= mb->end_subject)
3414 {
3415 SCHECK_PARTIAL();
3416 RRETURN(MATCH_NOMATCH);
3417 }
3418 GETCHARINCTEST(fc, Feptr);
3419 if ((UCD_CHARTYPE(fc) == Lpropvalue) == (Lctype == OP_NOTPROP))
3420 RRETURN(MATCH_NOMATCH);
3421 }
3422 /* Control never gets here */
3423
3424 case PT_SC:
3425 for (;;)
3426 {
3427 RMATCH(Fecode, RM212);
3428 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3429 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3430 if (Feptr >= mb->end_subject)
3431 {
3432 SCHECK_PARTIAL();
3433 RRETURN(MATCH_NOMATCH);
3434 }
3435 GETCHARINCTEST(fc, Feptr);
3436 if ((UCD_SCRIPT(fc) == Lpropvalue) == (Lctype == OP_NOTPROP))
3437 RRETURN(MATCH_NOMATCH);
3438 }
3439 /* Control never gets here */
3440
3441 case PT_ALNUM:
3442 for (;;)
3443 {
3444 int category;
3445 RMATCH(Fecode, RM213);
3446 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3447 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3448 if (Feptr >= mb->end_subject)
3449 {
3450 SCHECK_PARTIAL();
3451 RRETURN(MATCH_NOMATCH);
3452 }
3453 GETCHARINCTEST(fc, Feptr);
3454 category = UCD_CATEGORY(fc);
3455 if ((category == ucp_L || category == ucp_N) ==
3456 (Lctype == OP_NOTPROP))
3457 RRETURN(MATCH_NOMATCH);
3458 }
3459 /* Control never gets here */
3460
3461 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
3462 which means that Perl space and POSIX space are now identical. PCRE
3463 was changed at release 8.34. */
3464
3465 case PT_SPACE: /* Perl space */
3466 case PT_PXSPACE: /* POSIX space */
3467 for (;;)
3468 {
3469 RMATCH(Fecode, RM214);
3470 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3471 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3472 if (Feptr >= mb->end_subject)
3473 {
3474 SCHECK_PARTIAL();
3475 RRETURN(MATCH_NOMATCH);
3476 }
3477 GETCHARINCTEST(fc, Feptr);
3478 switch(fc)
3479 {
3480 HSPACE_CASES:
3481 VSPACE_CASES:
3482 if (Lctype == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
3483 break;
3484
3485 default:
3486 if ((UCD_CATEGORY(fc) == ucp_Z) == (Lctype == OP_NOTPROP))
3487 RRETURN(MATCH_NOMATCH);
3488 break;
3489 }
3490 }
3491 /* Control never gets here */
3492
3493 case PT_WORD:
3494 for (;;)
3495 {
3496 int category;
3497 RMATCH(Fecode, RM215);
3498 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3499 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3500 if (Feptr >= mb->end_subject)
3501 {
3502 SCHECK_PARTIAL();
3503 RRETURN(MATCH_NOMATCH);
3504 }
3505 GETCHARINCTEST(fc, Feptr);
3506 category = UCD_CATEGORY(fc);
3507 if ((category == ucp_L ||
3508 category == ucp_N ||
3509 fc == CHAR_UNDERSCORE) == (Lctype == OP_NOTPROP))
3510 RRETURN(MATCH_NOMATCH);
3511 }
3512 /* Control never gets here */
3513
3514 case PT_CLIST:
3515 for (;;)
3516 {
3517 const uint32_t *cp;
3518 RMATCH(Fecode, RM216);
3519 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3520 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3521 if (Feptr >= mb->end_subject)
3522 {
3523 SCHECK_PARTIAL();
3524 RRETURN(MATCH_NOMATCH);
3525 }
3526 GETCHARINCTEST(fc, Feptr);
3527 cp = PRIV(ucd_caseless_sets) + Lpropvalue;
3528 for (;;)
3529 {
3530 if (fc < *cp)
3531 {
3532 if (Lctype == OP_NOTPROP) break;
3533 RRETURN(MATCH_NOMATCH);
3534 }
3535 if (fc == *cp++)
3536 {
3537 if (Lctype == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
3538 break;
3539 }
3540 }
3541 }
3542 /* Control never gets here */
3543
3544 case PT_UCNC:
3545 for (;;)
3546 {
3547 RMATCH(Fecode, RM217);
3548 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3549 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3550 if (Feptr >= mb->end_subject)
3551 {
3552 SCHECK_PARTIAL();
3553 RRETURN(MATCH_NOMATCH);
3554 }
3555 GETCHARINCTEST(fc, Feptr);
3556 if ((fc == CHAR_DOLLAR_SIGN || fc == CHAR_COMMERCIAL_AT ||
3557 fc == CHAR_GRAVE_ACCENT || (fc >= 0xa0 && fc <= 0xd7ff) ||
3558 fc >= 0xe000) == (Lctype == OP_NOTPROP))
3559 RRETURN(MATCH_NOMATCH);
3560 }
3561 /* Control never gets here */
3562
3563 /* This should never occur */
3564 default:
3565 return PCRE2_ERROR_INTERNAL;
3566 }
3567 }
3568
3569 /* Match extended Unicode grapheme clusters. We will get here only if
3570 Unicode support is in the binary; otherwise a compile-time error occurs. */
3571
3572 else if (Lctype == OP_EXTUNI)
3573 {
3574 for (;;)
3575 {
3576 RMATCH(Fecode, RM218);
3577 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3578 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3579 if (Feptr >= mb->end_subject)
3580 {
3581 SCHECK_PARTIAL();
3582 RRETURN(MATCH_NOMATCH);
3583 }
3584 else
3585 {
3586 GETCHARINCTEST(fc, Feptr);
3587 Feptr = PRIV(extuni)(fc, Feptr, mb->start_subject, mb->end_subject,
3588 utf, NULL);
3589 }
3590 CHECK_PARTIAL();
3591 }
3592 }
3593 else
3594 #endif /* SUPPORT_UNICODE */
3595
3596 /* UTF mode for non-property testing character types. */
3597
3598 #ifdef SUPPORT_UNICODE
3599 if (utf)
3600 {
3601 for (;;)
3602 {
3603 RMATCH(Fecode, RM219);
3604 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3605 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3606 if (Feptr >= mb->end_subject)
3607 {
3608 SCHECK_PARTIAL();
3609 RRETURN(MATCH_NOMATCH);
3610 }
3611 if (Lctype == OP_ANY && IS_NEWLINE(Feptr)) RRETURN(MATCH_NOMATCH);
3612 GETCHARINC(fc, Feptr);
3613 switch(Lctype)
3614 {
3615 case OP_ANY: /* This is the non-NL case */
3616 if (mb->partial != 0 && /* Take care with CRLF partial */
3617 Feptr >= mb->end_subject &&
3618 NLBLOCK->nltype == NLTYPE_FIXED &&
3619 NLBLOCK->nllen == 2 &&
3620 fc == NLBLOCK->nl[0])
3621 {
3622 mb->hitend = TRUE;
3623 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
3624 }
3625 break;
3626
3627 case OP_ALLANY:
3628 case OP_ANYBYTE:
3629 break;
3630
3631 case OP_ANYNL:
3632 switch(fc)
3633 {
3634 default: RRETURN(MATCH_NOMATCH);
3635
3636 case CHAR_CR:
3637 if (Feptr < mb->end_subject && UCHAR21(Feptr) == CHAR_LF) Feptr++;
3638 break;
3639
3640 case CHAR_LF:
3641 break;
3642
3643 case CHAR_VT:
3644 case CHAR_FF:
3645 case CHAR_NEL:
3646 #ifndef EBCDIC
3647 case 0x2028:
3648 case 0x2029:
3649 #endif /* Not EBCDIC */
3650 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF)
3651 RRETURN(MATCH_NOMATCH);
3652 break;
3653 }
3654 break;
3655
3656 case OP_NOT_HSPACE:
3657 switch(fc)
3658 {
3659 HSPACE_CASES: RRETURN(MATCH_NOMATCH);
3660 default: break;
3661 }
3662 break;
3663
3664 case OP_HSPACE:
3665 switch(fc)
3666 {
3667 HSPACE_CASES: break;
3668 default: RRETURN(MATCH_NOMATCH);
3669 }
3670 break;
3671
3672 case OP_NOT_VSPACE:
3673 switch(fc)
3674 {
3675 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
3676 default: break;
3677 }
3678 break;
3679
3680 case OP_VSPACE:
3681 switch(fc)
3682 {
3683 VSPACE_CASES: break;
3684 default: RRETURN(MATCH_NOMATCH);
3685 }
3686 break;
3687
3688 case OP_NOT_DIGIT:
3689 if (fc < 256 && (mb->ctypes[fc] & ctype_digit) != 0)
3690 RRETURN(MATCH_NOMATCH);
3691 break;
3692
3693 case OP_DIGIT:
3694 if (fc >= 256 || (mb->ctypes[fc] & ctype_digit) == 0)
3695 RRETURN(MATCH_NOMATCH);
3696 break;
3697
3698 case OP_NOT_WHITESPACE:
3699 if (fc < 256 && (mb->ctypes[fc] & ctype_space) != 0)
3700 RRETURN(MATCH_NOMATCH);
3701 break;
3702
3703 case OP_WHITESPACE:
3704 if (fc >= 256 || (mb->ctypes[fc] & ctype_space) == 0)
3705 RRETURN(MATCH_NOMATCH);
3706 break;
3707
3708 case OP_NOT_WORDCHAR:
3709 if (fc < 256 && (mb->ctypes[fc] & ctype_word) != 0)
3710 RRETURN(MATCH_NOMATCH);
3711 break;
3712
3713 case OP_WORDCHAR:
3714 if (fc >= 256 || (mb->ctypes[fc] & ctype_word) == 0)
3715 RRETURN(MATCH_NOMATCH);
3716 break;
3717
3718 default:
3719 return PCRE2_ERROR_INTERNAL;
3720 }
3721 }
3722 }
3723 else
3724 #endif /* SUPPORT_UNICODE */
3725
3726 /* Not UTF mode */
3727 {
3728 for (;;)
3729 {
3730 RMATCH(Fecode, RM33);
3731 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3732 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3733 if (Feptr >= mb->end_subject)
3734 {
3735 SCHECK_PARTIAL();
3736 RRETURN(MATCH_NOMATCH);
3737 }
3738 if (Lctype == OP_ANY && IS_NEWLINE(Feptr))
3739 RRETURN(MATCH_NOMATCH);
3740 fc = *Feptr++;
3741 switch(Lctype)
3742 {
3743 case OP_ANY: /* This is the non-NL case */
3744 if (mb->partial != 0 && /* Take care with CRLF partial */
3745 Feptr >= mb->end_subject &&
3746 NLBLOCK->nltype == NLTYPE_FIXED &&
3747 NLBLOCK->nllen == 2 &&
3748 fc == NLBLOCK->nl[0])
3749 {
3750 mb->hitend = TRUE;
3751 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
3752 }
3753 break;
3754
3755 case OP_ALLANY:
3756 case OP_ANYBYTE:
3757 break;
3758
3759 case OP_ANYNL:
3760 switch(fc)
3761 {
3762 default: RRETURN(MATCH_NOMATCH);
3763
3764 case CHAR_CR:
3765 if (Feptr < mb->end_subject && *Feptr == CHAR_LF) Feptr++;
3766 break;
3767
3768 case CHAR_LF:
3769 break;
3770
3771 case CHAR_VT:
3772 case CHAR_FF:
3773 case CHAR_NEL:
3774 #if PCRE2_CODE_UNIT_WIDTH != 8
3775 case 0x2028:
3776 case 0x2029:
3777 #endif
3778 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF)
3779 RRETURN(MATCH_NOMATCH);
3780 break;
3781 }
3782 break;
3783
3784 case OP_NOT_HSPACE:
3785 switch(fc)
3786 {
3787 default: break;
3788 HSPACE_BYTE_CASES:
3789 #if PCRE2_CODE_UNIT_WIDTH != 8
3790 HSPACE_MULTIBYTE_CASES:
3791 #endif
3792 RRETURN(MATCH_NOMATCH);
3793 }
3794 break;
3795
3796 case OP_HSPACE:
3797 switch(fc)
3798 {
3799 default: RRETURN(MATCH_NOMATCH);
3800 HSPACE_BYTE_CASES:
3801 #if PCRE2_CODE_UNIT_WIDTH != 8
3802 HSPACE_MULTIBYTE_CASES:
3803 #endif
3804 break;
3805 }
3806 break;
3807
3808 case OP_NOT_VSPACE:
3809 switch(fc)
3810 {
3811 default: break;
3812 VSPACE_BYTE_CASES:
3813 #if PCRE2_CODE_UNIT_WIDTH != 8
3814 VSPACE_MULTIBYTE_CASES:
3815 #endif
3816 RRETURN(MATCH_NOMATCH);
3817 }
3818 break;
3819
3820 case OP_VSPACE:
3821 switch(fc)
3822 {
3823 default: RRETURN(MATCH_NOMATCH);
3824 VSPACE_BYTE_CASES:
3825 #if PCRE2_CODE_UNIT_WIDTH != 8
3826 VSPACE_MULTIBYTE_CASES:
3827 #endif
3828 break;
3829 }
3830 break;
3831
3832 case OP_NOT_DIGIT:
3833 if (MAX_255(fc) && (mb->ctypes[fc] & ctype_digit) != 0)
3834 RRETURN(MATCH_NOMATCH);
3835 break;
3836
3837 case OP_DIGIT:
3838 if (!MAX_255(fc) || (mb->ctypes[fc] & ctype_digit) == 0)
3839 RRETURN(MATCH_NOMATCH);
3840 break;
3841
3842 case OP_NOT_WHITESPACE:
3843 if (MAX_255(fc) && (mb->ctypes[fc] & ctype_space) != 0)
3844 RRETURN(MATCH_NOMATCH);
3845 break;
3846
3847 case OP_WHITESPACE:
3848 if (!MAX_255(fc) || (mb->ctypes[fc] & ctype_space) == 0)
3849 RRETURN(MATCH_NOMATCH);
3850 break;
3851
3852 case OP_NOT_WORDCHAR:
3853 if (MAX_255(fc) && (mb->ctypes[fc] & ctype_word) != 0)
3854 RRETURN(MATCH_NOMATCH);
3855 break;
3856
3857 case OP_WORDCHAR:
3858 if (!MAX_255(fc) || (mb->ctypes[fc] & ctype_word) == 0)
3859 RRETURN(MATCH_NOMATCH);
3860 break;
3861
3862 default:
3863 return PCRE2_ERROR_INTERNAL;
3864 }
3865 }
3866 }
3867 /* Control never gets here */
3868 }
3869
3870 /* If maximizing, it is worth using inline code for speed, doing the type
3871 test once at the start (i.e. keep it out of the loop). */
3872
3873 else
3874 {
3875 Lstart_eptr = Feptr; /* Remember where we started */
3876
3877 #ifdef SUPPORT_UNICODE
3878 if (proptype >= 0)
3879 {
3880 switch(proptype)
3881 {
3882 case PT_ANY:
3883 for (i = Lmin; i < Lmax; i++)
3884 {
3885 int len = 1;
3886 if (Feptr >= mb->end_subject)
3887 {
3888 SCHECK_PARTIAL();
3889 break;
3890 }
3891 GETCHARLENTEST(fc, Feptr, len);
3892 if (Lctype == OP_NOTPROP) break;
3893 Feptr+= len;
3894 }
3895 break;
3896
3897 case PT_LAMP:
3898 for (i = Lmin; i < Lmax; i++)
3899 {
3900 int chartype;
3901 int len = 1;
3902 if (Feptr >= mb->end_subject)
3903 {
3904 SCHECK_PARTIAL();
3905 break;
3906 }
3907 GETCHARLENTEST(fc, Feptr, len);
3908 chartype = UCD_CHARTYPE(fc);
3909 if ((chartype == ucp_Lu ||
3910 chartype == ucp_Ll ||
3911 chartype == ucp_Lt) == (Lctype == OP_NOTPROP))
3912 break;
3913 Feptr+= len;
3914 }
3915 break;
3916
3917 case PT_GC:
3918 for (i = Lmin; i < Lmax; i++)
3919 {
3920 int len = 1;
3921 if (Feptr >= mb->end_subject)
3922 {
3923 SCHECK_PARTIAL();
3924 break;
3925 }
3926 GETCHARLENTEST(fc, Feptr, len);
3927 if ((UCD_CATEGORY(fc) == Lpropvalue) == (Lctype == OP_NOTPROP))
3928 break;
3929 Feptr+= len;
3930 }
3931 break;
3932
3933 case PT_PC:
3934 for (i = Lmin; i < Lmax; i++)
3935 {
3936 int len = 1;
3937 if (Feptr >= mb->end_subject)
3938 {
3939 SCHECK_PARTIAL();
3940 break;
3941 }
3942 GETCHARLENTEST(fc, Feptr, len);
3943 if ((UCD_CHARTYPE(fc) == Lpropvalue) == (Lctype == OP_NOTPROP))
3944 break;
3945 Feptr+= len;
3946 }
3947 break;
3948
3949 case PT_SC:
3950 for (i = Lmin; i < Lmax; i++)
3951 {
3952 int len = 1;
3953 if (Feptr >= mb->end_subject)
3954 {
3955 SCHECK_PARTIAL();
3956 break;
3957 }
3958 GETCHARLENTEST(fc, Feptr, len);
3959 if ((UCD_SCRIPT(fc) == Lpropvalue) == (Lctype == OP_NOTPROP))
3960 break;
3961 Feptr+= len;
3962 }
3963 break;
3964
3965 case PT_ALNUM:
3966 for (i = Lmin; i < Lmax; i++)
3967 {
3968 int category;
3969 int len = 1;
3970 if (Feptr >= mb->end_subject)
3971 {
3972 SCHECK_PARTIAL();
3973 break;
3974 }
3975 GETCHARLENTEST(fc, Feptr, len);
3976 category = UCD_CATEGORY(fc);
3977 if ((category == ucp_L || category == ucp_N) ==
3978 (Lctype == OP_NOTPROP))
3979 break;
3980 Feptr+= len;
3981 }
3982 break;
3983
3984 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
3985 which means that Perl space and POSIX space are now identical. PCRE
3986 was changed at release 8.34. */
3987
3988 case PT_SPACE: /* Perl space */
3989 case PT_PXSPACE: /* POSIX space */
3990 for (i = Lmin; i < Lmax; i++)
3991 {
3992 int len = 1;
3993 if (Feptr >= mb->end_subject)
3994 {
3995 SCHECK_PARTIAL();
3996 break;
3997 }
3998 GETCHARLENTEST(fc, Feptr, len);
3999 switch(fc)
4000 {
4001 HSPACE_CASES:
4002 VSPACE_CASES:
4003 if (Lctype == OP_NOTPROP) goto ENDLOOP99; /* Break the loop */
4004 break;
4005
4006 default:
4007 if ((UCD_CATEGORY(fc) == ucp_Z) == (Lctype == OP_NOTPROP))
4008 goto ENDLOOP99; /* Break the loop */
4009 break;
4010 }
4011 Feptr+= len;
4012 }
4013 ENDLOOP99:
4014 break;
4015
4016 case PT_WORD:
4017 for (i = Lmin; i < Lmax; i++)
4018 {
4019 int category;
4020 int len = 1;
4021 if (Feptr >= mb->end_subject)
4022 {
4023 SCHECK_PARTIAL();
4024 break;
4025 }
4026 GETCHARLENTEST(fc, Feptr, len);
4027 category = UCD_CATEGORY(fc);
4028 if ((category == ucp_L || category == ucp_N ||
4029 fc == CHAR_UNDERSCORE) == (Lctype == OP_NOTPROP))
4030 break;
4031 Feptr+= len;
4032 }
4033 break;
4034
4035 case PT_CLIST:
4036 for (i = Lmin; i < Lmax; i++)
4037 {
4038 const uint32_t *cp;
4039 int len = 1;
4040 if (Feptr >= mb->end_subject)
4041 {
4042 SCHECK_PARTIAL();
4043 break;
4044 }
4045 GETCHARLENTEST(fc, Feptr, len);
4046 cp = PRIV(ucd_caseless_sets) + Lpropvalue;
4047 for (;;)
4048 {
4049 if (fc < *cp)
4050 { if (Lctype == OP_NOTPROP) break; else goto GOT_MAX; }
4051 if (fc == *cp++)
4052 { if (Lctype == OP_NOTPROP) goto GOT_MAX; else break; }
4053 }
4054 Feptr += len;
4055 }
4056 GOT_MAX:
4057 break;
4058
4059 case PT_UCNC:
4060 for (i = Lmin; i < Lmax; i++)
4061 {
4062 int len = 1;
4063 if (Feptr >= mb->end_subject)
4064 {
4065 SCHECK_PARTIAL();
4066 break;
4067 }
4068 GETCHARLENTEST(fc, Feptr, len);
4069 if ((fc == CHAR_DOLLAR_SIGN || fc == CHAR_COMMERCIAL_AT ||
4070 fc == CHAR_GRAVE_ACCENT || (fc >= 0xa0 && fc <= 0xd7ff) ||
4071 fc >= 0xe000) == (Lctype == OP_NOTPROP))
4072 break;
4073 Feptr += len;
4074 }
4075 break;
4076
4077 default:
4078 return PCRE2_ERROR_INTERNAL;
4079 }
4080
4081 /* Feptr is now past the end of the maximum run */
4082
4083 if (reptype == REPTYPE_POS) continue; /* No backtracking */
4084
4085 /* After \C in UTF mode, Lstart_eptr might be in the middle of a
4086 Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't
4087 go too far. */
4088
4089 for(;;)
4090 {
4091 if (Feptr <= Lstart_eptr) break;
4092 RMATCH(Fecode, RM222);
4093 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4094 Feptr--;
4095 if (utf) BACKCHAR(Feptr);
4096 }
4097 }
4098
4099 /* Match extended Unicode grapheme clusters. We will get here only if the
4100 support is in the binary; otherwise a compile-time error occurs. */
4101
4102 else if (Lctype == OP_EXTUNI)
4103 {
4104 for (i = Lmin; i < Lmax; i++)
4105 {
4106 if (Feptr >= mb->end_subject)
4107 {
4108 SCHECK_PARTIAL();
4109 break;
4110 }
4111 else
4112 {
4113 GETCHARINCTEST(fc, Feptr);
4114 Feptr = PRIV(extuni)(fc, Feptr, mb->start_subject, mb->end_subject,
4115 utf, NULL);
4116 }
4117 CHECK_PARTIAL();
4118 }
4119
4120 /* Feptr is now past the end of the maximum run */
4121
4122 if (reptype == REPTYPE_POS) continue; /* No backtracking */
4123
4124 /* We use <= Lstart_eptr rather than == Lstart_eptr to detect the start
4125 of the run while backtracking because the use of \C in UTF mode can
4126 cause BACKCHAR to move back past Lstart_eptr. This is just palliative;
4127 the use of \C in UTF mode is fraught with danger. */
4128
4129 for(;;)
4130 {
4131 int lgb, rgb;
4132 PCRE2_SPTR fptr;
4133
4134 if (Feptr <= Lstart_eptr) break; /* At start of char run */
4135 RMATCH(Fecode, RM220);
4136 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4137
4138 /* Backtracking over an extended grapheme cluster involves inspecting
4139 the previous two characters (if present) to see if a break is
4140 permitted between them. */
4141
4142 Feptr--;
4143 if (!utf) fc = *Feptr; else
4144 {
4145 BACKCHAR(Feptr);
4146 GETCHAR(fc, Feptr);
4147 }
4148 rgb = UCD_GRAPHBREAK(fc);
4149
4150 for (;;)
4151 {
4152 if (Feptr <= Lstart_eptr) break; /* At start of char run */
4153 fptr = Feptr - 1;
4154 if (!utf) fc = *fptr; else
4155 {
4156 BACKCHAR(fptr);
4157 GETCHAR(fc, fptr);
4158 }
4159 lgb = UCD_GRAPHBREAK(fc);
4160 if ((PRIV(ucp_gbtable)[lgb] & (1u << rgb)) == 0) break;
4161 Feptr = fptr;
4162 rgb = lgb;
4163 }
4164 }
4165 }
4166
4167 else
4168 #endif /* SUPPORT_UNICODE */
4169
4170 #ifdef SUPPORT_UNICODE
4171 if (utf)
4172 {
4173 switch(Lctype)
4174 {
4175 case OP_ANY:
4176 for (i = Lmin; i < Lmax; i++)
4177 {
4178 if (Feptr >= mb->end_subject)
4179 {
4180 SCHECK_PARTIAL();
4181 break;
4182 }
4183 if (IS_NEWLINE(Feptr)) break;
4184 if (mb->partial != 0 && /* Take care with CRLF partial */
4185 Feptr + 1 >= mb->end_subject &&
4186 NLBLOCK->nltype == NLTYPE_FIXED &&
4187 NLBLOCK->nllen == 2 &&
4188 UCHAR21(Feptr) == NLBLOCK->nl[0])
4189 {
4190 mb->hitend = TRUE;
4191 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
4192 }
4193 Feptr++;
4194 ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++);
4195 }
4196 break;
4197
4198 case OP_ALLANY:
4199 if (Lmax < UINT32_MAX)
4200 {
4201 for (i = Lmin; i < Lmax; i++)
4202 {
4203 if (Feptr >= mb->end_subject)
4204 {
4205 SCHECK_PARTIAL();
4206 break;
4207 }
4208 Feptr++;
4209 ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++);
4210 }
4211 }
4212 else
4213 {
4214 Feptr = mb->end_subject; /* Unlimited UTF-8 repeat */
4215 SCHECK_PARTIAL();
4216 }
4217 break;
4218
4219 /* The "byte" (i.e. "code unit") case is the same as non-UTF */
4220
4221 case OP_ANYBYTE:
4222 fc = Lmax - Lmin;
4223 if (fc > (uint32_t)(mb->end_subject - Feptr))
4224 {
4225 Feptr = mb->end_subject;
4226 SCHECK_PARTIAL();
4227 }
4228 else Feptr += fc;
4229 break;
4230
4231 case OP_ANYNL:
4232 for (i = Lmin; i < Lmax; i++)
4233 {
4234 int len = 1;
4235 if (Feptr >= mb->end_subject)
4236 {
4237 SCHECK_PARTIAL();
4238 break;
4239 }
4240 GETCHARLEN(fc, Feptr, len);
4241 if (fc == CHAR_CR)
4242 {
4243 if (++Feptr >= mb->end_subject) break;
4244 if (UCHAR21(Feptr) == CHAR_LF) Feptr++;
4245 }
4246 else
4247 {
4248 if (fc != CHAR_LF &&
4249 (mb->bsr_convention == PCRE2_BSR_ANYCRLF ||
4250 (fc != CHAR_VT && fc != CHAR_FF && fc != CHAR_NEL
4251 #ifndef EBCDIC
4252 && fc != 0x2028 && fc != 0x2029
4253 #endif /* Not EBCDIC */
4254 )))
4255 break;
4256 Feptr += len;
4257 }
4258 }
4259 break;
4260
4261 case OP_NOT_HSPACE:
4262 case OP_HSPACE:
4263 for (i = Lmin; i < Lmax; i++)
4264 {
4265 BOOL gotspace;
4266 int len = 1;
4267 if (Feptr >= mb->end_subject)
4268 {
4269 SCHECK_PARTIAL();
4270 break;
4271 }
4272 GETCHARLEN(fc, Feptr, len);
4273 switch(fc)
4274 {
4275 HSPACE_CASES: gotspace = TRUE; break;
4276 default: gotspace = FALSE; break;
4277 }
4278 if (gotspace == (Lctype == OP_NOT_HSPACE)) break;
4279 Feptr += len;
4280 }
4281 break;
4282
4283 case OP_NOT_VSPACE:
4284 case OP_VSPACE:
4285 for (i = Lmin; i < Lmax; i++)
4286 {
4287 BOOL gotspace;
4288 int len = 1;
4289 if (Feptr >= mb->end_subject)
4290 {
4291 SCHECK_PARTIAL();
4292 break;
4293 }
4294 GETCHARLEN(fc, Feptr, len);
4295 switch(fc)
4296 {
4297 VSPACE_CASES: gotspace = TRUE; break;
4298 default: gotspace = FALSE; break;
4299 }
4300 if (gotspace == (Lctype == OP_NOT_VSPACE)) break;
4301 Feptr += len;
4302 }
4303 break;
4304
4305 case OP_NOT_DIGIT:
4306 for (i = Lmin; i < Lmax; i++)
4307 {
4308 int len = 1;
4309 if (Feptr >= mb->end_subject)
4310 {
4311 SCHECK_PARTIAL();
4312 break;
4313 }
4314 GETCHARLEN(fc, Feptr, len);
4315 if (fc < 256 && (mb->ctypes[fc] & ctype_digit) != 0) break;
4316 Feptr+= len;
4317 }
4318 break;
4319
4320 case OP_DIGIT:
4321 for (i = Lmin; i < Lmax; i++)
4322 {
4323 int len = 1;
4324 if (Feptr >= mb->end_subject)
4325 {
4326 SCHECK_PARTIAL();
4327 break;
4328 }
4329 GETCHARLEN(fc, Feptr, len);
4330 if (fc >= 256 ||(mb->ctypes[fc] & ctype_digit) == 0) break;
4331 Feptr+= len;
4332 }
4333 break;
4334
4335 case OP_NOT_WHITESPACE:
4336 for (i = Lmin; i < Lmax; i++)
4337 {
4338 int len = 1;
4339 if (Feptr >= mb->end_subject)
4340 {
4341 SCHECK_PARTIAL();
4342 break;
4343 }
4344 GETCHARLEN(fc, Feptr, len);
4345 if (fc < 256 && (mb->ctypes[fc] & ctype_space) != 0) break;
4346 Feptr+= len;
4347 }
4348 break;
4349
4350 case OP_WHITESPACE:
4351 for (i = Lmin; i < Lmax; i++)
4352 {
4353 int len = 1;
4354 if (Feptr >= mb->end_subject)
4355 {
4356 SCHECK_PARTIAL();
4357 break;
4358 }
4359 GETCHARLEN(fc, Feptr, len);
4360 if (fc >= 256 ||(mb->ctypes[fc] & ctype_space) == 0) break;
4361 Feptr+= len;
4362 }
4363 break;
4364
4365 case OP_NOT_WORDCHAR:
4366 for (i = Lmin; i < Lmax; i++)
4367 {
4368 int len = 1;
4369 if (Feptr >= mb->end_subject)
4370 {
4371 SCHECK_PARTIAL();
4372 break;
4373 }
4374 GETCHARLEN(fc, Feptr, len);
4375 if (fc < 256 && (mb->ctypes[fc] & ctype_word) != 0) break;
4376 Feptr+= len;
4377 }
4378 break;
4379
4380 case OP_WORDCHAR:
4381 for (i = Lmin; i < Lmax; i++)
4382 {
4383 int len = 1;
4384 if (Feptr >= mb->end_subject)
4385 {
4386 SCHECK_PARTIAL();
4387 break;
4388 }
4389 GETCHARLEN(fc, Feptr, len);
4390 if (fc >= 256 || (mb->ctypes[fc] & ctype_word) == 0) break;
4391 Feptr+= len;
4392 }
4393 break;
4394
4395 default:
4396 return PCRE2_ERROR_INTERNAL;
4397 }
4398
4399 if (reptype == REPTYPE_POS) continue; /* No backtracking */
4400
4401 /* After \C in UTF mode, Lstart_eptr might be in the middle of a
4402 Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't go
4403 too far. */
4404
4405 for(;;)
4406 {
4407 if (Feptr <= Lstart_eptr) break;
4408 RMATCH(Fecode, RM221);
4409 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4410 Feptr--;
4411 BACKCHAR(Feptr);
4412 if (Lctype == OP_ANYNL && Feptr > Lstart_eptr &&
4413 UCHAR21(Feptr) == CHAR_NL && UCHAR21(Feptr - 1) == CHAR_CR)
4414 Feptr--;
4415 }
4416 }
4417 else
4418 #endif /* SUPPORT_UNICODE */
4419
4420 /* Not UTF mode */
4421 {
4422 switch(Lctype)
4423 {
4424 case OP_ANY:
4425 for (i = Lmin; i < Lmax; i++)
4426 {
4427 if (Feptr >= mb->end_subject)
4428 {
4429 SCHECK_PARTIAL();
4430 break;
4431 }
4432 if (IS_NEWLINE(Feptr)) break;
4433 if (mb->partial != 0 && /* Take care with CRLF partial */
4434 Feptr + 1 >= mb->end_subject &&
4435 NLBLOCK->nltype == NLTYPE_FIXED &&
4436 NLBLOCK->nllen == 2 &&
4437 *Feptr == NLBLOCK->nl[0])
4438 {
4439 mb->hitend = TRUE;
4440 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
4441 }
4442 Feptr++;
4443 }
4444 break;
4445
4446 case OP_ALLANY:
4447 case OP_ANYBYTE:
4448 fc = Lmax - Lmin;
4449 if (fc > (uint32_t)(mb->end_subject - Feptr))
4450 {
4451 Feptr = mb->end_subject;
4452 SCHECK_PARTIAL();
4453 }
4454 else Feptr += fc;
4455 break;
4456
4457 case OP_ANYNL:
4458 for (i = Lmin; i < Lmax; i++)
4459 {
4460 if (Feptr >= mb->end_subject)
4461 {
4462 SCHECK_PARTIAL();
4463 break;
4464 }
4465 fc = *Feptr;
4466 if (fc == CHAR_CR)
4467 {
4468 if (++Feptr >= mb->end_subject) break;
4469 if (*Feptr == CHAR_LF) Feptr++;
4470 }
4471 else
4472 {
4473 if (fc != CHAR_LF && (mb->bsr_convention == PCRE2_BSR_ANYCRLF ||
4474 (fc != CHAR_VT && fc != CHAR_FF && fc != CHAR_NEL
4475 #if PCRE2_CODE_UNIT_WIDTH != 8
4476 && fc != 0x2028 && fc != 0x2029
4477 #endif
4478 ))) break;
4479 Feptr++;
4480 }
4481 }
4482 break;
4483
4484 case OP_NOT_HSPACE:
4485 for (i = Lmin; i < Lmax; i++)
4486 {
4487 if (Feptr >= mb->end_subject)
4488 {
4489 SCHECK_PARTIAL();
4490 break;
4491 }
4492 switch(*Feptr)
4493 {
4494 default: Feptr++; break;
4495 HSPACE_BYTE_CASES:
4496 #if PCRE2_CODE_UNIT_WIDTH != 8
4497 HSPACE_MULTIBYTE_CASES:
4498 #endif
4499 goto ENDLOOP00;
4500 }
4501 }
4502 ENDLOOP00:
4503 break;
4504
4505 case OP_HSPACE:
4506 for (i = Lmin; i < Lmax; i++)
4507 {
4508 if (Feptr >= mb->end_subject)
4509 {
4510 SCHECK_PARTIAL();
4511 break;
4512 }
4513 switch(*Feptr)
4514 {
4515 default: goto ENDLOOP01;
4516 HSPACE_BYTE_CASES:
4517 #if PCRE2_CODE_UNIT_WIDTH != 8
4518 HSPACE_MULTIBYTE_CASES:
4519 #endif
4520 Feptr++; break;
4521 }
4522 }
4523 ENDLOOP01:
4524 break;
4525
4526 case OP_NOT_VSPACE:
4527 for (i = Lmin; i < Lmax; i++)
4528 {
4529 if (Feptr >= mb->end_subject)
4530 {
4531 SCHECK_PARTIAL();
4532 break;
4533 }
4534 switch(*Feptr)
4535 {
4536 default: Feptr++; break;
4537 VSPACE_BYTE_CASES:
4538 #if PCRE2_CODE_UNIT_WIDTH != 8
4539 VSPACE_MULTIBYTE_CASES:
4540 #endif
4541 goto ENDLOOP02;
4542 }
4543 }
4544 ENDLOOP02:
4545 break;
4546
4547 case OP_VSPACE:
4548 for (i = Lmin; i < Lmax; i++)
4549 {
4550 if (Feptr >= mb->end_subject)
4551 {
4552 SCHECK_PARTIAL();
4553 break;
4554 }
4555 switch(*Feptr)
4556 {
4557 default: goto ENDLOOP03;
4558 VSPACE_BYTE_CASES:
4559 #if PCRE2_CODE_UNIT_WIDTH != 8
4560 VSPACE_MULTIBYTE_CASES:
4561 #endif
4562 Feptr++; break;
4563 }
4564 }
4565 ENDLOOP03:
4566 break;
4567
4568 case OP_NOT_DIGIT:
4569 for (i = Lmin; i < Lmax; i++)
4570 {
4571 if (Feptr >= mb->end_subject)
4572 {
4573 SCHECK_PARTIAL();
4574 break;
4575 }
4576 if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_digit) != 0)
4577 break;
4578 Feptr++;
4579 }
4580 break;
4581
4582 case OP_DIGIT:
4583 for (i = Lmin; i < Lmax; i++)
4584 {
4585 if (Feptr >= mb->end_subject)
4586 {
4587 SCHECK_PARTIAL();
4588 break;
4589 }
4590 if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_digit) == 0)
4591 break;
4592 Feptr++;
4593 }
4594 break;
4595
4596 case OP_NOT_WHITESPACE:
4597 for (i = Lmin; i < Lmax; i++)
4598 {
4599 if (Feptr >= mb->end_subject)
4600 {
4601 SCHECK_PARTIAL();
4602 break;
4603 }
4604 if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_space) != 0)
4605 break;
4606 Feptr++;
4607 }
4608 break;
4609
4610 case OP_WHITESPACE:
4611 for (i = Lmin; i < Lmax; i++)
4612 {
4613 if (Feptr >= mb->end_subject)
4614 {
4615 SCHECK_PARTIAL();
4616 break;
4617 }
4618 if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_space) == 0)
4619 break;
4620 Feptr++;
4621 }
4622 break;
4623
4624 case OP_NOT_WORDCHAR:
4625 for (i = Lmin; i < Lmax; i++)
4626 {
4627 if (Feptr >= mb->end_subject)
4628 {
4629 SCHECK_PARTIAL();
4630 break;
4631 }
4632 if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_word) != 0)
4633 break;
4634 Feptr++;
4635 }
4636 break;
4637
4638 case OP_WORDCHAR:
4639 for (i = Lmin; i < Lmax; i++)
4640 {
4641 if (Feptr >= mb->end_subject)
4642 {
4643 SCHECK_PARTIAL();
4644 break;
4645 }
4646 if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_word) == 0)
4647 break;
4648 Feptr++;
4649 }
4650 break;
4651
4652 default:
4653 return PCRE2_ERROR_INTERNAL;
4654 }
4655
4656 if (reptype == REPTYPE_POS) continue; /* No backtracking */
4657
4658 for (;;)
4659 {
4660 if (Feptr == Lstart_eptr) break;
4661 RMATCH(Fecode, RM34);
4662 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4663 Feptr--;
4664 if (Lctype == OP_ANYNL && Feptr > Lstart_eptr && *Feptr == CHAR_LF &&
4665 Feptr[-1] == CHAR_CR) Feptr--;
4666 }
4667 }
4668 }
4669 break; /* End of repeat character type processing */
4670
4671 #undef Lstart_eptr
4672 #undef Lmin
4673 #undef Lmax
4674 #undef Lctype
4675 #undef Lpropvalue
4676
4677
4678 /* ===================================================================== */
4679 /* Match a back reference, possibly repeatedly. Look past the end of the
4680 item to see if there is repeat information following. The OP_REF and
4681 OP_REFI opcodes are used for a reference to a numbered group or to a
4682 non-duplicated named group. For a duplicated named group, OP_DNREF and
4683 OP_DNREFI are used. In this case we must scan the list of groups to which
4684 the name refers, and use the first one that is set. */
4685
4686 #define Lmin F->temp_32[0]
4687 #define Lmax F->temp_32[1]
4688 #define Lcaseless F->temp_32[2]
4689 #define Lstart F->temp_sptr[0]
4690 #define Loffset F->temp_size
4691
4692 case OP_DNREF:
4693 case OP_DNREFI:
4694 Lcaseless = (Fop == OP_DNREFI);
4695 {
4696 int count = GET2(Fecode, 1+IMM2_SIZE);
4697 PCRE2_SPTR slot = mb->name_table + GET2(Fecode, 1) * mb->name_entry_size;
4698 Fecode += 1 + 2*IMM2_SIZE;
4699
4700 while (count-- > 0)
4701 {
4702 Loffset = (GET2(slot, 0) << 1) - 2;
4703 if (Loffset < Foffset_top && Fovector[Loffset] != PCRE2_UNSET) break;
4704 slot += mb->name_entry_size;
4705 }
4706 }
4707 goto REF_REPEAT;
4708
4709 case OP_REF:
4710 case OP_REFI:
4711 Lcaseless = (Fop == OP_REFI);
4712 Loffset = (GET2(Fecode, 1) << 1) - 2;
4713 Fecode += 1 + IMM2_SIZE;
4714
4715 /* Set up for repetition, or handle the non-repeated case. The maximum and
4716 minimum must be in the heap frame, but as they are short-term values, we
4717 use temporary fields. */
4718
4719 REF_REPEAT:
4720 switch (*Fecode)
4721 {
4722 case OP_CRSTAR:
4723 case OP_CRMINSTAR:
4724 case OP_CRPLUS:
4725 case OP_CRMINPLUS:
4726 case OP_CRQUERY:
4727 case OP_CRMINQUERY:
4728 fc = *Fecode++ - OP_CRSTAR;
4729 Lmin = rep_min[fc];
4730 Lmax = rep_max[fc];
4731 reptype = rep_typ[fc];
4732 break;
4733
4734 case OP_CRRANGE:
4735 case OP_CRMINRANGE:
4736 Lmin = GET2(Fecode, 1);
4737 Lmax = GET2(Fecode, 1 + IMM2_SIZE);
4738 reptype = rep_typ[*Fecode - OP_CRSTAR];
4739 if (Lmax == 0) Lmax = UINT32_MAX; /* Max 0 => infinity */
4740 Fecode += 1 + 2 * IMM2_SIZE;
4741 break;
4742
4743 default: /* No repeat follows */
4744 {
4745 rrc = match_ref(Loffset, Lcaseless, F, mb, &length);
4746 if (rrc != 0)
4747 {
4748 if (rrc > 0) Feptr = mb->end_subject; /* Partial match */
4749 CHECK_PARTIAL();
4750 RRETURN(MATCH_NOMATCH);
4751 }
4752 }
4753 Feptr += length;
4754 continue; /* With the main loop */
4755 }
4756
4757 /* Handle repeated back references. If a set group has length zero, just
4758 continue with the main loop, because it matches however many times. For an
4759 unset reference, if the minimum is zero, we can also just continue. We can
4760 also continue if PCRE2_MATCH_UNSET_BACKREF is set, because this makes an
4761 unset group behave as a zero-length group. For any other unset case,
4762 carrying on will result in NOMATCH. */
4763
4764 if (Loffset < Foffset_top && Fovector[Loffset] != PCRE2_UNSET)
4765 {
4766 if (Fovector[Loffset] == Fovector[Loffset + 1]) continue;
4767 }
4768 else /* Group is not set */
4769 {
4770 if (Lmin == 0 || (mb->poptions & PCRE2_MATCH_UNSET_BACKREF) != 0)
4771 continue;
4772 }
4773
4774 /* First, ensure the minimum number of matches are present. */
4775
4776 for (i = 1; i <= Lmin; i++)
4777 {
4778 PCRE2_SIZE slength;
4779 rrc = match_ref(Loffset, Lcaseless, F, mb, &slength);
4780 if (rrc != 0)
4781 {
4782 if (rrc > 0) Feptr = mb->end_subject; /* Partial match */
4783 CHECK_PARTIAL();
4784 RRETURN(MATCH_NOMATCH);
4785 }
4786 Feptr += slength;
4787 }
4788
4789 /* If min = max, we are done. They are not both allowed to be zero. */
4790
4791 if (Lmin == Lmax) continue;
4792
4793 /* If minimizing, keep trying and advancing the pointer. */
4794
4795 if (reptype == REPTYPE_MIN)
4796 {
4797 for (;;)
4798 {
4799 PCRE2_SIZE slength;
4800 RMATCH(Fecode, RM20);
4801 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4802 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
4803 rrc = match_ref(Loffset, Lcaseless, F, mb, &slength);
4804 if (rrc != 0)
4805 {
4806 if (rrc > 0) Feptr = mb->end_subject; /* Partial match */
4807 CHECK_PARTIAL();
4808 RRETURN(MATCH_NOMATCH);
4809 }
4810 Feptr += slength;
4811 }
4812 /* Control never gets here */
4813 }
4814
4815 /* If maximizing, find the longest string and work backwards, as long as
4816 the matched lengths for each iteration are the same. */
4817
4818 else
4819 {
4820 BOOL samelengths = TRUE;
4821 Lstart = Feptr; /* Starting position */
4822 Flength = Fovector[Loffset+1] - Fovector[Loffset];
4823
4824 for (i = Lmin; i < Lmax; i++)
4825 {
4826 PCRE2_SIZE slength;
4827 rrc = match_ref(Loffset, Lcaseless, F, mb, &slength);
4828 if (rrc != 0)
4829 {
4830 /* Can't use CHECK_PARTIAL because we don't want to update Feptr in
4831 the soft partial matching case. */
4832
4833 if (rrc > 0 && mb->partial != 0 &&
4834 mb->end_subject > mb->start_used_ptr)
4835 {
4836 mb->hitend = TRUE;
4837 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
4838 }
4839 break;
4840 }
4841
4842 if (slength != Flength) samelengths = FALSE;
4843 Feptr += slength;
4844 }
4845
4846 /* If the length matched for each repetition is the same as the length of
4847 the captured group, we can easily work backwards. This is the normal
4848 case. In caseless UTF-8 mode, however, there are pairs of case-equivalent
4849 characters whose lengths (in terms of code units) differ. Because this is
4850 very rare, we handle it by re-matching fewer and fewer times. */
4851
4852 if (samelengths)
4853 {
4854 while (Feptr >= Lstart)
4855 {
4856 RMATCH(Fecode, RM21);
4857 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4858 Feptr -= Flength;
4859 }
4860 }
4861
4862 /* The rare case of non-matching lengths. Re-scan the repetition for each
4863 iteration. We know that match_ref() will succeed every time. */
4864
4865 else
4866 {
4867 Lmax = i;
4868 for (;;)
4869 {
4870 RMATCH(Fecode, RM22);
4871 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4872 if (Feptr == Lstart) break; /* Failed after minimal repetition */
4873 Feptr = Lstart;
4874 Lmax--;
4875 for (i = Lmin; i < Lmax; i++)
4876 {
4877 PCRE2_SIZE slength;
4878 (void)match_ref(Loffset, Lcaseless, F, mb, &slength);
4879 Feptr += slength;
4880 }
4881 }
4882 }
4883
4884 RRETURN(MATCH_NOMATCH);
4885 }
4886 /* Control never gets here */
4887
4888 #undef Lcaseless
4889 #undef Lmin
4890 #undef Lmax
4891 #undef Lstart
4892 #undef Loffset
4893
4894
4895
4896 /* ========================================================================= */
4897 /* Opcodes for the start of various parenthesized items */
4898 /* ========================================================================= */
4899
4900 /* In all cases, if the result of RMATCH() is MATCH_THEN, check whether the
4901 (*THEN) is within the current branch by comparing the address of OP_THEN
4902 that is passed back with the end of the branch. If (*THEN) is within the
4903 current branch, and the branch is one of two or more alternatives (it
4904 either starts or ends with OP_ALT), we have reached the limit of THEN's
4905 action, so convert the return code to NOMATCH, which will cause normal
4906 backtracking to happen from now on. Otherwise, THEN is passed back to an
4907 outer alternative. This implements Perl's treatment of parenthesized
4908 groups, where a group not containing | does not affect the current
4909 alternative, that is, (X) is NOT the same as (X|(*F)). */
4910
4911
4912 /* ===================================================================== */
4913 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a non-possessive
4914 bracket group, indicating that it may occur zero times. It may repeat
4915 infinitely, or not at all - i.e. it could be ()* or ()? or even (){0} in
4916 the pattern. Brackets with fixed upper repeat limits are compiled as a
4917 number of copies, with the optional ones preceded by BRAZERO or BRAMINZERO.
4918 Possessive groups with possible zero repeats are preceded by BRAPOSZERO. */
4919
4920 #define Lnext_ecode F->temp_sptr[0]
4921
4922 case OP_BRAZERO:
4923 Lnext_ecode = Fecode + 1;
4924 RMATCH(Lnext_ecode, RM9);
4925 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4926 do Lnext_ecode += GET(Lnext_ecode, 1); while (*Lnext_ecode == OP_ALT);
4927 Fecode = Lnext_ecode + 1 + LINK_SIZE;
4928 break;
4929
4930 case OP_BRAMINZERO:
4931 Lnext_ecode = Fecode + 1;
4932 do Lnext_ecode += GET(Lnext_ecode, 1); while (*Lnext_ecode == OP_ALT);
4933 RMATCH(Lnext_ecode + 1 + LINK_SIZE, RM10);
4934 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4935 Fecode++;
4936 break;
4937
4938 #undef Lnext_ecode
4939
4940 case OP_SKIPZERO:
4941 Fecode++;
4942 do Fecode += GET(Fecode,1); while (*Fecode == OP_ALT);
4943 Fecode += 1 + LINK_SIZE;
4944 break;
4945
4946
4947 /* ===================================================================== */
4948 /* Handle possessive brackets with an unlimited repeat. The end of these
4949 brackets will always be OP_KETRPOS, which returns MATCH_KETRPOS without
4950 going further in the pattern. */
4951
4952 #define Lframe_type F->temp_32[0]
4953 #define Lmatched_once F->temp_32[1]
4954 #define Lzero_allowed F->temp_32[2]
4955 #define Lstart_eptr F->temp_sptr[0]
4956 #define Lstart_group F->temp_sptr[1]
4957
4958 case OP_BRAPOSZERO:
4959 Lzero_allowed = TRUE; /* Zero repeat is allowed */
4960 Fecode += 1;
4961 if (*Fecode == OP_CBRAPOS || *Fecode == OP_SCBRAPOS)
4962 goto POSSESSIVE_CAPTURE;
4963 goto POSSESSIVE_NON_CAPTURE;
4964
4965 case OP_BRAPOS:
4966 case OP_SBRAPOS:
4967 Lzero_allowed = FALSE; /* Zero repeat not allowed */
4968
4969 POSSESSIVE_NON_CAPTURE:
4970 Lframe_type = GF_NOCAPTURE; /* Remembered frame type */
4971 goto POSSESSIVE_GROUP;
4972
4973 case OP_CBRAPOS:
4974 case OP_SCBRAPOS:
4975 Lzero_allowed = FALSE; /* Zero repeat not allowed */
4976
4977 POSSESSIVE_CAPTURE:
4978 number = GET2(Fecode, 1+LINK_SIZE);
4979 Lframe_type = GF_CAPTURE | number; /* Remembered frame type */
4980
4981 POSSESSIVE_GROUP:
4982 Lmatched_once = FALSE; /* Never matched */
4983 Lstart_group = Fecode; /* Start of this group */
4984
4985 for (;;)
4986 {
4987 Lstart_eptr = Feptr; /* Position at group start */
4988 group_frame_type = Lframe_type;
4989 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM8);
4990 if (rrc == MATCH_KETRPOS)
4991 {
4992 Lmatched_once = TRUE; /* Matched at least once */
4993 if (Feptr == Lstart_eptr) /* Empty match; skip to end */
4994 {
4995 do Fecode += GET(Fecode, 1); while (*Fecode == OP_ALT);
4996 break;
4997 }
4998
4999 Fecode = Lstart_group;
5000 continue;
5001 }
5002
5003 /* See comment above about handling THEN. */
5004
5005 if (rrc == MATCH_THEN)
5006 {
5007 PCRE2_SPTR next_ecode = Fecode + GET(Fecode,1);
5008 if (mb->verb_ecode_ptr < next_ecode &&
5009 (*Fecode == OP_ALT || *next_ecode == OP_ALT))
5010 rrc = MATCH_NOMATCH;
5011 }
5012
5013 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5014 Fecode += GET(Fecode, 1);
5015 if (*Fecode != OP_ALT) break;
5016 }
5017
5018 /* Success if matched something or zero repeat allowed */
5019
5020 if (Lmatched_once || Lzero_allowed)
5021 {
5022 Fecode += 1 + LINK_SIZE;
5023 break;
5024 }
5025
5026 RRETURN(MATCH_NOMATCH);
5027
5028 #undef Lmatched_once
5029 #undef Lzero_allowed
5030 #undef Lframe_type
5031 #undef Lstart_eptr
5032 #undef Lstart_group
5033
5034
5035 /* ===================================================================== */
5036 /* Handle non-capturing brackets that cannot match an empty string. When we
5037 get to the final alternative within the brackets, as long as there are no
5038 THEN's in the pattern, we can optimize by not recording a new backtracking
5039 point. (Ideally we should test for a THEN within this group, but we don't
5040 have that information.) Don't do this if we are at the very top level,
5041 however, because that would make handling assertions and once-only brackets
5042 messier when there is nothing to go back to. */
5043
5044 #define Lframe_type F->temp_32[0] /* Set for all that use GROUPLOOP */
5045 #define Lnext_branch F->temp_sptr[0] /* Used only in OP_BRA handling */
5046
5047 case OP_BRA:
5048 if (mb->hasthen || Frdepth == 0)
5049 {
5050 Lframe_type = 0;
5051 goto GROUPLOOP;
5052 }
5053
5054 for (;;)
5055 {
5056 Lnext_branch = Fecode + GET(Fecode, 1);
5057 if (*Lnext_branch != OP_ALT) break;
5058
5059 /* This is never the final branch. We do not need to test for MATCH_THEN
5060 here because this code is not used when there is a THEN in the pattern. */
5061
5062 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM1);
5063 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5064 Fecode = Lnext_branch;
5065 }
5066
5067 /* Hit the start of the final branch. Continue at this level. */
5068
5069 Fecode += PRIV(OP_lengths)[*Fecode];
5070 break;
5071
5072 #undef Lnext_branch
5073
5074
5075 /* ===================================================================== */
5076 /* Handle a capturing bracket, other than those that are possessive with an
5077 unlimited repeat. */
5078
5079 case OP_CBRA:
5080 case OP_SCBRA:
5081 Lframe_type = GF_CAPTURE | GET2(Fecode, 1+LINK_SIZE);
5082 goto GROUPLOOP;
5083
5084
5085 /* ===================================================================== */
5086 /* Atomic groups and non-capturing brackets that can match an empty string
5087 must record a backtracking point and also set up a chained frame. */
5088
5089 case OP_ONCE:
5090 case OP_SCRIPT_RUN:
5091 case OP_SBRA:
5092 Lframe_type = GF_NOCAPTURE | Fop;
5093
5094 GROUPLOOP:
5095 for (;;)
5096 {
5097 group_frame_type = Lframe_type;
5098 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM2);
5099 if (rrc == MATCH_THEN)
5100 {
5101 PCRE2_SPTR next_ecode = Fecode + GET(Fecode,1);
5102 if (mb->verb_ecode_ptr < next_ecode &&
5103 (*Fecode == OP_ALT || *next_ecode == OP_ALT))
5104 rrc = MATCH_NOMATCH;
5105 }
5106 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5107 Fecode += GET(Fecode, 1);
5108 if (*Fecode != OP_ALT) RRETURN(MATCH_NOMATCH);
5109 }
5110 /* Control never reaches here. */
5111
5112 #undef Lframe_type
5113
5114
5115 /* ===================================================================== */
5116 /* Recursion either matches the current regex, or some subexpression. The
5117 offset data is the offset to the starting bracket from the start of the
5118 whole pattern. (This is so that it works from duplicated subpatterns.) */
5119
5120 #define Lframe_type F->temp_32[0]
5121 #define Lstart_branch F->temp_sptr[0]
5122
5123 case OP_RECURSE:
5124 bracode = mb->start_code + GET(Fecode, 1);
5125 number = (bracode == mb->start_code)? 0 : GET2(bracode, 1 + LINK_SIZE);
5126
5127 /* If we are already in a recursion, check for repeating the same one
5128 without advancing the subject pointer. This should catch convoluted mutual
5129 recursions. (Some simple cases are caught at compile time.) */
5130
5131 if (Fcurrent_recurse != RECURSE_UNSET)
5132 {
5133 offset = Flast_group_offset;
5134 while (offset != PCRE2_UNSET)
5135 {
5136 N = (heapframe *)((char *)mb->match_frames + offset);
5137 P = (heapframe *)((char *)N - frame_size);
5138 if (N->group_frame_type == (GF_RECURSE | number))
5139 {
5140 if (Feptr == P->eptr) return PCRE2_ERROR_RECURSELOOP;
5141 break;
5142 }
5143 offset = P->last_group_offset;
5144 }
5145 }
5146
5147 /* Now run the recursion, branch by branch. */
5148
5149 Lstart_branch = bracode;
5150 Lframe_type = GF_RECURSE | number;
5151
5152 for (;;)
5153 {
5154 PCRE2_SPTR next_ecode;
5155
5156 group_frame_type = Lframe_type;
5157 RMATCH(Lstart_branch + PRIV(OP_lengths)[*Lstart_branch], RM11);
5158 next_ecode = Lstart_branch + GET(Lstart_branch,1);
5159
5160 /* Handle backtracking verbs, which are defined in a range that can
5161 easily be tested for. PCRE does not allow THEN, SKIP, PRUNE or COMMIT to
5162 escape beyond a recursion; they cause a NOMATCH for the entire recursion.
5163
5164 When one of these verbs triggers, the current recursion group number is
5165 recorded. If it matches the recursion we are processing, the verb
5166 happened within the recursion and we must deal with it. Otherwise it must
5167 have happened after the recursion completed, and so has to be passed
5168 back. See comment above about handling THEN. */
5169
5170 if (rrc >= MATCH_BACKTRACK_MIN && rrc <= MATCH_BACKTRACK_MAX &&
5171 mb->verb_current_recurse == (Lframe_type ^ GF_RECURSE))
5172 {
5173 if (rrc == MATCH_THEN && mb->verb_ecode_ptr < next_ecode &&
5174 (*Lstart_branch == OP_ALT || *next_ecode == OP_ALT))
5175 rrc = MATCH_NOMATCH;
5176 else RRETURN(MATCH_NOMATCH);
5177 }
5178
5179 /* Note that carrying on after (*ACCEPT) in a recursion is handled in the
5180 OP_ACCEPT code. Nothing needs to be done here. */
5181
5182 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5183 Lstart_branch = next_ecode;
5184 if (*Lstart_branch != OP_ALT) RRETURN(MATCH_NOMATCH);
5185 }
5186 /* Control never reaches here. */
5187
5188 #undef Lframe_type
5189 #undef Lstart_branch
5190
5191
5192 /* ===================================================================== */
5193 /* Positive assertions are like other groups except that PCRE doesn't allow
5194 the effect of (*THEN) to escape beyond an assertion; it is therefore
5195 treated as NOMATCH. (*ACCEPT) is treated as a successful assertion, with its
5196 captures and mark retained. Any other return is an error. */
5197
5198 #define Lframe_type F->temp_32[0]
5199
5200 case OP_ASSERT:
5201 case OP_ASSERTBACK:
5202 case OP_ASSERT_NA:
5203 case OP_ASSERTBACK_NA:
5204 Lframe_type = GF_NOCAPTURE | Fop;
5205 for (;;)
5206 {
5207 group_frame_type = Lframe_type;
5208 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM3);
5209 if (rrc == MATCH_ACCEPT)
5210 {
5211 memcpy(Fovector,
5212 (char *)assert_accept_frame + offsetof(heapframe, ovector),
5213 assert_accept_frame->offset_top * sizeof(PCRE2_SIZE));
5214 Foffset_top = assert_accept_frame->offset_top;
5215 Fmark = assert_accept_frame->mark;
5216 break;
5217 }
5218 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
5219 Fecode += GET(Fecode, 1);
5220 if (*Fecode != OP_ALT) RRETURN(MATCH_NOMATCH);
5221 }
5222
5223 do Fecode += GET(Fecode, 1); while (*Fecode == OP_ALT);
5224 Fecode += 1 + LINK_SIZE;
5225 break;
5226
5227 #undef Lframe_type
5228
5229
5230 /* ===================================================================== */
5231 /* Handle negative assertions. Loop for each non-matching branch as for
5232 positive assertions. */
5233
5234 #define Lframe_type F->temp_32[0]
5235
5236 case OP_ASSERT_NOT:
5237 case OP_ASSERTBACK_NOT:
5238 Lframe_type = GF_NOCAPTURE | Fop;
5239
5240 for (;;)
5241 {
5242 group_frame_type = Lframe_type;
5243 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM4);
5244 switch(rrc)
5245 {
5246 case MATCH_ACCEPT: /* Assertion matched, therefore it fails. */
5247 case MATCH_MATCH:
5248 RRETURN (MATCH_NOMATCH);
5249
5250 case MATCH_NOMATCH: /* Branch failed, try next if present. */
5251 case MATCH_THEN:
5252 Fecode += GET(Fecode, 1);
5253 if (*Fecode != OP_ALT) goto ASSERT_NOT_FAILED;
5254 break;
5255
5256 case MATCH_COMMIT: /* Assertion forced to fail, therefore continue. */
5257 case MATCH_SKIP:
5258 case MATCH_PRUNE:
5259 do Fecode += GET(Fecode, 1); while (*Fecode == OP_ALT);
5260 goto ASSERT_NOT_FAILED;
5261
5262 default: /* Pass back any other return */
5263 RRETURN(rrc);
5264 }
5265 }
5266
5267 /* None of the branches have matched or there was a backtrack to (*COMMIT),
5268 (*SKIP), (*PRUNE), or (*THEN) in the last branch. This is success for a
5269 negative assertion, so carry on. */
5270
5271 ASSERT_NOT_FAILED:
5272 Fecode += 1 + LINK_SIZE;
5273 break;
5274
5275 #undef Lframe_type
5276
5277
5278 /* ===================================================================== */
5279 /* The callout item calls an external function, if one is provided, passing
5280 details of the match so far. This is mainly for debugging, though the
5281 function is able to force a failure. */
5282
5283 case OP_CALLOUT:
5284 case OP_CALLOUT_STR:
5285 rrc = do_callout(F, mb, &length);
5286 if (rrc > 0) RRETURN(MATCH_NOMATCH);
5287 if (rrc < 0) RRETURN(rrc);
5288 Fecode += length;
5289 break;
5290
5291
5292 /* ===================================================================== */
5293 /* Conditional group: compilation checked that there are no more than two
5294 branches. If the condition is false, skipping the first branch takes us
5295 past the end of the item if there is only one branch, but that's exactly
5296 what we want. */
5297
5298 case OP_COND:
5299 case OP_SCOND:
5300
5301 /* The variable Flength will be added to Fecode when the condition is
5302 false, to get to the second branch. Setting it to the offset to the ALT or
5303 KET, then incrementing Fecode achieves this effect. However, if the second
5304 branch is non-existent, we must point to the KET so that the end of the
5305 group is correctly processed. We now have Fecode pointing to the condition
5306 or callout. */
5307
5308 Flength = GET(Fecode, 1); /* Offset to the second branch */
5309 if (Fecode[Flength] != OP_ALT) Flength -= 1 + LINK_SIZE;
5310 Fecode += 1 + LINK_SIZE; /* From this opcode */
5311
5312 /* Because of the way auto-callout works during compile, a callout item is
5313 inserted between OP_COND and an assertion condition. Such a callout can
5314 also be inserted manually. */
5315
5316 if (*Fecode == OP_CALLOUT || *Fecode == OP_CALLOUT_STR)
5317 {
5318 rrc = do_callout(F, mb, &length);
5319 if (rrc > 0) RRETURN(MATCH_NOMATCH);
5320 if (rrc < 0) RRETURN(rrc);
5321
5322 /* Advance Fecode past the callout, so it now points to the condition. We
5323 must adjust Flength so that the value of Fecode+Flength is unchanged. */
5324
5325 Fecode += length;
5326 Flength -= length;
5327 }
5328
5329 /* Test the various possible conditions */
5330
5331 condition = FALSE;
5332 switch(*Fecode)
5333 {
5334 case OP_RREF: /* Group recursion test */
5335 if (Fcurrent_recurse != RECURSE_UNSET)
5336 {
5337 number = GET2(Fecode, 1);
5338 condition = (number == RREF_ANY || number == Fcurrent_recurse);
5339 }
5340 break;
5341
5342 case OP_DNRREF: /* Duplicate named group recursion test */
5343 if (Fcurrent_recurse != RECURSE_UNSET)
5344 {
5345 int count = GET2(Fecode, 1 + IMM2_SIZE);
5346 PCRE2_SPTR slot = mb->name_table + GET2(Fecode, 1) * mb->name_entry_size;
5347 while (count-- > 0)
5348 {
5349 number = GET2(slot, 0);
5350 condition = number == Fcurrent_recurse;
5351 if (condition) break;
5352 slot += mb->name_entry_size;
5353 }
5354 }
5355 break;
5356
5357 case OP_CREF: /* Numbered group used test */
5358 offset = (GET2(Fecode, 1) << 1) - 2; /* Doubled ref number */
5359 condition = offset < Foffset_top && Fovector[offset] != PCRE2_UNSET;
5360 break;
5361
5362 case OP_DNCREF: /* Duplicate named group used test */
5363 {
5364 int count = GET2(Fecode, 1 + IMM2_SIZE);
5365 PCRE2_SPTR slot = mb->name_table + GET2(Fecode, 1) * mb->name_entry_size;
5366 while (count-- > 0)
5367 {
5368 offset = (GET2(slot, 0) << 1) - 2;
5369 condition = offset < Foffset_top && Fovector[offset] != PCRE2_UNSET;
5370 if (condition) break;
5371 slot += mb->name_entry_size;
5372 }
5373 }
5374 break;
5375
5376 case OP_FALSE:
5377 case OP_FAIL: /* The assertion (?!) becomes OP_FAIL */
5378 break;
5379
5380 case OP_TRUE:
5381 condition = TRUE;
5382 break;
5383
5384 /* The condition is an assertion. Run code similar to the assertion code
5385 above. */
5386
5387 #define Lpositive F->temp_32[0]
5388 #define Lstart_branch F->temp_sptr[0]
5389
5390 default:
5391 Lpositive = (*Fecode == OP_ASSERT || *Fecode == OP_ASSERTBACK);
5392 Lstart_branch = Fecode;
5393
5394 for (;;)
5395 {
5396 group_frame_type = GF_CONDASSERT | *Fecode;
5397 RMATCH(Lstart_branch + PRIV(OP_lengths)[*Lstart_branch], RM5);
5398
5399 switch(rrc)
5400 {
5401 case MATCH_ACCEPT: /* Save captures */
5402 memcpy(Fovector,
5403 (char *)assert_accept_frame + offsetof(heapframe, ovector),
5404 assert_accept_frame->offset_top * sizeof(PCRE2_SIZE));
5405 Foffset_top = assert_accept_frame->offset_top;
5406
5407 /* Fall through */
5408 /* In the case of a match, the captures have already been put into
5409 the current frame. */
5410
5411 case MATCH_MATCH:
5412 condition = Lpositive; /* TRUE for positive assertion */
5413 break;
5414
5415 /* PCRE doesn't allow the effect of (*THEN) to escape beyond an
5416 assertion; it is therefore always treated as NOMATCH. */
5417
5418 case MATCH_NOMATCH:
5419 case MATCH_THEN:
5420 Lstart_branch += GET(Lstart_branch, 1);
5421 if (*Lstart_branch == OP_ALT) continue; /* Try next branch */
5422 condition = !Lpositive; /* TRUE for negative assertion */
5423 break;
5424
5425 /* These force no match without checking other branches. */
5426
5427 case MATCH_COMMIT:
5428 case MATCH_SKIP:
5429 case MATCH_PRUNE:
5430 condition = !Lpositive;
5431 break;
5432
5433 default:
5434 RRETURN(rrc);
5435 }
5436 break; /* Out of the branch loop */
5437 }
5438
5439 /* If the condition is true, find the end of the assertion so that
5440 advancing past it gets us to the start of the first branch. */
5441
5442 if (condition)
5443 {
5444 do Fecode += GET(Fecode, 1); while (*Fecode == OP_ALT);
5445 }
5446 break; /* End of assertion condition */
5447 }
5448
5449 #undef Lpositive
5450 #undef Lstart_branch
5451
5452 /* Choose branch according to the condition. */
5453
5454 Fecode += condition? PRIV(OP_lengths)[*Fecode] : Flength;
5455
5456 /* If the opcode is OP_SCOND it means we are at a repeated conditional
5457 group that might match an empty string. We must therefore descend a level
5458 so that the start is remembered for checking. For OP_COND we can just
5459 continue at this level. */
5460
5461 if (Fop == OP_SCOND)
5462 {
5463 group_frame_type = GF_NOCAPTURE | Fop;
5464 RMATCH(Fecode, RM35);
5465 RRETURN(rrc);
5466 }
5467 break;
5468
5469
5470
5471 /* ========================================================================= */
5472 /* End of start of parenthesis opcodes */
5473 /* ========================================================================= */
5474
5475
5476 /* ===================================================================== */
5477 /* Move the subject pointer back. This occurs only at the start of each
5478 branch of a lookbehind assertion. If we are too close to the start to move
5479 back, fail. When working in UTF mode we move back a number of characters,
5480 not code units. */
5481
5482 case OP_REVERSE:
5483 number = GET(Fecode, 1);
5484 #ifdef SUPPORT_UNICODE
5485 if (utf)
5486 {
5487 while (number-- > 0)
5488 {
5489 if (Feptr <= mb->check_subject) RRETURN(MATCH_NOMATCH);
5490 Feptr--;
5491 BACKCHAR(Feptr);
5492 }
5493 }
5494 else
5495 #endif
5496
5497 /* No Unicode support, or not in UTF mode: count is code unit count */
5498
5499 {
5500 if ((ptrdiff_t)number > Feptr - mb->start_subject) RRETURN(MATCH_NOMATCH);
5501 Feptr -= number;
5502 }
5503
5504 /* Save the earliest consulted character, then skip to next opcode */
5505
5506 if (Feptr < mb->start_used_ptr) mb->start_used_ptr = Feptr;
5507 Fecode += 1 + LINK_SIZE;
5508 break;
5509
5510
5511 /* ===================================================================== */
5512 /* An alternation is the end of a branch; scan along to find the end of the
5513 bracketed group. */
5514
5515 case OP_ALT:
5516 do Fecode += GET(Fecode,1); while (*Fecode == OP_ALT);
5517 break;
5518
5519
5520 /* ===================================================================== */
5521 /* The end of a parenthesized group. For all but OP_BRA and OP_COND, the
5522 starting frame was added to the chained frames in order to remember the
5523 starting subject position for the group. */
5524
5525 case OP_KET:
5526 case OP_KETRMIN:
5527 case OP_KETRMAX:
5528 case OP_KETRPOS:
5529
5530 bracode = Fecode - GET(Fecode, 1);
5531
5532 /* Point N to the frame at the start of the most recent group.
5533 Remember the subject pointer at the start of the group. */
5534
5535 if (*bracode != OP_BRA && *bracode != OP_COND)
5536 {
5537 N = (heapframe *)((char *)mb->match_frames + Flast_group_offset);
5538 P = (heapframe *)((char *)N - frame_size);
5539 Flast_group_offset = P->last_group_offset;
5540
5541 #ifdef DEBUG_SHOW_RMATCH
5542 fprintf(stderr, "++ KET for frame=%d type=%x prev char offset=%lu\n",
5543 N->rdepth, N->group_frame_type,
5544 (char *)P->eptr - (char *)mb->start_subject);
5545 #endif
5546
5547 /* If we are at the end of an assertion that is a condition, return a
5548 match, discarding any intermediate backtracking points. Copy back the
5549 mark setting and the captures into the frame before N so that they are
5550 set on return. Doing this for all assertions, both positive and negative,
5551 seems to match what Perl does. */
5552
5553 if (GF_IDMASK(N->group_frame_type) == GF_CONDASSERT)
5554 {
5555 memcpy((char *)P + offsetof(heapframe, ovector), Fovector,
5556 Foffset_top * sizeof(PCRE2_SIZE));
5557 P->offset_top = Foffset_top;
5558 P->mark = Fmark;
5559 Fback_frame = (char *)F - (char *)P;
5560 RRETURN(MATCH_MATCH);
5561 }
5562 }
5563 else P = NULL; /* Indicates starting frame not recorded */
5564
5565 /* The group was not a conditional assertion. */
5566
5567 switch (*bracode)
5568 {
5569 case OP_BRA: /* No need to do anything for these */
5570 case OP_COND:
5571 case OP_SCOND:
5572 break;
5573
5574 /* Non-atomic positive assertions are like OP_BRA, except that the
5575 subject pointer must be put back to where it was at the start of the
5576 assertion. */
5577
5578 case OP_ASSERT_NA:
5579 case OP_ASSERTBACK_NA:
5580 if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr;
5581 Feptr = P->eptr;
5582 break;
5583
5584 /* Atomic positive assertions are like OP_ONCE, except that in addition
5585 the subject pointer must be put back to where it was at the start of the
5586 assertion. */
5587
5588 case OP_ASSERT:
5589 case OP_ASSERTBACK:
5590 if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr;
5591 Feptr = P->eptr;
5592 /* Fall through */
5593
5594 /* For an atomic group, discard internal backtracking points. We must
5595 also ensure that any remaining branches within the top-level of the group
5596 are not tried. Do this by adjusting the code pointer within the backtrack
5597 frame so that it points to the final branch. */
5598
5599 case OP_ONCE:
5600 Fback_frame = ((char *)F - (char *)P);
5601 for (;;)
5602 {
5603 uint32_t y = GET(P->ecode,1);
5604 if ((P->ecode)[y] != OP_ALT) break;
5605 P->ecode += y;
5606 }
5607 break;
5608
5609 /* A matching negative assertion returns MATCH, which is turned into
5610 NOMATCH at the assertion level. */
5611
5612 case OP_ASSERT_NOT:
5613 case OP_ASSERTBACK_NOT:
5614 RRETURN(MATCH_MATCH);
5615
5616 /* At the end of a script run, apply the script-checking rules. This code
5617 will never be exercised if Unicode support is not compiled, because in
5618 that environment script runs cause an error at compile time. */
5619
5620 case OP_SCRIPT_RUN:
5621 if (!PRIV(script_run)(P->eptr, Feptr, utf)) RRETURN(MATCH_NOMATCH);
5622 break;
5623
5624 /* Whole-pattern recursion is coded as a recurse into group 0, so it
5625 won't be picked up here. Instead, we catch it when the OP_END is reached.
5626 Other recursion is handled here. */
5627
5628 case OP_CBRA:
5629 case OP_CBRAPOS:
5630 case OP_SCBRA:
5631 case OP_SCBRAPOS:
5632 number = GET2(bracode, 1+LINK_SIZE);
5633
5634 /* Handle a recursively called group. We reinstate the previous set of
5635 captures and then carry on after the recursion call. */
5636
5637 if (Fcurrent_recurse == number)
5638 {
5639 P = (heapframe *)((char *)N - frame_size);
5640 memcpy((char *)F + offsetof(heapframe, ovector), P->ovector,
5641 P->offset_top * sizeof(PCRE2_SIZE));
5642 Foffset_top = P->offset_top;
5643 Fcapture_last = P->capture_last;
5644 Fcurrent_recurse = P->current_recurse;
5645 Fecode = P->ecode + 1 + LINK_SIZE;
5646 continue; /* With next opcode */
5647 }
5648
5649 /* Deal with actual capturing. */
5650
5651 offset = (number << 1) - 2;
5652 Fcapture_last = number;
5653 Fovector[offset] = P->eptr - mb->start_subject;
5654 Fovector[offset+1] = Feptr - mb->start_subject;
5655 if (offset >= Foffset_top) Foffset_top = offset + 2;
5656 break;
5657 } /* End actions relating to the starting opcode */
5658
5659 /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
5660 and return the MATCH_KETRPOS. This makes it possible to do the repeats one
5661 at a time from the outer level. This must precede the empty string test -
5662 in this case that test is done at the outer level. */
5663
5664 if (*Fecode == OP_KETRPOS)
5665 {
5666 memcpy((char *)P + offsetof(heapframe, eptr),
5667 (char *)F + offsetof(heapframe, eptr),
5668 frame_copy_size);
5669 RRETURN(MATCH_KETRPOS);
5670 }
5671
5672 /* Handle the different kinds of closing brackets. A non-repeating ket
5673 needs no special action, just continuing at this level. This also happens
5674 for the repeating kets if the group matched no characters, in order to
5675 forcibly break infinite loops. Otherwise, the repeating kets try the rest
5676 of the pattern or restart from the preceding bracket, in the appropriate
5677 order. */
5678
5679 if (Fop != OP_KET && (P == NULL || Feptr != P->eptr))
5680 {
5681 if (Fop == OP_KETRMIN)
5682 {
5683 RMATCH(Fecode + 1 + LINK_SIZE, RM6);
5684 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5685 Fecode -= GET(Fecode, 1);
5686 break; /* End of ket processing */
5687 }
5688
5689 /* Repeat the maximum number of times (KETRMAX) */
5690
5691 RMATCH(bracode, RM7);
5692 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5693 }
5694
5695 /* Carry on at this level for a non-repeating ket, or after matching an
5696 empty string, or after repeating for a maximum number of times. */
5697
5698 Fecode += 1 + LINK_SIZE;
5699 break;
5700
5701
5702 /* ===================================================================== */
5703 /* Start and end of line assertions, not multiline mode. */
5704
5705 case OP_CIRC: /* Start of line, unless PCRE2_NOTBOL is set. */
5706 if (Feptr != mb->start_subject || (mb->moptions & PCRE2_NOTBOL) != 0)
5707 RRETURN(MATCH_NOMATCH);
5708 Fecode++;
5709 break;
5710
5711 case OP_SOD: /* Unconditional start of subject */
5712 if (Feptr != mb->start_subject) RRETURN(MATCH_NOMATCH);
5713 Fecode++;
5714 break;
5715
5716 /* When PCRE2_NOTEOL is unset, assert before the subject end, or a
5717 terminating newline unless PCRE2_DOLLAR_ENDONLY is set. */
5718
5719 case OP_DOLL:
5720 if ((mb->moptions & PCRE2_NOTEOL) != 0) RRETURN(MATCH_NOMATCH);
5721 if ((mb->poptions & PCRE2_DOLLAR_ENDONLY) == 0) goto ASSERT_NL_OR_EOS;
5722
5723 /* Fall through */
5724 /* Unconditional end of subject assertion (\z) */
5725
5726 case OP_EOD:
5727 if (Feptr < mb->end_subject) RRETURN(MATCH_NOMATCH);
5728 if (mb->partial != 0)
5729 {
5730 mb->hitend = TRUE;
5731 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
5732 }
5733 Fecode++;
5734 break;
5735
5736 /* End of subject or ending \n assertion (\Z) */
5737
5738 case OP_EODN:
5739 ASSERT_NL_OR_EOS:
5740 if (Feptr < mb->end_subject &&
5741 (!IS_NEWLINE(Feptr) || Feptr != mb->end_subject - mb->nllen))
5742 {
5743 if (mb->partial != 0 &&
5744 Feptr + 1 >= mb->end_subject &&
5745 NLBLOCK->nltype == NLTYPE_FIXED &&
5746 NLBLOCK->nllen == 2 &&
5747 UCHAR21TEST(Feptr) == NLBLOCK->nl[0])
5748 {
5749 mb->hitend = TRUE;
5750 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
5751 }
5752 RRETURN(MATCH_NOMATCH);
5753 }
5754
5755 /* Either at end of string or \n before end. */
5756
5757 if (mb->partial != 0)
5758 {
5759 mb->hitend = TRUE;
5760 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
5761 }
5762 Fecode++;
5763 break;
5764
5765
5766 /* ===================================================================== */
5767 /* Start and end of line assertions, multiline mode. */
5768
5769 /* Start of subject unless notbol, or after any newline except for one at
5770 the very end, unless PCRE2_ALT_CIRCUMFLEX is set. */
5771
5772 case OP_CIRCM:
5773 if ((mb->moptions & PCRE2_NOTBOL) != 0 && Feptr == mb->start_subject)
5774 RRETURN(MATCH_NOMATCH);
5775 if (Feptr != mb->start_subject &&
5776 ((Feptr == mb->end_subject &&
5777 (mb->poptions & PCRE2_ALT_CIRCUMFLEX) == 0) ||
5778 !WAS_NEWLINE(Feptr)))
5779 RRETURN(MATCH_NOMATCH);
5780 Fecode++;
5781 break;
5782
5783 /* Assert before any newline, or before end of subject unless noteol is
5784 set. */
5785
5786 case OP_DOLLM:
5787 if (Feptr < mb->end_subject)
5788 {
5789 if (!IS_NEWLINE(Feptr))
5790 {
5791 if (mb->partial != 0 &&
5792 Feptr + 1 >= mb->end_subject &&
5793 NLBLOCK->nltype == NLTYPE_FIXED &&
5794 NLBLOCK->nllen == 2 &&
5795 UCHAR21TEST(Feptr) == NLBLOCK->nl[0])
5796 {
5797 mb->hitend = TRUE;
5798 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
5799 }
5800 RRETURN(MATCH_NOMATCH);
5801 }
5802 }
5803 else
5804 {
5805 if ((mb->moptions & PCRE2_NOTEOL) != 0) RRETURN(MATCH_NOMATCH);
5806 SCHECK_PARTIAL();
5807 }
5808 Fecode++;
5809 break;
5810
5811
5812 /* ===================================================================== */
5813 /* Start of match assertion */
5814
5815 case OP_SOM:
5816 if (Feptr != mb->start_subject + mb->start_offset) RRETURN(MATCH_NOMATCH);
5817 Fecode++;
5818 break;
5819
5820
5821 /* ===================================================================== */
5822 /* Reset the start of match point */
5823
5824 case OP_SET_SOM:
5825 Fstart_match = Feptr;
5826 Fecode++;
5827 break;
5828
5829
5830 /* ===================================================================== */
5831 /* Word boundary assertions. Find out if the previous and current
5832 characters are "word" characters. It takes a bit more work in UTF mode.
5833 Characters > 255 are assumed to be "non-word" characters when PCRE2_UCP is
5834 not set. When it is set, use Unicode properties if available, even when not
5835 in UTF mode. Remember the earliest and latest consulted characters. */
5836
5837 case OP_NOT_WORD_BOUNDARY:
5838 case OP_WORD_BOUNDARY:
5839 if (Feptr == mb->check_subject) prev_is_word = FALSE; else
5840 {
5841 PCRE2_SPTR lastptr = Feptr - 1;
5842 #ifdef SUPPORT_UNICODE
5843 if (utf)
5844 {
5845 BACKCHAR(lastptr);
5846 GETCHAR(fc, lastptr);
5847 }
5848 else
5849 #endif /* SUPPORT_UNICODE */
5850 fc = *lastptr;
5851 if (lastptr < mb->start_used_ptr) mb->start_used_ptr = lastptr;
5852 #ifdef SUPPORT_UNICODE
5853 if ((mb->poptions & PCRE2_UCP) != 0)
5854 {
5855 if (fc == '_') prev_is_word = TRUE; else
5856 {
5857 int cat = UCD_CATEGORY(fc);
5858 prev_is_word = (cat == ucp_L || cat == ucp_N);
5859 }
5860 }
5861 else
5862 #endif /* SUPPORT_UNICODE */
5863 prev_is_word = CHMAX_255(fc) && (mb->ctypes[fc] & ctype_word) != 0;
5864 }
5865
5866 /* Get status of next character */
5867
5868 if (Feptr >= mb->end_subject)
5869 {
5870 SCHECK_PARTIAL();
5871 cur_is_word = FALSE;
5872 }
5873 else
5874 {
5875 PCRE2_SPTR nextptr = Feptr + 1;
5876 #ifdef SUPPORT_UNICODE
5877 if (utf)
5878 {
5879 FORWARDCHARTEST(nextptr, mb->end_subject);
5880 GETCHAR(fc, Feptr);
5881 }
5882 else
5883 #endif /* SUPPORT_UNICODE */
5884 fc = *Feptr;
5885 if (nextptr > mb->last_used_ptr) mb->last_used_ptr = nextptr;
5886 #ifdef SUPPORT_UNICODE
5887 if ((mb->poptions & PCRE2_UCP) != 0)
5888 {
5889 if (fc == '_') cur_is_word = TRUE; else
5890 {
5891 int cat = UCD_CATEGORY(fc);
5892 cur_is_word = (cat == ucp_L || cat == ucp_N);
5893 }
5894 }
5895 else
5896 #endif /* SUPPORT_UNICODE */
5897 cur_is_word = CHMAX_255(fc) && (mb->ctypes[fc] & ctype_word) != 0;
5898 }
5899
5900 /* Now see if the situation is what we want */
5901
5902 if ((*Fecode++ == OP_WORD_BOUNDARY)?
5903 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
5904 RRETURN(MATCH_NOMATCH);
5905 break;
5906
5907
5908 /* ===================================================================== */
5909 /* Backtracking (*VERB)s, with and without arguments. Note that if the
5910 pattern is successfully matched, we do not come back from RMATCH. */
5911
5912 case OP_MARK:
5913 Fmark = mb->nomatch_mark = Fecode + 2;
5914 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM12);
5915
5916 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
5917 argument, and we must check whether that argument matches this MARK's
5918 argument. It is passed back in mb->verb_skip_ptr. If it does match, we
5919 return MATCH_SKIP with mb->verb_skip_ptr now pointing to the subject
5920 position that corresponds to this mark. Otherwise, pass back the return
5921 code unaltered. */
5922
5923 if (rrc == MATCH_SKIP_ARG &&
5924 PRIV(strcmp)(Fecode + 2, mb->verb_skip_ptr) == 0)
5925 {
5926 mb->verb_skip_ptr = Feptr; /* Pass back current position */
5927 RRETURN(MATCH_SKIP);
5928 }
5929 RRETURN(rrc);
5930
5931 case OP_FAIL:
5932 RRETURN(MATCH_NOMATCH);
5933
5934 /* Record the current recursing group number in mb->verb_current_recurse
5935 when a backtracking return such as MATCH_COMMIT is given. This enables the
5936 recurse processing to catch verbs from within the recursion. */
5937
5938 case OP_COMMIT:
5939 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM13);
5940 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5941 mb->verb_current_recurse = Fcurrent_recurse;
5942 RRETURN(MATCH_COMMIT);
5943
5944 case OP_COMMIT_ARG:
5945 Fmark = mb->nomatch_mark = Fecode + 2;
5946 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM36);
5947 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5948 mb->verb_current_recurse = Fcurrent_recurse;
5949 RRETURN(MATCH_COMMIT);
5950
5951 case OP_PRUNE:
5952 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM14);
5953 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5954 mb->verb_current_recurse = Fcurrent_recurse;
5955 RRETURN(MATCH_PRUNE);
5956
5957 case OP_PRUNE_ARG:
5958 Fmark = mb->nomatch_mark = Fecode + 2;
5959 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM15);
5960 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5961 mb->verb_current_recurse = Fcurrent_recurse;
5962 RRETURN(MATCH_PRUNE);
5963
5964 case OP_SKIP:
5965 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM16);
5966 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5967 mb->verb_skip_ptr = Feptr; /* Pass back current position */
5968 mb->verb_current_recurse = Fcurrent_recurse;
5969 RRETURN(MATCH_SKIP);
5970
5971 /* Note that, for Perl compatibility, SKIP with an argument does NOT set
5972 nomatch_mark. When a pattern match ends with a SKIP_ARG for which there was
5973 not a matching mark, we have to re-run the match, ignoring the SKIP_ARG
5974 that failed and any that precede it (either they also failed, or were not
5975 triggered). To do this, we maintain a count of executed SKIP_ARGs. If a
5976 SKIP_ARG gets to top level, the match is re-run with mb->ignore_skip_arg
5977 set to the count of the one that failed. */
5978
5979 case OP_SKIP_ARG:
5980 mb->skip_arg_count++;
5981 if (mb->skip_arg_count <= mb->ignore_skip_arg)
5982 {
5983 Fecode += PRIV(OP_lengths)[*Fecode] + Fecode[1];
5984 break;
5985 }
5986 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM17);
5987 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5988
5989 /* Pass back the current skip name and return the special MATCH_SKIP_ARG
5990 return code. This will either be caught by a matching MARK, or get to the
5991 top, where it causes a rematch with mb->ignore_skip_arg set to the value of
5992 mb->skip_arg_count. */
5993
5994 mb->verb_skip_ptr = Fecode + 2;
5995 mb->verb_current_recurse = Fcurrent_recurse;
5996 RRETURN(MATCH_SKIP_ARG);
5997
5998 /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that
5999 the branch in which it occurs can be determined. */
6000
6001 case OP_THEN:
6002 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM18);
6003 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6004 mb->verb_ecode_ptr = Fecode;
6005 mb->verb_current_recurse = Fcurrent_recurse;
6006 RRETURN(MATCH_THEN);
6007
6008 case OP_THEN_ARG:
6009 Fmark = mb->nomatch_mark = Fecode + 2;
6010 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM19);
6011 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6012 mb->verb_ecode_ptr = Fecode;
6013 mb->verb_current_recurse = Fcurrent_recurse;
6014 RRETURN(MATCH_THEN);
6015
6016
6017 /* ===================================================================== */
6018 /* There's been some horrible disaster. Arrival here can only mean there is
6019 something seriously wrong in the code above or the OP_xxx definitions. */
6020
6021 default:
6022 return PCRE2_ERROR_INTERNAL;
6023 }
6024
6025 /* Do not insert any code in here without much thought; it is assumed
6026 that "continue" in the code above comes out to here to repeat the main
6027 loop. */
6028
6029 } /* End of main loop */
6030 /* Control never reaches here */
6031
6032
6033 /* ========================================================================= */
6034 /* The RRETURN() macro jumps here. The number that is saved in Freturn_id
6035 indicates which label we actually want to return to. The value in Frdepth is
6036 the index number of the frame in the vector. The return value has been placed
6037 in rrc. */
6038
6039 #define LBL(val) case val: goto L_RM##val;
6040
6041 RETURN_SWITCH:
6042 if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr;
6043 if (Frdepth == 0) return rrc; /* Exit from the top level */
6044 F = (heapframe *)((char *)F - Fback_frame); /* Backtrack */
6045 mb->cb->callout_flags |= PCRE2_CALLOUT_BACKTRACK; /* Note for callouts */
6046
6047 #ifdef DEBUG_SHOW_RMATCH
6048 fprintf(stderr, "++ RETURN %d to %d\n", rrc, Freturn_id);
6049 #endif
6050
6051 switch (Freturn_id)
6052 {
6053 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
6054 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(16)
6055 LBL(17) LBL(18) LBL(19) LBL(20) LBL(21) LBL(22) LBL(23) LBL(24)
6056 LBL(25) LBL(26) LBL(27) LBL(28) LBL(29) LBL(30) LBL(31) LBL(32)
6057 LBL(33) LBL(34) LBL(35) LBL(36)
6058
6059 #ifdef SUPPORT_WIDE_CHARS
6060 LBL(100) LBL(101)
6061 #endif
6062
6063 #ifdef SUPPORT_UNICODE
6064 LBL(200) LBL(201) LBL(202) LBL(203) LBL(204) LBL(205) LBL(206)
6065 LBL(207) LBL(208) LBL(209) LBL(210) LBL(211) LBL(212) LBL(213)
6066 LBL(214) LBL(215) LBL(216) LBL(217) LBL(218) LBL(219) LBL(220)
6067 LBL(221) LBL(222)
6068 #endif
6069
6070 default:
6071 return PCRE2_ERROR_INTERNAL;
6072 }
6073 #undef LBL
6074 }
6075
6076
6077 /*************************************************
6078 * Match a Regular Expression *
6079 *************************************************/
6080
6081 /* This function applies a compiled pattern to a subject string and picks out
6082 portions of the string if it matches. Two elements in the vector are set for
6083 each substring: the offsets to the start and end of the substring.
6084
6085 Arguments:
6086 code points to the compiled expression
6087 subject points to the subject string
6088 length length of subject string (may contain binary zeros)
6089 start_offset where to start in the subject string
6090 options option bits
6091 match_data points to a match_data block
6092 mcontext points to a PCRE2 match context
6093
6094 Returns: > 0 => success; value is the number of ovector pairs filled
6095 = 0 => success, but ovector is not big enough
6096 = -1 => failed to match (PCRE2_ERROR_NOMATCH)
6097 = -2 => partial match (PCRE2_ERROR_PARTIAL)
6098 < -2 => some kind of unexpected problem
6099 */
6100
6101 PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
6102 pcre2_match(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length,
6103 PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data,
6104 pcre2_match_context *mcontext)
6105 {
6106 int rc;
6107 int was_zero_terminated = 0;
6108 const uint8_t *start_bits = NULL;
6109 const pcre2_real_code *re = (const pcre2_real_code *)code;
6110
6111 BOOL anchored;
6112 BOOL firstline;
6113 BOOL has_first_cu = FALSE;
6114 BOOL has_req_cu = FALSE;
6115 BOOL startline;
6116
6117 #if PCRE2_CODE_UNIT_WIDTH == 8
6118 BOOL memchr_not_found_first_cu;
6119 BOOL memchr_not_found_first_cu2;
6120 #endif
6121
6122 PCRE2_UCHAR first_cu = 0;
6123 PCRE2_UCHAR first_cu2 = 0;
6124 PCRE2_UCHAR req_cu = 0;
6125 PCRE2_UCHAR req_cu2 = 0;
6126
6127 PCRE2_SPTR bumpalong_limit;
6128 PCRE2_SPTR end_subject;
6129 PCRE2_SPTR true_end_subject;
6130 PCRE2_SPTR start_match = subject + start_offset;
6131 PCRE2_SPTR req_cu_ptr = start_match - 1;
6132 PCRE2_SPTR start_partial;
6133 PCRE2_SPTR match_partial;
6134
6135 #ifdef SUPPORT_JIT
6136 BOOL use_jit;
6137 #endif
6138
6139 /* This flag is needed even when Unicode is not supported for convenience
6140 (it is used by the IS_NEWLINE macro). */
6141
6142 BOOL utf = FALSE;
6143
6144 #ifdef SUPPORT_UNICODE
6145 BOOL ucp = FALSE;
6146 BOOL allow_invalid;
6147 uint32_t fragment_options = 0;
6148 #ifdef SUPPORT_JIT
6149 BOOL jit_checked_utf = FALSE;
6150 #endif
6151 #endif /* SUPPORT_UNICODE */
6152
6153 PCRE2_SIZE frame_size;
6154
6155 /* We need to have mb as a pointer to a match block, because the IS_NEWLINE
6156 macro is used below, and it expects NLBLOCK to be defined as a pointer. */
6157
6158 pcre2_callout_block cb;
6159 match_block actual_match_block;
6160 match_block *mb = &actual_match_block;
6161
6162 /* Allocate an initial vector of backtracking frames on the stack. If this
6163 proves to be too small, it is replaced by a larger one on the heap. To get a
6164 vector of the size required that is aligned for pointers, allocate it as a
6165 vector of pointers. */
6166
6167 PCRE2_SPTR stack_frames_vector[START_FRAMES_SIZE/sizeof(PCRE2_SPTR)]
6168 PCRE2_KEEP_UNINITIALIZED;
6169 mb->stack_frames = (heapframe *)stack_frames_vector;
6170
6171 /* A length equal to PCRE2_ZERO_TERMINATED implies a zero-terminated
6172 subject string. */
6173
6174 if (length == PCRE2_ZERO_TERMINATED)
6175 {
6176 length = PRIV(strlen)(subject);
6177 was_zero_terminated = 1;
6178 }
6179 true_end_subject = end_subject = subject + length;
6180
6181 /* Plausibility checks */
6182
6183 if ((options & ~PUBLIC_MATCH_OPTIONS) != 0) return PCRE2_ERROR_BADOPTION;
6184 if (code == NULL || subject == NULL || match_data == NULL)
6185 return PCRE2_ERROR_NULL;
6186 if (start_offset > length) return PCRE2_ERROR_BADOFFSET;
6187
6188 /* Check that the first field in the block is the magic number. */
6189
6190 if (re->magic_number != MAGIC_NUMBER) return PCRE2_ERROR_BADMAGIC;
6191
6192 /* Check the code unit width. */
6193
6194 if ((re->flags & PCRE2_MODE_MASK) != PCRE2_CODE_UNIT_WIDTH/8)
6195 return PCRE2_ERROR_BADMODE;
6196
6197 /* PCRE2_NOTEMPTY and PCRE2_NOTEMPTY_ATSTART are match-time flags in the
6198 options variable for this function. Users of PCRE2 who are not calling the
6199 function directly would like to have a way of setting these flags, in the same
6200 way that they can set pcre2_compile() flags like PCRE2_NO_AUTOPOSSESS with
6201 constructions like (*NO_AUTOPOSSESS). To enable this, (*NOTEMPTY) and
6202 (*NOTEMPTY_ATSTART) set bits in the pattern's "flags" field, which we now
6203 transfer to the options for this function. The bits are guaranteed to be
6204 adjacent, but do not have the same values. This bit of Boolean trickery assumes
6205 that the match-time bits are not more significant than the flag bits. If by
6206 accident this is not the case, a compile-time division by zero error will
6207 occur. */
6208
6209 #define FF (PCRE2_NOTEMPTY_SET|PCRE2_NE_ATST_SET)
6210 #define OO (PCRE2_NOTEMPTY|PCRE2_NOTEMPTY_ATSTART)
6211 options |= (re->flags & FF) / ((FF & (~FF+1)) / (OO & (~OO+1)));
6212 #undef FF
6213 #undef OO
6214
6215 /* If the pattern was successfully studied with JIT support, we will run the
6216 JIT executable instead of the rest of this function. Most options must be set
6217 at compile time for the JIT code to be usable. */
6218
6219 #ifdef SUPPORT_JIT
6220 use_jit = (re->executable_jit != NULL &&
6221 (options & ~PUBLIC_JIT_MATCH_OPTIONS) == 0);
6222 #endif
6223
6224 /* Initialize UTF/UCP parameters. */
6225
6226 #ifdef SUPPORT_UNICODE
6227 utf = (re->overall_options & PCRE2_UTF) != 0;
6228 allow_invalid = (re->overall_options & PCRE2_MATCH_INVALID_UTF) != 0;
6229 ucp = (re->overall_options & PCRE2_UCP) != 0;
6230 #endif /* SUPPORT_UNICODE */
6231
6232 /* Convert the partial matching flags into an integer. */
6233
6234 mb->partial = ((options & PCRE2_PARTIAL_HARD) != 0)? 2 :
6235 ((options & PCRE2_PARTIAL_SOFT) != 0)? 1 : 0;
6236
6237 /* Partial matching and PCRE2_ENDANCHORED are currently not allowed at the same
6238 time. */
6239
6240 if (mb->partial != 0 &&
6241 ((re->overall_options | options) & PCRE2_ENDANCHORED) != 0)
6242 return PCRE2_ERROR_BADOPTION;
6243
6244 /* It is an error to set an offset limit without setting the flag at compile
6245 time. */
6246
6247 if (mcontext != NULL && mcontext->offset_limit != PCRE2_UNSET &&
6248 (re->overall_options & PCRE2_USE_OFFSET_LIMIT) == 0)
6249 return PCRE2_ERROR_BADOFFSETLIMIT;
6250
6251 /* If the match data block was previously used with PCRE2_COPY_MATCHED_SUBJECT,
6252 free the memory that was obtained. Set the field to NULL for no match cases. */
6253
6254 if ((match_data->flags & PCRE2_MD_COPIED_SUBJECT) != 0)
6255 {
6256 match_data->memctl.free((void *)match_data->subject,
6257 match_data->memctl.memory_data);
6258 match_data->flags &= ~PCRE2_MD_COPIED_SUBJECT;
6259 }
6260 match_data->subject = NULL;
6261
6262 /* Zero the error offset in case the first code unit is invalid UTF. */
6263
6264 match_data->startchar = 0;
6265
6266
6267 /* ============================= JIT matching ============================== */
6268
6269 /* Prepare for JIT matching. Check a UTF string for validity unless no check is
6270 requested or invalid UTF can be handled. We check only the portion of the
6271 subject that might be inspected during matching - from the offset minus the
6272 maximum lookbehind to the given length. This saves time when a small part of a
6273 large subject is being matched by the use of a starting offset. Note that the
6274 maximum lookbehind is a number of characters, not code units. */
6275
6276 #ifdef SUPPORT_JIT
6277 if (use_jit)
6278 {
6279 #ifdef SUPPORT_UNICODE
6280 if (utf && (options & PCRE2_NO_UTF_CHECK) == 0 && !allow_invalid)
6281 {
6282 #if PCRE2_CODE_UNIT_WIDTH != 32
6283 unsigned int i;
6284 #endif
6285
6286 /* For 8-bit and 16-bit UTF, check that the first code unit is a valid
6287 character start. */
6288
6289 #if PCRE2_CODE_UNIT_WIDTH != 32
6290 if (start_match < end_subject && NOT_FIRSTCU(*start_match))
6291 {
6292 if (start_offset > 0) return PCRE2_ERROR_BADUTFOFFSET;
6293 #if PCRE2_CODE_UNIT_WIDTH == 8
6294 return PCRE2_ERROR_UTF8_ERR20; /* Isolated 0x80 byte */
6295 #else
6296 return PCRE2_ERROR_UTF16_ERR3; /* Isolated low surrogate */
6297 #endif
6298 }
6299 #endif /* WIDTH != 32 */
6300
6301 /* Move back by the maximum lookbehind, just in case it happens at the very
6302 start of matching. */
6303
6304 #if PCRE2_CODE_UNIT_WIDTH != 32
6305 for (i = re->max_lookbehind; i > 0 && start_match > subject; i--)
6306 {
6307 start_match--;
6308 while (start_match > subject &&
6309 #if PCRE2_CODE_UNIT_WIDTH == 8
6310 (*start_match & 0xc0) == 0x80)
6311 #else /* 16-bit */
6312 (*start_match & 0xfc00) == 0xdc00)
6313 #endif
6314 start_match--;
6315 }
6316 #else /* PCRE2_CODE_UNIT_WIDTH != 32 */
6317
6318 /* In the 32-bit library, one code unit equals one character. However,
6319 we cannot just subtract the lookbehind and then compare pointers, because
6320 a very large lookbehind could create an invalid pointer. */
6321
6322 if (start_offset >= re->max_lookbehind)
6323 start_match -= re->max_lookbehind;
6324 else
6325 start_match = subject;
6326 #endif /* PCRE2_CODE_UNIT_WIDTH != 32 */
6327
6328 /* Validate the relevant portion of the subject. Adjust the offset of an
6329 invalid code point to be an absolute offset in the whole string. */
6330
6331 match_data->rc = PRIV(valid_utf)(start_match,
6332 length - (start_match - subject), &(match_data->startchar));
6333 if (match_data->rc != 0)
6334 {
6335 match_data->startchar += start_match - subject;
6336 return match_data->rc;
6337 }
6338 jit_checked_utf = TRUE;
6339 }
6340 #endif /* SUPPORT_UNICODE */
6341
6342 /* If JIT returns BADOPTION, which means that the selected complete or
6343 partial matching mode was not compiled, fall through to the interpreter. */
6344
6345 rc = pcre2_jit_match(code, subject, length, start_offset, options,
6346 match_data, mcontext);
6347 if (rc != PCRE2_ERROR_JIT_BADOPTION)
6348 {
6349 if (rc >= 0 && (options & PCRE2_COPY_MATCHED_SUBJECT) != 0)
6350 {
6351 length = CU2BYTES(length + was_zero_terminated);
6352 match_data->subject = match_data->memctl.malloc(length,
6353 match_data->memctl.memory_data);
6354 if (match_data->subject == NULL) return PCRE2_ERROR_NOMEMORY;
6355 memcpy((void *)match_data->subject, subject, length);
6356 match_data->flags |= PCRE2_MD_COPIED_SUBJECT;
6357 }
6358 return rc;
6359 }
6360 }
6361 #endif /* SUPPORT_JIT */
6362
6363 /* ========================= End of JIT matching ========================== */
6364
6365
6366 /* Proceed with non-JIT matching. The default is to allow lookbehinds to the
6367 start of the subject. A UTF check when there is a non-zero offset may change
6368 this. */
6369
6370 mb->check_subject = subject;
6371
6372 /* If a UTF subject string was not checked for validity in the JIT code above,
6373 check it here, and handle support for invalid UTF strings. The check above
6374 happens only when invalid UTF is not supported and PCRE2_NO_UTF_CHECK is unset.
6375 If we get here in those circumstances, it means the subject string is valid,
6376 but for some reason JIT matching was not successful. There is no need to check
6377 the subject again.
6378
6379 We check only the portion of the subject that might be inspected during
6380 matching - from the offset minus the maximum lookbehind to the given length.
6381 This saves time when a small part of a large subject is being matched by the
6382 use of a starting offset. Note that the maximum lookbehind is a number of
6383 characters, not code units.
6384
6385 Note also that support for invalid UTF forces a check, overriding the setting
6386 of PCRE2_NO_UTF_CHECK. */
6387
6388 #ifdef SUPPORT_UNICODE
6389 if (utf &&
6390 #ifdef SUPPORT_JIT
6391 !jit_checked_utf &&
6392 #endif
6393 ((options & PCRE2_NO_UTF_CHECK) == 0 || allow_invalid))
6394 {
6395 #if PCRE2_CODE_UNIT_WIDTH != 32
6396 BOOL skipped_bad_start = FALSE;
6397 #endif
6398
6399 /* For 8-bit and 16-bit UTF, check that the first code unit is a valid
6400 character start. If we are handling invalid UTF, just skip over such code
6401 units. Otherwise, give an appropriate error. */
6402
6403 #if PCRE2_CODE_UNIT_WIDTH != 32
6404 if (allow_invalid)
6405 {
6406 while (start_match < end_subject && NOT_FIRSTCU(*start_match))
6407 {
6408 start_match++;
6409 skipped_bad_start = TRUE;
6410 }
6411 }
6412 else if (start_match < end_subject && NOT_FIRSTCU(*start_match))
6413 {
6414 if (start_offset > 0) return PCRE2_ERROR_BADUTFOFFSET;
6415 #if PCRE2_CODE_UNIT_WIDTH == 8
6416 return PCRE2_ERROR_UTF8_ERR20; /* Isolated 0x80 byte */
6417 #else
6418 return PCRE2_ERROR_UTF16_ERR3; /* Isolated low surrogate */
6419 #endif
6420 }
6421 #endif /* WIDTH != 32 */
6422
6423 /* The mb->check_subject field points to the start of UTF checking;
6424 lookbehinds can go back no further than this. */
6425
6426 mb->check_subject = start_match;
6427
6428 /* Move back by the maximum lookbehind, just in case it happens at the very
6429 start of matching, but don't do this if we skipped bad 8-bit or 16-bit code
6430 units above. */
6431
6432 #if PCRE2_CODE_UNIT_WIDTH != 32
6433 if (!skipped_bad_start)
6434 {
6435 unsigned int i;
6436 for (i = re->max_lookbehind; i > 0 && mb->check_subject > subject; i--)
6437 {
6438 mb->check_subject--;
6439 while (mb->check_subject > subject &&
6440 #if PCRE2_CODE_UNIT_WIDTH == 8
6441 (*mb->check_subject & 0xc0) == 0x80)
6442 #else /* 16-bit */
6443 (*mb->check_subject & 0xfc00) == 0xdc00)
6444 #endif
6445 mb->check_subject--;
6446 }
6447 }
6448 #else /* PCRE2_CODE_UNIT_WIDTH != 32 */
6449
6450 /* In the 32-bit library, one code unit equals one character. However,
6451 we cannot just subtract the lookbehind and then compare pointers, because
6452 a very large lookbehind could create an invalid pointer. */
6453
6454 if (start_offset >= re->max_lookbehind)
6455 mb->check_subject -= re->max_lookbehind;
6456 else
6457 mb->check_subject = subject;
6458 #endif /* PCRE2_CODE_UNIT_WIDTH != 32 */
6459
6460 /* Validate the relevant portion of the subject. There's a loop in case we
6461 encounter bad UTF in the characters preceding start_match which we are
6462 scanning because of a lookbehind. */
6463
6464 for (;;)
6465 {
6466 match_data->rc = PRIV(valid_utf)(mb->check_subject,
6467 length - (mb->check_subject - subject), &(match_data->startchar));
6468
6469 if (match_data->rc == 0) break; /* Valid UTF string */
6470
6471 /* Invalid UTF string. Adjust the offset to be an absolute offset in the
6472 whole string. If we are handling invalid UTF strings, set end_subject to
6473 stop before the bad code unit, and set the options to "not end of line".
6474 Otherwise return the error. */
6475
6476 match_data->startchar += mb->check_subject - subject;
6477 if (!allow_invalid || match_data->rc > 0) return match_data->rc;
6478 end_subject = subject + match_data->startchar;
6479
6480 /* If the end precedes start_match, it means there is invalid UTF in the
6481 extra code units we reversed over because of a lookbehind. Advance past the
6482 first bad code unit, and then skip invalid character starting code units in
6483 8-bit and 16-bit modes, and try again. */
6484
6485 if (end_subject < start_match)
6486 {
6487 mb->check_subject = end_subject + 1;
6488 #if PCRE2_CODE_UNIT_WIDTH != 32
6489 while (mb->check_subject < start_match && NOT_FIRSTCU(*mb->check_subject))
6490 mb->check_subject++;
6491 #endif
6492 }
6493
6494 /* Otherwise, set the not end of line option, and do the match. */
6495
6496 else
6497 {
6498 fragment_options = PCRE2_NOTEOL;
6499 break;
6500 }
6501 }
6502 }
6503 #endif /* SUPPORT_UNICODE */
6504
6505 /* A NULL match context means "use a default context", but we take the memory
6506 control functions from the pattern. */
6507
6508 if (mcontext == NULL)
6509 {
6510 mcontext = (pcre2_match_context *)(&PRIV(default_match_context));
6511 mb->memctl = re->memctl;
6512 }
6513 else mb->memctl = mcontext->memctl;
6514
6515 anchored = ((re->overall_options | options) & PCRE2_ANCHORED) != 0;
6516 firstline = (re->overall_options & PCRE2_FIRSTLINE) != 0;
6517 startline = (re->flags & PCRE2_STARTLINE) != 0;
6518 bumpalong_limit = (mcontext->offset_limit == PCRE2_UNSET)?
6519 true_end_subject : subject + mcontext->offset_limit;
6520
6521 /* Initialize and set up the fixed fields in the callout block, with a pointer
6522 in the match block. */
6523
6524 mb->cb = &cb;
6525 cb.version = 2;
6526 cb.subject = subject;
6527 cb.subject_length = (PCRE2_SIZE)(end_subject - subject);
6528 cb.callout_flags = 0;
6529
6530 /* Fill in the remaining fields in the match block, except for moptions, which
6531 gets set later. */
6532
6533 mb->callout = mcontext->callout;
6534 mb->callout_data = mcontext->callout_data;
6535
6536 mb->start_subject = subject;
6537 mb->start_offset = start_offset;
6538 mb->end_subject = end_subject;
6539 mb->hasthen = (re->flags & PCRE2_HASTHEN) != 0;
6540 mb->allowemptypartial = (re->max_lookbehind > 0) ||
6541 (re->flags & PCRE2_MATCH_EMPTY) != 0;
6542 mb->poptions = re->overall_options; /* Pattern options */
6543 mb->ignore_skip_arg = 0;
6544 mb->mark = mb->nomatch_mark = NULL; /* In case never set */
6545
6546 /* The name table is needed for finding all the numbers associated with a
6547 given name, for condition testing. The code follows the name table. */
6548
6549 mb->name_table = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code));
6550 mb->name_count = re->name_count;
6551 mb->name_entry_size = re->name_entry_size;
6552 mb->start_code = mb->name_table + re->name_count * re->name_entry_size;
6553
6554 /* Process the \R and newline settings. */
6555
6556 mb->bsr_convention = re->bsr_convention;
6557 mb->nltype = NLTYPE_FIXED;
6558 switch(re->newline_convention)
6559 {
6560 case PCRE2_NEWLINE_CR:
6561 mb->nllen = 1;
6562 mb->nl[0] = CHAR_CR;
6563 break;
6564
6565 case PCRE2_NEWLINE_LF:
6566 mb->nllen = 1;
6567 mb->nl[0] = CHAR_NL;
6568 break;
6569
6570 case PCRE2_NEWLINE_NUL:
6571 mb->nllen = 1;
6572 mb->nl[0] = CHAR_NUL;
6573 break;
6574
6575 case PCRE2_NEWLINE_CRLF:
6576 mb->nllen = 2;
6577 mb->nl[0] = CHAR_CR;
6578 mb->nl[1] = CHAR_NL;
6579 break;
6580
6581 case PCRE2_NEWLINE_ANY:
6582 mb->nltype = NLTYPE_ANY;
6583 break;
6584
6585 case PCRE2_NEWLINE_ANYCRLF:
6586 mb->nltype = NLTYPE_ANYCRLF;
6587 break;
6588
6589 default: return PCRE2_ERROR_INTERNAL;
6590 }
6591
6592 /* The backtracking frames have fixed data at the front, and a PCRE2_SIZE
6593 vector at the end, whose size depends on the number of capturing parentheses in
6594 the pattern. It is not used at all if there are no capturing parentheses.
6595
6596 frame_size is the total size of each frame
6597 mb->frame_vector_size is the total usable size of the vector (rounded down
6598 to a whole number of frames)
6599
6600 The last of these is changed within the match() function if the frame vector
6601 has to be expanded. We therefore put it into the match block so that it is
6602 correct when calling match() more than once for non-anchored patterns. */
6603
6604 frame_size = offsetof(heapframe, ovector) +
6605 re->top_bracket * 2 * sizeof(PCRE2_SIZE);
6606
6607 /* Limits set in the pattern override the match context only if they are
6608 smaller. */
6609
6610 mb->heap_limit = (mcontext->heap_limit < re->limit_heap)?
6611 mcontext->heap_limit : re->limit_heap;
6612
6613 mb->match_limit = (mcontext->match_limit < re->limit_match)?
6614 mcontext->match_limit : re->limit_match;
6615
6616 mb->match_limit_depth = (mcontext->depth_limit < re->limit_depth)?
6617 mcontext->depth_limit : re->limit_depth;
6618
6619 /* If a pattern has very many capturing parentheses, the frame size may be very
6620 large. Ensure that there are at least 10 available frames by getting an initial
6621 vector on the heap if necessary, except when the heap limit prevents this. Get
6622 fewer if possible. (The heap limit is in kibibytes.) */
6623
6624 if (frame_size <= START_FRAMES_SIZE/10)
6625 {
6626 mb->match_frames = mb->stack_frames; /* Initial frame vector on the stack */
6627 mb->frame_vector_size = ((START_FRAMES_SIZE/frame_size) * frame_size);
6628 }
6629 else
6630 {
6631 mb->frame_vector_size = frame_size * 10;
6632 if ((mb->frame_vector_size / 1024) > mb->heap_limit)
6633 {
6634 if (frame_size > mb->heap_limit * 1024) return PCRE2_ERROR_HEAPLIMIT;
6635 mb->frame_vector_size = ((mb->heap_limit * 1024)/frame_size) * frame_size;
6636 }
6637 mb->match_frames = mb->memctl.malloc(mb->frame_vector_size,
6638 mb->memctl.memory_data);
6639 if (mb->match_frames == NULL) return PCRE2_ERROR_NOMEMORY;
6640 }
6641
6642 mb->match_frames_top =
6643 (heapframe *)((char *)mb->match_frames + mb->frame_vector_size);
6644
6645 /* Write to the ovector within the first frame to mark every capture unset and
6646 to avoid uninitialized memory read errors when it is copied to a new frame. */
6647
6648 memset((char *)(mb->match_frames) + offsetof(heapframe, ovector), 0xff,
6649 re->top_bracket * 2 * sizeof(PCRE2_SIZE));
6650
6651 /* Pointers to the individual character tables */
6652
6653 mb->lcc = re->tables + lcc_offset;
6654 mb->fcc = re->tables + fcc_offset;
6655 mb->ctypes = re->tables + ctypes_offset;
6656
6657 /* Set up the first code unit to match, if available. If there's no first code
6658 unit there may be a bitmap of possible first characters. */
6659
6660 if ((re->flags & PCRE2_FIRSTSET) != 0)
6661 {
6662 has_first_cu = TRUE;
6663 first_cu = first_cu2 = (PCRE2_UCHAR)(re->first_codeunit);
6664 if ((re->flags & PCRE2_FIRSTCASELESS) != 0)
6665 {
6666 first_cu2 = TABLE_GET(first_cu, mb->fcc, first_cu);
6667 #ifdef SUPPORT_UNICODE
6668 #if PCRE2_CODE_UNIT_WIDTH == 8
6669 if (first_cu > 127 && ucp && !utf) first_cu2 = UCD_OTHERCASE(first_cu);
6670 #else
6671 if (first_cu > 127 && (utf || ucp)) first_cu2 = UCD_OTHERCASE(first_cu);
6672 #endif
6673 #endif /* SUPPORT_UNICODE */
6674 }
6675 }
6676 else
6677 if (!startline && (re->flags & PCRE2_FIRSTMAPSET) != 0)
6678 start_bits = re->start_bitmap;
6679
6680 /* There may also be a "last known required character" set. */
6681
6682 if ((re->flags & PCRE2_LASTSET) != 0)
6683 {
6684 has_req_cu = TRUE;
6685 req_cu = req_cu2 = (PCRE2_UCHAR)(re->last_codeunit);
6686 if ((re->flags & PCRE2_LASTCASELESS) != 0)
6687 {
6688 req_cu2 = TABLE_GET(req_cu, mb->fcc, req_cu);
6689 #ifdef SUPPORT_UNICODE
6690 #if PCRE2_CODE_UNIT_WIDTH == 8
6691 if (req_cu > 127 && ucp && !utf) req_cu2 = UCD_OTHERCASE(req_cu);
6692 #else
6693 if (req_cu > 127 && (utf || ucp)) req_cu2 = UCD_OTHERCASE(req_cu);
6694 #endif
6695 #endif /* SUPPORT_UNICODE */
6696 }
6697 }
6698
6699
6700 /* ==========================================================================*/
6701
6702 /* Loop for handling unanchored repeated matching attempts; for anchored regexs
6703 the loop runs just once. */
6704
6705 #ifdef SUPPORT_UNICODE
6706 FRAGMENT_RESTART:
6707 #endif
6708
6709 start_partial = match_partial = NULL;
6710 mb->hitend = FALSE;
6711
6712 #if PCRE2_CODE_UNIT_WIDTH == 8
6713 memchr_not_found_first_cu = FALSE;
6714 memchr_not_found_first_cu2 = FALSE;
6715 #endif
6716
6717 for(;;)
6718 {
6719 PCRE2_SPTR new_start_match;
6720
6721 /* ----------------- Start of match optimizations ---------------- */
6722
6723 /* There are some optimizations that avoid running the match if a known
6724 starting point is not found, or if a known later code unit is not present.
6725 However, there is an option (settable at compile time) that disables these,
6726 for testing and for ensuring that all callouts do actually occur. */
6727
6728 if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0)
6729 {
6730 /* If firstline is TRUE, the start of the match is constrained to the first
6731 line of a multiline string. That is, the match must be before or at the
6732 first newline following the start of matching. Temporarily adjust
6733 end_subject so that we stop the scans for a first code unit at a newline.
6734 If the match fails at the newline, later code breaks the loop. */
6735
6736 if (firstline)
6737 {
6738 PCRE2_SPTR t = start_match;
6739 #ifdef SUPPORT_UNICODE
6740 if (utf)
6741 {
6742 while (t < end_subject && !IS_NEWLINE(t))
6743 {
6744 t++;
6745 ACROSSCHAR(t < end_subject, t, t++);
6746 }
6747 }
6748 else
6749 #endif
6750 while (t < end_subject && !IS_NEWLINE(t)) t++;
6751 end_subject = t;
6752 }
6753
6754 /* Anchored: check the first code unit if one is recorded. This may seem
6755 pointless but it can help in detecting a no match case without scanning for
6756 the required code unit. */
6757
6758 if (anchored)
6759 {
6760 if (has_first_cu || start_bits != NULL)
6761 {
6762 BOOL ok = start_match < end_subject;
6763 if (ok)
6764 {
6765 PCRE2_UCHAR c = UCHAR21TEST(start_match);
6766 ok = has_first_cu && (c == first_cu || c == first_cu2);
6767 if (!ok && start_bits != NULL)
6768 {
6769 #if PCRE2_CODE_UNIT_WIDTH != 8
6770 if (c > 255) c = 255;
6771 #endif
6772 ok = (start_bits[c/8] & (1u << (c&7))) != 0;
6773 }
6774 }
6775 if (!ok)
6776 {
6777 rc = MATCH_NOMATCH;
6778 break;
6779 }
6780 }
6781 }
6782
6783 /* Not anchored. Advance to a unique first code unit if there is one. In
6784 8-bit mode, the use of memchr() gives a big speed up, even though we have
6785 to call it twice in caseless mode, in order to find the earliest occurrence
6786 of the character in either of its cases. If a call to memchr() that
6787 searches the rest of the subject fails to find one case, remember that in
6788 order not to keep on repeating the search. This can make a huge difference
6789 when the strings are very long and only one case is present. */
6790
6791 else
6792 {
6793 if (has_first_cu)
6794 {
6795 if (first_cu != first_cu2) /* Caseless */
6796 {
6797 #if PCRE2_CODE_UNIT_WIDTH != 8
6798 PCRE2_UCHAR smc;
6799 while (start_match < end_subject &&
6800 (smc = UCHAR21TEST(start_match)) != first_cu &&
6801 smc != first_cu2)
6802 start_match++;
6803
6804 #else /* 8-bit code units */
6805 PCRE2_SPTR pp1 = NULL;
6806 PCRE2_SPTR pp2 = NULL;
6807 PCRE2_SIZE cu2size = end_subject - start_match;
6808
6809 if (!memchr_not_found_first_cu)
6810 {
6811 pp1 = memchr(start_match, first_cu, end_subject - start_match);
6812 if (pp1 == NULL) memchr_not_found_first_cu = TRUE;
6813 else cu2size = pp1 - start_match;
6814 }
6815
6816 /* If pp1 is not NULL, we have arranged to search only as far as pp1,
6817 to see if the other case is earlier, so we can set "not found" only
6818 when both searches have returned NULL. */
6819
6820 if (!memchr_not_found_first_cu2)
6821 {
6822 pp2 = memchr(start_match, first_cu2, cu2size);
6823 memchr_not_found_first_cu2 = (pp2 == NULL && pp1 == NULL);
6824 }
6825
6826 if (pp1 == NULL)
6827 start_match = (pp2 == NULL)? end_subject : pp2;
6828 else
6829 start_match = (pp2 == NULL || pp1 < pp2)? pp1 : pp2;
6830 #endif
6831 }
6832
6833 /* The caseful case */
6834
6835 else
6836 {
6837 #if PCRE2_CODE_UNIT_WIDTH != 8
6838 while (start_match < end_subject && UCHAR21TEST(start_match) !=
6839 first_cu)
6840 start_match++;
6841 #else
6842 start_match = memchr(start_match, first_cu, end_subject - start_match);
6843 if (start_match == NULL) start_match = end_subject;
6844 #endif
6845 }
6846
6847 /* If we can't find the required first code unit, having reached the
6848 true end of the subject, break the bumpalong loop, to force a match
6849 failure, except when doing partial matching, when we let the next cycle
6850 run at the end of the subject. To see why, consider the pattern
6851 /(?<=abc)def/, which partially matches "abc", even though the string
6852 does not contain the starting character "d". If we have not reached the
6853 true end of the subject (PCRE2_FIRSTLINE caused end_subject to be
6854 temporarily modified) we also let the cycle run, because the matching
6855 string is legitimately allowed to start with the first code unit of a
6856 newline. */
6857
6858 if (mb->partial == 0 && start_match >= mb->end_subject)
6859 {
6860 rc = MATCH_NOMATCH;
6861 break;
6862 }
6863 }
6864
6865 /* If there's no first code unit, advance to just after a linebreak for a
6866 multiline match if required. */
6867
6868 else if (startline)
6869 {
6870 if (start_match > mb->start_subject + start_offset)
6871 {
6872 #ifdef SUPPORT_UNICODE
6873 if (utf)
6874 {
6875 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6876 {
6877 start_match++;
6878 ACROSSCHAR(start_match < end_subject, start_match, start_match++);
6879 }
6880 }
6881 else
6882 #endif
6883 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6884 start_match++;
6885
6886 /* If we have just passed a CR and the newline option is ANY or
6887 ANYCRLF, and we are now at a LF, advance the match position by one
6888 more code unit. */
6889
6890 if (start_match[-1] == CHAR_CR &&
6891 (mb->nltype == NLTYPE_ANY || mb->nltype == NLTYPE_ANYCRLF) &&
6892 start_match < end_subject &&
6893 UCHAR21TEST(start_match) == CHAR_NL)
6894 start_match++;
6895 }
6896 }
6897
6898 /* If there's no first code unit or a requirement for a multiline line
6899 start, advance to a non-unique first code unit if any have been
6900 identified. The bitmap contains only 256 bits. When code units are 16 or
6901 32 bits wide, all code units greater than 254 set the 255 bit. */
6902
6903 else if (start_bits != NULL)
6904 {
6905 while (start_match < end_subject)
6906 {
6907 uint32_t c = UCHAR21TEST(start_match);
6908 #if PCRE2_CODE_UNIT_WIDTH != 8
6909 if (c > 255) c = 255;
6910 #endif
6911 if ((start_bits[c/8] & (1u << (c&7))) != 0) break;
6912 start_match++;
6913 }
6914
6915 /* See comment above in first_cu checking about the next few lines. */
6916
6917 if (mb->partial == 0 && start_match >= mb->end_subject)
6918 {
6919 rc = MATCH_NOMATCH;
6920 break;
6921 }
6922 }
6923 } /* End first code unit handling */
6924
6925 /* Restore fudged end_subject */
6926
6927 end_subject = mb->end_subject;
6928
6929 /* The following two optimizations must be disabled for partial matching. */
6930
6931 if (mb->partial == 0)
6932 {
6933 PCRE2_SPTR p;
6934
6935 /* The minimum matching length is a lower bound; no string of that length
6936 may actually match the pattern. Although the value is, strictly, in
6937 characters, we treat it as code units to avoid spending too much time in
6938 this optimization. */
6939
6940 if (end_subject - start_match < re->minlength)
6941 {
6942 rc = MATCH_NOMATCH;
6943 break;
6944 }
6945
6946 /* If req_cu is set, we know that that code unit must appear in the
6947 subject for the (non-partial) match to succeed. If the first code unit is
6948 set, req_cu must be later in the subject; otherwise the test starts at
6949 the match point. This optimization can save a huge amount of backtracking
6950 in patterns with nested unlimited repeats that aren't going to match.
6951 Writing separate code for caseful/caseless versions makes it go faster,
6952 as does using an autoincrement and backing off on a match. As in the case
6953 of the first code unit, using memchr() in the 8-bit library gives a big
6954 speed up. Unlike the first_cu check above, we do not need to call
6955 memchr() twice in the caseless case because we only need to check for the
6956 presence of the character in either case, not find the first occurrence.
6957
6958 The search can be skipped if the code unit was found later than the
6959 current starting point in a previous iteration of the bumpalong loop.
6960
6961 HOWEVER: when the subject string is very, very long, searching to its end
6962 can take a long time, and give bad performance on quite ordinary
6963 anchored patterns. This showed up when somebody was matching something
6964 like /^\d+C/ on a 32-megabyte string... so we don't do this when the
6965 string is sufficiently long, but it's worth searching a lot more for
6966 unanchored patterns. */
6967
6968 p = start_match + (has_first_cu? 1:0);
6969 if (has_req_cu && p > req_cu_ptr)
6970 {
6971 PCRE2_SIZE check_length = end_subject - start_match;
6972
6973 if (check_length < REQ_CU_MAX ||
6974 (!anchored && check_length < REQ_CU_MAX * 1000))
6975 {
6976 if (req_cu != req_cu2) /* Caseless */
6977 {
6978 #if PCRE2_CODE_UNIT_WIDTH != 8
6979 while (p < end_subject)
6980 {
6981 uint32_t pp = UCHAR21INCTEST(p);
6982 if (pp == req_cu || pp == req_cu2) { p--; break; }
6983 }
6984 #else /* 8-bit code units */
6985 PCRE2_SPTR pp = p;
6986 p = memchr(pp, req_cu, end_subject - pp);
6987 if (p == NULL)
6988 {
6989 p = memchr(pp, req_cu2, end_subject - pp);
6990 if (p == NULL) p = end_subject;
6991 }
6992 #endif /* PCRE2_CODE_UNIT_WIDTH != 8 */
6993 }
6994
6995 /* The caseful case */
6996
6997 else
6998 {
6999 #if PCRE2_CODE_UNIT_WIDTH != 8
7000 while (p < end_subject)
7001 {
7002 if (UCHAR21INCTEST(p) == req_cu) { p--; break; }
7003 }
7004
7005 #else /* 8-bit code units */
7006 p = memchr(p, req_cu, end_subject - p);
7007 if (p == NULL) p = end_subject;
7008 #endif
7009 }
7010
7011 /* If we can't find the required code unit, break the bumpalong loop,
7012 forcing a match failure. */
7013
7014 if (p >= end_subject)
7015 {
7016 rc = MATCH_NOMATCH;
7017 break;
7018 }
7019
7020 /* If we have found the required code unit, save the point where we
7021 found it, so that we don't search again next time round the bumpalong
7022 loop if the start hasn't yet passed this code unit. */
7023
7024 req_cu_ptr = p;
7025 }
7026 }
7027 }
7028 }
7029
7030 /* ------------ End of start of match optimizations ------------ */
7031
7032 /* Give no match if we have passed the bumpalong limit. */
7033
7034 if (start_match > bumpalong_limit)
7035 {
7036 rc = MATCH_NOMATCH;
7037 break;
7038 }
7039
7040 /* OK, we can now run the match. If "hitend" is set afterwards, remember the
7041 first starting point for which a partial match was found. */
7042
7043 cb.start_match = (PCRE2_SIZE)(start_match - subject);
7044 cb.callout_flags |= PCRE2_CALLOUT_STARTMATCH;
7045
7046 mb->start_used_ptr = start_match;
7047 mb->last_used_ptr = start_match;
7048 #ifdef SUPPORT_UNICODE
7049 mb->moptions = options | fragment_options;
7050 #else
7051 mb->moptions = options;
7052 #endif
7053 mb->match_call_count = 0;
7054 mb->end_offset_top = 0;
7055 mb->skip_arg_count = 0;
7056
7057 rc = match(start_match, mb->start_code, match_data->ovector,
7058 match_data->oveccount, re->top_bracket, frame_size, mb);
7059
7060 if (mb->hitend && start_partial == NULL)
7061 {
7062 start_partial = mb->start_used_ptr;
7063 match_partial = start_match;
7064 }
7065
7066 switch(rc)
7067 {
7068 /* If MATCH_SKIP_ARG reaches this level it means that a MARK that matched
7069 the SKIP's arg was not found. In this circumstance, Perl ignores the SKIP
7070 entirely. The only way we can do that is to re-do the match at the same
7071 point, with a flag to force SKIP with an argument to be ignored. Just
7072 treating this case as NOMATCH does not work because it does not check other
7073 alternatives in patterns such as A(*SKIP:A)B|AC when the subject is AC. */
7074
7075 case MATCH_SKIP_ARG:
7076 new_start_match = start_match;
7077 mb->ignore_skip_arg = mb->skip_arg_count;
7078 break;
7079
7080 /* SKIP passes back the next starting point explicitly, but if it is no
7081 greater than the match we have just done, treat it as NOMATCH. */
7082
7083 case MATCH_SKIP:
7084 if (mb->verb_skip_ptr > start_match)
7085 {
7086 new_start_match = mb->verb_skip_ptr;
7087 break;
7088 }
7089 /* Fall through */
7090
7091 /* NOMATCH and PRUNE advance by one character. THEN at this level acts
7092 exactly like PRUNE. Unset ignore SKIP-with-argument. */
7093
7094 case MATCH_NOMATCH:
7095 case MATCH_PRUNE:
7096 case MATCH_THEN:
7097 mb->ignore_skip_arg = 0;
7098 new_start_match = start_match + 1;
7099 #ifdef SUPPORT_UNICODE
7100 if (utf)
7101 ACROSSCHAR(new_start_match < end_subject, new_start_match,
7102 new_start_match++);
7103 #endif
7104 break;
7105
7106 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
7107
7108 case MATCH_COMMIT:
7109 rc = MATCH_NOMATCH;
7110 goto ENDLOOP;
7111
7112 /* Any other return is either a match, or some kind of error. */
7113
7114 default:
7115 goto ENDLOOP;
7116 }
7117
7118 /* Control reaches here for the various types of "no match at this point"
7119 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
7120
7121 rc = MATCH_NOMATCH;
7122
7123 /* If PCRE2_FIRSTLINE is set, the match must happen before or at the first
7124 newline in the subject (though it may continue over the newline). Therefore,
7125 if we have just failed to match, starting at a newline, do not continue. */
7126
7127 if (firstline && IS_NEWLINE(start_match)) break;
7128
7129 /* Advance to new matching position */
7130
7131 start_match = new_start_match;
7132
7133 /* Break the loop if the pattern is anchored or if we have passed the end of
7134 the subject. */
7135
7136 if (anchored || start_match > end_subject) break;
7137
7138 /* If we have just passed a CR and we are now at a LF, and the pattern does
7139 not contain any explicit matches for \r or \n, and the newline option is CRLF
7140 or ANY or ANYCRLF, advance the match position by one more code unit. In
7141 normal matching start_match will always be greater than the first position at
7142 this stage, but a failed *SKIP can cause a return at the same point, which is
7143 why the first test exists. */
7144
7145 if (start_match > subject + start_offset &&
7146 start_match[-1] == CHAR_CR &&
7147 start_match < end_subject &&
7148 *start_match == CHAR_NL &&
7149 (re->flags & PCRE2_HASCRORLF) == 0 &&
7150 (mb->nltype == NLTYPE_ANY ||
7151 mb->nltype == NLTYPE_ANYCRLF ||
7152 mb->nllen == 2))
7153 start_match++;
7154
7155 mb->mark = NULL; /* Reset for start of next match attempt */
7156 } /* End of for(;;) "bumpalong" loop */
7157
7158 /* ==========================================================================*/
7159
7160 /* When we reach here, one of the following stopping conditions is true:
7161
7162 (1) The match succeeded, either completely, or partially;
7163
7164 (2) The pattern is anchored or the match was failed after (*COMMIT);
7165
7166 (3) We are past the end of the subject or the bumpalong limit;
7167
7168 (4) PCRE2_FIRSTLINE is set and we have failed to match at a newline, because
7169 this option requests that a match occur at or before the first newline in
7170 the subject.
7171
7172 (5) Some kind of error occurred.
7173
7174 */
7175
7176 ENDLOOP:
7177
7178 /* If end_subject != true_end_subject, it means we are handling invalid UTF,
7179 and have just processed a non-terminal fragment. If this resulted in no match
7180 or a partial match we must carry on to the next fragment (a partial match is
7181 returned to the caller only at the very end of the subject). A loop is used to
7182 avoid trying to match against empty fragments; if the pattern can match an
7183 empty string it would have done so already. */
7184
7185 #ifdef SUPPORT_UNICODE
7186 if (utf && end_subject != true_end_subject &&
7187 (rc == MATCH_NOMATCH || rc == PCRE2_ERROR_PARTIAL))
7188 {
7189 for (;;)
7190 {
7191 /* Advance past the first bad code unit, and then skip invalid character
7192 starting code units in 8-bit and 16-bit modes. */
7193
7194 start_match = end_subject + 1;
7195
7196 #if PCRE2_CODE_UNIT_WIDTH != 32
7197 while (start_match < true_end_subject && NOT_FIRSTCU(*start_match))
7198 start_match++;
7199 #endif
7200
7201 /* If we have hit the end of the subject, there isn't another non-empty
7202 fragment, so give up. */
7203
7204 if (start_match >= true_end_subject)
7205 {
7206 rc = MATCH_NOMATCH; /* In case it was partial */
7207 break;
7208 }
7209
7210 /* Check the rest of the subject */
7211
7212 mb->check_subject = start_match;
7213 rc = PRIV(valid_utf)(start_match, length - (start_match - subject),
7214 &(match_data->startchar));
7215
7216 /* The rest of the subject is valid UTF. */
7217
7218 if (rc == 0)
7219 {
7220 mb->end_subject = end_subject = true_end_subject;
7221 fragment_options = PCRE2_NOTBOL;
7222 goto FRAGMENT_RESTART;
7223 }
7224
7225 /* A subsequent UTF error has been found; if the next fragment is
7226 non-empty, set up to process it. Otherwise, let the loop advance. */
7227
7228 else if (rc < 0)
7229 {
7230 mb->end_subject = end_subject = start_match + match_data->startchar;
7231 if (end_subject > start_match)
7232 {
7233 fragment_options = PCRE2_NOTBOL|PCRE2_NOTEOL;
7234 goto FRAGMENT_RESTART;
7235 }
7236 }
7237 }
7238 }
7239 #endif /* SUPPORT_UNICODE */
7240
7241 /* Release an enlarged frame vector that is on the heap. */
7242
7243 if (mb->match_frames != mb->stack_frames)
7244 mb->memctl.free(mb->match_frames, mb->memctl.memory_data);
7245
7246 /* Fill in fields that are always returned in the match data. */
7247
7248 match_data->code = re;
7249 match_data->mark = mb->mark;
7250 match_data->matchedby = PCRE2_MATCHEDBY_INTERPRETER;
7251
7252 /* Handle a fully successful match. Set the return code to the number of
7253 captured strings, or 0 if there were too many to fit into the ovector, and then
7254 set the remaining returned values before returning. Make a copy of the subject
7255 string if requested. */
7256
7257 if (rc == MATCH_MATCH)
7258 {
7259 match_data->rc = ((int)mb->end_offset_top >= 2 * match_data->oveccount)?
7260 0 : (int)mb->end_offset_top/2 + 1;
7261 match_data->startchar = start_match - subject;
7262 match_data->leftchar = mb->start_used_ptr - subject;
7263 match_data->rightchar = ((mb->last_used_ptr > mb->end_match_ptr)?
7264 mb->last_used_ptr : mb->end_match_ptr) - subject;
7265 if ((options & PCRE2_COPY_MATCHED_SUBJECT) != 0)
7266 {
7267 length = CU2BYTES(length + was_zero_terminated);
7268 match_data->subject = match_data->memctl.malloc(length,
7269 match_data->memctl.memory_data);
7270 if (match_data->subject == NULL) return PCRE2_ERROR_NOMEMORY;
7271 memcpy((void *)match_data->subject, subject, length);
7272 match_data->flags |= PCRE2_MD_COPIED_SUBJECT;
7273 }
7274 else match_data->subject = subject;
7275 return match_data->rc;
7276 }
7277
7278 /* Control gets here if there has been a partial match, an error, or if the
7279 overall match attempt has failed at all permitted starting positions. Any mark
7280 data is in the nomatch_mark field. */
7281
7282 match_data->mark = mb->nomatch_mark;
7283
7284 /* For anything other than nomatch or partial match, just return the code. */
7285
7286 if (rc != MATCH_NOMATCH && rc != PCRE2_ERROR_PARTIAL) match_data->rc = rc;
7287
7288 /* Handle a partial match. If a "soft" partial match was requested, searching
7289 for a complete match will have continued, and the value of rc at this point
7290 will be MATCH_NOMATCH. For a "hard" partial match, it will already be
7291 PCRE2_ERROR_PARTIAL. */
7292
7293 else if (match_partial != NULL)
7294 {
7295 match_data->subject = subject;
7296 match_data->ovector[0] = match_partial - subject;
7297 match_data->ovector[1] = end_subject - subject;
7298 match_data->startchar = match_partial - subject;
7299 match_data->leftchar = start_partial - subject;
7300 match_data->rightchar = end_subject - subject;
7301 match_data->rc = PCRE2_ERROR_PARTIAL;
7302 }
7303
7304 /* Else this is the classic nomatch case. */
7305
7306 else match_data->rc = PCRE2_ERROR_NOMATCH;
7307
7308 return match_data->rc;
7309 }
7310
7311 /* End of pcre2_match.c */
7312