1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Original API code Copyright (c) 1997-2012 University of Cambridge
10 New API code Copyright (c) 2016 University of Cambridge
11
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
15
16 * Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
18
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
22
23 * Neither the name of the University of Cambridge nor the names of its
24 contributors may be used to endorse or promote products derived from
25 this software without specific prior written permission.
26
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
39 */
40
41
42 /* This module contains the external function pcre2_dfa_match(), which is an
43 alternative matching function that uses a sort of DFA algorithm (not a true
44 FSM). This is NOT Perl-compatible, but it has advantages in certain
45 applications. */
46
47
48 /* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
49 the performance of his patterns greatly. I could not use it as it stood, as it
50 was not thread safe, and made assumptions about pattern sizes. Also, it caused
51 test 7 to loop, and test 9 to crash with a segfault.
52
53 The issue is the check for duplicate states, which is done by a simple linear
54 search up the state list. (Grep for "duplicate" below to find the code.) For
55 many patterns, there will never be many states active at one time, so a simple
56 linear search is fine. In patterns that have many active states, it might be a
57 bottleneck. The suggested code used an indexing scheme to remember which states
58 had previously been used for each character, and avoided the linear search when
59 it knew there was no chance of a duplicate. This was implemented when adding
60 states to the state lists.
61
62 I wrote some thread-safe, not-limited code to try something similar at the time
63 of checking for duplicates (instead of when adding states), using index vectors
64 on the stack. It did give a 13% improvement with one specially constructed
65 pattern for certain subject strings, but on other strings and on many of the
66 simpler patterns in the test suite it did worse. The major problem, I think,
67 was the extra time to initialize the index. This had to be done for each call
68 of internal_dfa_match(). (The supplied patch used a static vector, initialized
69 only once - I suspect this was the cause of the problems with the tests.)
70
71 Overall, I concluded that the gains in some cases did not outweigh the losses
72 in others, so I abandoned this code. */
73
74
75 #ifdef HAVE_CONFIG_H
76 #include "config.h"
77 #endif
78
79 #define NLBLOCK mb /* Block containing newline information */
80 #define PSSTART start_subject /* Field containing processed string start */
81 #define PSEND end_subject /* Field containing processed string end */
82
83 #include "pcre2_internal.h"
84
85 #define PUBLIC_DFA_MATCH_OPTIONS \
86 (PCRE2_ANCHORED|PCRE2_NOTBOL|PCRE2_NOTEOL|PCRE2_NOTEMPTY| \
87 PCRE2_NOTEMPTY_ATSTART|PCRE2_NO_UTF_CHECK|PCRE2_PARTIAL_HARD| \
88 PCRE2_PARTIAL_SOFT|PCRE2_DFA_SHORTEST|PCRE2_DFA_RESTART)
89
90
91 /*************************************************
92 * Code parameters and static tables *
93 *************************************************/
94
95 /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
96 into others, under special conditions. A gap of 20 between the blocks should be
97 enough. The resulting opcodes don't have to be less than 256 because they are
98 never stored, so we push them well clear of the normal opcodes. */
99
100 #define OP_PROP_EXTRA 300
101 #define OP_EXTUNI_EXTRA 320
102 #define OP_ANYNL_EXTRA 340
103 #define OP_HSPACE_EXTRA 360
104 #define OP_VSPACE_EXTRA 380
105
106
107 /* This table identifies those opcodes that are followed immediately by a
108 character that is to be tested in some way. This makes it possible to
109 centralize the loading of these characters. In the case of Type * etc, the
110 "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
111 small value. Non-zero values in the table are the offsets from the opcode where
112 the character is to be found. ***NOTE*** If the start of this table is
113 modified, the three tables that follow must also be modified. */
114
115 static const uint8_t coptable[] = {
116 0, /* End */
117 0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */
118 0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */
119 0, 0, 0, /* Any, AllAny, Anybyte */
120 0, 0, /* \P, \p */
121 0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */
122 0, /* \X */
123 0, 0, 0, 0, 0, 0, /* \Z, \z, $, $M, ^, ^M */
124 1, /* Char */
125 1, /* Chari */
126 1, /* not */
127 1, /* noti */
128 /* Positive single-char repeats */
129 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
130 1+IMM2_SIZE, 1+IMM2_SIZE, /* upto, minupto */
131 1+IMM2_SIZE, /* exact */
132 1, 1, 1, 1+IMM2_SIZE, /* *+, ++, ?+, upto+ */
133 1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */
134 1+IMM2_SIZE, 1+IMM2_SIZE, /* upto I, minupto I */
135 1+IMM2_SIZE, /* exact I */
136 1, 1, 1, 1+IMM2_SIZE, /* *+I, ++I, ?+I, upto+I */
137 /* Negative single-char repeats - only for chars < 256 */
138 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
139 1+IMM2_SIZE, 1+IMM2_SIZE, /* NOT upto, minupto */
140 1+IMM2_SIZE, /* NOT exact */
141 1, 1, 1, 1+IMM2_SIZE, /* NOT *+, ++, ?+, upto+ */
142 1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */
143 1+IMM2_SIZE, 1+IMM2_SIZE, /* NOT upto I, minupto I */
144 1+IMM2_SIZE, /* NOT exact I */
145 1, 1, 1, 1+IMM2_SIZE, /* NOT *+I, ++I, ?+I, upto+I */
146 /* Positive type repeats */
147 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
148 1+IMM2_SIZE, 1+IMM2_SIZE, /* Type upto, minupto */
149 1+IMM2_SIZE, /* Type exact */
150 1, 1, 1, 1+IMM2_SIZE, /* Type *+, ++, ?+, upto+ */
151 /* Character class & ref repeats */
152 0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
153 0, 0, /* CRRANGE, CRMINRANGE */
154 0, 0, 0, 0, /* Possessive *+, ++, ?+, CRPOSRANGE */
155 0, /* CLASS */
156 0, /* NCLASS */
157 0, /* XCLASS - variable length */
158 0, /* REF */
159 0, /* REFI */
160 0, /* DNREF */
161 0, /* DNREFI */
162 0, /* RECURSE */
163 0, /* CALLOUT */
164 0, /* CALLOUT_STR */
165 0, /* Alt */
166 0, /* Ket */
167 0, /* KetRmax */
168 0, /* KetRmin */
169 0, /* KetRpos */
170 0, /* Reverse */
171 0, /* Assert */
172 0, /* Assert not */
173 0, /* Assert behind */
174 0, /* Assert behind not */
175 0, 0, /* ONCE, ONCE_NC */
176 0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */
177 0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
178 0, 0, /* CREF, DNCREF */
179 0, 0, /* RREF, DNRREF */
180 0, 0, /* FALSE, TRUE */
181 0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */
182 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */
183 0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */
184 0, 0, 0, 0, /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */
185 0, 0, 0 /* CLOSE, SKIPZERO, DEFINE */
186 };
187
188 /* This table identifies those opcodes that inspect a character. It is used to
189 remember the fact that a character could have been inspected when the end of
190 the subject is reached. ***NOTE*** If the start of this table is modified, the
191 two tables that follow must also be modified. */
192
/* poptable: indexed by opcode value. A non-zero entry marks an opcode that
inspects a subject character; the main loop uses it to set could_continue
when such an opcode is reached at the end of the subject. The number of
entries must equal OP_TABLE_LENGTH; this is checked at compile time by a
dummy case in the main switch. */

static const uint8_t poptable[] = {
  /* End and the assertions that look at no character */
  0,                             /* End */
  0, 0, 0,                       /* \A, \G, \K */

  /* Opcodes that inspect a character */
  1, 1,                          /* \B, \b */
  1, 1, 1, 1, 1, 1,              /* \D, \d, \S, \s, \W, \w */
  1, 1, 1,                       /* Any, AllAny, Anybyte */
  1, 1,                          /* \P, \p */
  1, 1, 1, 1, 1,                 /* \R, \H, \h, \V, \v */
  1,                             /* \X */

  /* Positional assertions */
  0, 0, 0, 0, 0, 0,              /* \Z, \z, $, $M, ^, ^M */

  /* Single characters */
  1, 1, 1, 1,                    /* Char, Chari, not, noti */

  /* Positive single-char repeats */
  1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ?? */
  1, 1, 1,                       /* upto, minupto, exact */
  1, 1, 1, 1,                    /* *+, ++, ?+, upto+ */
  1, 1, 1, 1, 1, 1,              /* *I, *?I, +I, +?I, ?I, ??I */
  1, 1, 1,                       /* upto I, minupto I, exact I */
  1, 1, 1, 1,                    /* *+I, ++I, ?+I, upto+I */

  /* Negative single-char repeats - only for chars < 256 */
  1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ?? */
  1, 1, 1,                       /* NOT upto, minupto, exact */
  1, 1, 1, 1,                    /* NOT *+, ++, ?+, upto+ */
  1, 1, 1, 1, 1, 1,              /* NOT *I, *?I, +I, +?I, ?I, ??I */
  1, 1, 1,                       /* NOT upto I, minupto I, exact I */
  1, 1, 1, 1,                    /* NOT *+I, ++I, ?+I, upto+I */

  /* Positive type repeats */
  1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ?? */
  1, 1, 1,                       /* Type upto, minupto, exact */
  1, 1, 1, 1,                    /* Type *+, ++, ?+, upto+ */

  /* Character class & ref repeats */
  1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ?? */
  1, 1,                          /* CRRANGE, CRMINRANGE */
  1, 1, 1, 1,                    /* Possessive *+, ++, ?+, CRPOSRANGE */
  1, 1, 1,                       /* CLASS, NCLASS, XCLASS (variable length) */

  /* None of the remaining opcodes is flagged */
  0, 0, 0, 0,                    /* REF, REFI, DNREF, DNREFI */
  0, 0, 0,                       /* RECURSE, CALLOUT, CALLOUT_STR */
  0, 0, 0, 0, 0,                 /* Alt, Ket, KetRmax, KetRmin, KetRpos */
  0,                             /* Reverse */
  0, 0,                          /* Assert, Assert not */
  0, 0,                          /* Assert behind, Assert behind not */
  0, 0,                          /* ONCE, ONCE_NC */
  0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND */
  0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
  0, 0, 0, 0,                    /* CREF, DNCREF, RREF, DNRREF */
  0, 0,                          /* FALSE, TRUE */
  0, 0, 0,                       /* BRAZERO, BRAMINZERO, BRAPOSZERO */
  0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG */
  0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG */
  0, 0, 0, 0,                    /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */
  0, 0, 0                        /* CLOSE, SKIPZERO, DEFINE */
};
260
261 /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
262 and \w */
263
264 static const uint8_t toptable1[] = {
265 0, 0, 0, 0, 0, 0,
266 ctype_digit, ctype_digit,
267 ctype_space, ctype_space,
268 ctype_word, ctype_word,
269 0, 0 /* OP_ANY, OP_ALLANY */
270 };
271
272 static const uint8_t toptable2[] = {
273 0, 0, 0, 0, 0, 0,
274 ctype_digit, 0,
275 ctype_space, 0,
276 ctype_word, 0,
277 1, 1 /* OP_ANY, OP_ALLANY */
278 };
279
280
281 /* Structure for holding data about a particular state, which is in effect the
282 current data for an active path through the match tree. It must consist
283 entirely of ints because the working vector we are passed, and which we put
284 these structures in, is a vector of ints. */
285
/* One state, that is, the current data for one active path through the match
tree. Every member must be an int, because these blocks are stored directly in
the workspace vector that the caller passes in, and that is a vector of ints. */

typedef struct stateblock {
  int offset;                     /* Offset to opcode (-ve means "held off") */
  int count;                      /* Count for repeats */
  int data;                       /* Extra data used by some states */
} stateblock;

/* Number of ints occupied by one stateblock. The expansion is fully
parenthesized so that it acts as a single operand wherever it is used. */

#define INTS_PER_STATEBLOCK ((int)(sizeof(stateblock)/sizeof(int)))
293
294
295
296 /*************************************************
297 * Match a Regular Expression - DFA engine *
298 *************************************************/
299
300 /* This internal function applies a compiled pattern to a subject string,
301 starting at a given point, using a DFA engine. This function is called from the
302 external one, possibly multiple times if the pattern is not anchored. The
303 function calls itself recursively for some kinds of subpattern.
304
305 Arguments:
306 mb the match_data block with fixed information
307 this_start_code the opening bracket of this subexpression's code
308 current_subject where we currently are in the subject string
309 start_offset start offset in the subject string
310 offsets vector to contain the matching string offsets
311 offsetcount size of same
312 workspace vector of workspace
313 wscount size of same
314 rlevel function call recursion level
315
316 Returns: > 0 => number of match offset pairs placed in offsets
317 = 0 => offsets overflowed; longest matches are present
318 -1 => failed to match
319 < -1 => some kind of unexpected problem
320
321 The following macros are used for adding states to the two state vectors (one
322 for the current character, one for the following character). */
323
/* Append a state with offset x and count y to the active list (the data field
is not written). If the list is already full, the enclosing function returns
PCRE2_ERROR_DFA_WSSIZE. Use as a statement, followed by a semicolon. */

#define ADD_ACTIVE(x,y) \
  do \
    { \
    if (active_count++ >= wscount) return PCRE2_ERROR_DFA_WSSIZE; \
    next_active_state->offset = (x); \
    next_active_state->count = (y); \
    next_active_state++; \
    } \
  while (0)
332
/* Append a state with offset x, count y and data z to the active list. If the
list is already full, the enclosing function returns PCRE2_ERROR_DFA_WSSIZE.
Use as a statement, followed by a semicolon. */

#define ADD_ACTIVE_DATA(x,y,z) \
  do \
    { \
    if (active_count++ >= wscount) return PCRE2_ERROR_DFA_WSSIZE; \
    next_active_state->offset = (x); \
    next_active_state->count = (y); \
    next_active_state->data = (z); \
    next_active_state++; \
    } \
  while (0)
342
/* Append a state with offset x and count y to the new list, which holds the
states for the following character (the data field is not written). If the
list is already full, the enclosing function returns PCRE2_ERROR_DFA_WSSIZE.
Use as a statement, followed by a semicolon. */

#define ADD_NEW(x,y) \
  do \
    { \
    if (new_count++ >= wscount) return PCRE2_ERROR_DFA_WSSIZE; \
    next_new_state->offset = (x); \
    next_new_state->count = (y); \
    next_new_state++; \
    } \
  while (0)
351
/* Append a state with offset x, count y and data z to the new list, which
holds the states for the following character. If the list is already full, the
enclosing function returns PCRE2_ERROR_DFA_WSSIZE. Use as a statement,
followed by a semicolon. */

#define ADD_NEW_DATA(x,y,z) \
  do \
    { \
    if (new_count++ >= wscount) return PCRE2_ERROR_DFA_WSSIZE; \
    next_new_state->offset = (x); \
    next_new_state->count = (y); \
    next_new_state->data = (z); \
    next_new_state++; \
    } \
  while (0)
361
362 /* And now, here is the code */
363
364 static int
internal_dfa_match(dfa_match_block * mb,PCRE2_SPTR this_start_code,PCRE2_SPTR current_subject,PCRE2_SIZE start_offset,PCRE2_SIZE * offsets,uint32_t offsetcount,int * workspace,int wscount,int rlevel)365 internal_dfa_match(
366 dfa_match_block *mb,
367 PCRE2_SPTR this_start_code,
368 PCRE2_SPTR current_subject,
369 PCRE2_SIZE start_offset,
370 PCRE2_SIZE *offsets,
371 uint32_t offsetcount,
372 int *workspace,
373 int wscount,
374 int rlevel)
375 {
376 stateblock *active_states, *new_states, *temp_states;
377 stateblock *next_active_state, *next_new_state;
378
379 const uint8_t *ctypes, *lcc, *fcc;
380 PCRE2_SPTR ptr;
381 PCRE2_SPTR end_code;
382 PCRE2_SPTR first_op;
383
384 dfa_recursion_info new_recursive;
385
386 int active_count, new_count, match_count;
387
388 /* Some fields in the mb block are frequently referenced, so we load them into
389 independent variables in the hope that this will perform better. */
390
391 PCRE2_SPTR start_subject = mb->start_subject;
392 PCRE2_SPTR end_subject = mb->end_subject;
393 PCRE2_SPTR start_code = mb->start_code;
394
395 #ifdef SUPPORT_UNICODE
396 BOOL utf = (mb->poptions & PCRE2_UTF) != 0;
397 #else
398 BOOL utf = FALSE;
399 #endif
400
401 BOOL reset_could_continue = FALSE;
402
403 rlevel++;
404 offsetcount &= (uint32_t)(-2); /* Round down */
405
406 wscount -= 2;
407 wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
408 (2 * INTS_PER_STATEBLOCK);
409
410 ctypes = mb->tables + ctypes_offset;
411 lcc = mb->tables + lcc_offset;
412 fcc = mb->tables + fcc_offset;
413
414 match_count = PCRE2_ERROR_NOMATCH; /* A negative number */
415
416 active_states = (stateblock *)(workspace + 2);
417 next_new_state = new_states = active_states + wscount;
418 new_count = 0;
419
420 first_op = this_start_code + 1 + LINK_SIZE +
421 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
422 *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
423 ? IMM2_SIZE:0);
424
425 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
426 the alternative states onto the list, and find out where the end is. This
makes it possible to use this function recursively, when we want to stop at a
428 matching internal ket rather than at the end.
429
430 If the first opcode in the first alternative is OP_REVERSE, we are dealing with
431 a backward assertion. In that case, we have to find out the maximum amount to
432 move back, and set up each alternative appropriately. */
433
434 if (*first_op == OP_REVERSE)
435 {
436 size_t max_back = 0;
437 size_t gone_back;
438
439 end_code = this_start_code;
440 do
441 {
442 size_t back = (size_t)GET(end_code, 2+LINK_SIZE);
443 if (back > max_back) max_back = back;
444 end_code += GET(end_code, 1);
445 }
446 while (*end_code == OP_ALT);
447
448 /* If we can't go back the amount required for the longest lookbehind
449 pattern, go back as far as we can; some alternatives may still be viable. */
450
451 #ifdef SUPPORT_UNICODE
452 /* In character mode we have to step back character by character */
453
454 if (utf)
455 {
456 for (gone_back = 0; gone_back < max_back; gone_back++)
457 {
458 if (current_subject <= start_subject) break;
459 current_subject--;
460 ACROSSCHAR(current_subject > start_subject, *current_subject, current_subject--);
461 }
462 }
463 else
464 #endif
465
466 /* In byte-mode we can do this quickly. */
467
468 {
469 size_t current_offset = (size_t)(current_subject - start_subject);
470 gone_back = (current_offset < max_back)? current_offset : max_back;
471 current_subject -= gone_back;
472 }
473
474 /* Save the earliest consulted character */
475
476 if (current_subject < mb->start_used_ptr)
477 mb->start_used_ptr = current_subject;
478
479 /* Now we can process the individual branches. */
480
481 end_code = this_start_code;
482 do
483 {
484 size_t back = (size_t)GET(end_code, 2+LINK_SIZE);
485 if (back <= gone_back)
486 {
487 int bstate = (int)(end_code - start_code + 2 + 2*LINK_SIZE);
488 ADD_NEW_DATA(-bstate, 0, (int)(gone_back - back));
489 }
490 end_code += GET(end_code, 1);
491 }
492 while (*end_code == OP_ALT);
493 }
494
495 /* This is the code for a "normal" subpattern (not a backward assertion). The
496 start of a whole pattern is always one of these. If we are at the top level,
497 we may be asked to restart matching from the same point that we reached for a
498 previous partial match. We still have to scan through the top-level branches to
499 find the end state. */
500
501 else
502 {
503 end_code = this_start_code;
504
505 /* Restarting */
506
507 if (rlevel == 1 && (mb->moptions & PCRE2_DFA_RESTART) != 0)
508 {
509 do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
510 new_count = workspace[1];
511 if (!workspace[0])
512 memcpy(new_states, active_states, (size_t)new_count * sizeof(stateblock));
513 }
514
515 /* Not restarting */
516
517 else
518 {
519 int length = 1 + LINK_SIZE +
520 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
521 *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
522 ? IMM2_SIZE:0);
523 do
524 {
525 ADD_NEW((int)(end_code - start_code + length), 0);
526 end_code += GET(end_code, 1);
527 length = 1 + LINK_SIZE;
528 }
529 while (*end_code == OP_ALT);
530 }
531 }
532
533 workspace[0] = 0; /* Bit indicating which vector is current */
534
535 /* Loop for scanning the subject */
536
537 ptr = current_subject;
538 for (;;)
539 {
540 int i, j;
541 int clen, dlen;
542 uint32_t c, d;
543 int forced_fail = 0;
544 BOOL partial_newline = FALSE;
545 BOOL could_continue = reset_could_continue;
546 reset_could_continue = FALSE;
547
548 if (ptr > mb->last_used_ptr) mb->last_used_ptr = ptr;
549
550 /* Make the new state list into the active state list and empty the
551 new state list. */
552
553 temp_states = active_states;
554 active_states = new_states;
555 new_states = temp_states;
556 active_count = new_count;
557 new_count = 0;
558
559 workspace[0] ^= 1; /* Remember for the restarting feature */
560 workspace[1] = active_count;
561
562 /* Set the pointers for adding new states */
563
564 next_active_state = active_states + active_count;
565 next_new_state = new_states;
566
567 /* Load the current character from the subject outside the loop, as many
568 different states may want to look at it, and we assume that at least one
569 will. */
570
571 if (ptr < end_subject)
572 {
573 clen = 1; /* Number of data items in the character */
574 #ifdef SUPPORT_UNICODE
575 GETCHARLENTEST(c, ptr, clen);
576 #else
577 c = *ptr;
578 #endif /* SUPPORT_UNICODE */
579 }
580 else
581 {
582 clen = 0; /* This indicates the end of the subject */
583 c = NOTACHAR; /* This value should never actually be used */
584 }
585
586 /* Scan up the active states and act on each one. The result of an action
587 may be to add more states to the currently active list (e.g. on hitting a
588 parenthesis) or it may be to put states on the new list, for considering
589 when we move the character pointer on. */
590
591 for (i = 0; i < active_count; i++)
592 {
593 stateblock *current_state = active_states + i;
594 BOOL caseless = FALSE;
595 PCRE2_SPTR code;
596 uint32_t codevalue;
597 int state_offset = current_state->offset;
598 int rrc;
599 int count;
600
601 /* A negative offset is a special case meaning "hold off going to this
602 (negated) state until the number of characters in the data field have
603 been skipped". If the could_continue flag was passed over from a previous
state, arrange for it to be passed on. */
605
606 if (state_offset < 0)
607 {
608 if (current_state->data > 0)
609 {
610 ADD_NEW_DATA(state_offset, current_state->count,
611 current_state->data - 1);
612 if (could_continue) reset_could_continue = TRUE;
613 continue;
614 }
615 else
616 {
617 current_state->offset = state_offset = -state_offset;
618 }
619 }
620
621 /* Check for a duplicate state with the same count, and skip if found.
622 See the note at the head of this module about the possibility of improving
623 performance here. */
624
625 for (j = 0; j < i; j++)
626 {
627 if (active_states[j].offset == state_offset &&
628 active_states[j].count == current_state->count)
629 goto NEXT_ACTIVE_STATE;
630 }
631
632 /* The state offset is the offset to the opcode */
633
634 code = start_code + state_offset;
635 codevalue = *code;
636
637 /* If this opcode inspects a character, but we are at the end of the
638 subject, remember the fact for use when testing for a partial match. */
639
640 if (clen == 0 && poptable[codevalue] != 0)
641 could_continue = TRUE;
642
643 /* If this opcode is followed by an inline character, load it. It is
644 tempting to test for the presence of a subject character here, but that
645 is wrong, because sometimes zero repetitions of the subject are
646 permitted.
647
648 We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
649 argument that is not a data character - but is always one byte long because
650 the values are small. We have to take special action to deal with \P, \p,
651 \H, \h, \V, \v and \X in this case. To keep the other cases fast, convert
652 these ones to new opcodes. */
653
654 if (coptable[codevalue] > 0)
655 {
656 dlen = 1;
657 #ifdef SUPPORT_UNICODE
658 if (utf) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
659 #endif /* SUPPORT_UNICODE */
660 d = code[coptable[codevalue]];
661 if (codevalue >= OP_TYPESTAR)
662 {
663 switch(d)
664 {
665 case OP_ANYBYTE: return PCRE2_ERROR_DFA_UITEM;
666 case OP_NOTPROP:
667 case OP_PROP: codevalue += OP_PROP_EXTRA; break;
668 case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
669 case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
670 case OP_NOT_HSPACE:
671 case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
672 case OP_NOT_VSPACE:
673 case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
674 default: break;
675 }
676 }
677 }
678 else
679 {
680 dlen = 0; /* Not strictly necessary, but compilers moan */
681 d = NOTACHAR; /* if these variables are not set. */
682 }
683
684
685 /* Now process the individual opcodes */
686
687 switch (codevalue)
688 {
689 /* ========================================================================== */
690 /* These cases are never obeyed. This is a fudge that causes a compile-
691 time error if the vectors coptable or poptable, which are indexed by
692 opcode, are not the correct length. It seems to be the only way to do
693 such a check at compile time, as the sizeof() operator does not work
694 in the C preprocessor. */
695
696 case OP_TABLE_LENGTH:
697 case OP_TABLE_LENGTH +
698 ((sizeof(coptable) == OP_TABLE_LENGTH) &&
699 (sizeof(poptable) == OP_TABLE_LENGTH)):
700 break;
701
702 /* ========================================================================== */
703 /* Reached a closing bracket. If not at the end of the pattern, carry
704 on with the next opcode. For repeating opcodes, also add the repeat
705 state. Note that KETRPOS will always be encountered at the end of the
706 subpattern, because the possessive subpattern repeats are always handled
707 using recursive calls. Thus, it never adds any new states.
708
709 At the end of the (sub)pattern, unless we have an empty string and
710 PCRE2_NOTEMPTY is set, or PCRE2_NOTEMPTY_ATSTART is set and we are at the
711 start of the subject, save the match data, shifting up all previous
712 matches so we always have the longest first. */
713
714 case OP_KET:
715 case OP_KETRMIN:
716 case OP_KETRMAX:
717 case OP_KETRPOS:
718 if (code != end_code)
719 {
720 ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
721 if (codevalue != OP_KET)
722 {
723 ADD_ACTIVE(state_offset - (int)GET(code, 1), 0);
724 }
725 }
726 else
727 {
728 if (ptr > current_subject ||
729 ((mb->moptions & PCRE2_NOTEMPTY) == 0 &&
730 ((mb->moptions & PCRE2_NOTEMPTY_ATSTART) == 0 ||
731 current_subject > start_subject + mb->start_offset)))
732 {
733 if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
734 else if (match_count > 0 && ++match_count * 2 > (int)offsetcount)
735 match_count = 0;
736 count = ((match_count == 0)? (int)offsetcount : match_count * 2) - 2;
737 if (count > 0) memmove(offsets + 2, offsets,
738 (size_t)count * sizeof(PCRE2_SIZE));
739 if (offsetcount >= 2)
740 {
741 offsets[0] = (PCRE2_SIZE)(current_subject - start_subject);
742 offsets[1] = (PCRE2_SIZE)(ptr - start_subject);
743 }
744 if ((mb->moptions & PCRE2_DFA_SHORTEST) != 0) return match_count;
745 }
746 }
747 break;
748
749 /* ========================================================================== */
750 /* These opcodes add to the current list of states without looking
751 at the current character. */
752
753 /*-----------------------------------------------------------------*/
754 case OP_ALT:
755 do { code += GET(code, 1); } while (*code == OP_ALT);
756 ADD_ACTIVE((int)(code - start_code), 0);
757 break;
758
759 /*-----------------------------------------------------------------*/
760 case OP_BRA:
761 case OP_SBRA:
762 do
763 {
764 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
765 code += GET(code, 1);
766 }
767 while (*code == OP_ALT);
768 break;
769
770 /*-----------------------------------------------------------------*/
771 case OP_CBRA:
772 case OP_SCBRA:
773 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE + IMM2_SIZE), 0);
774 code += GET(code, 1);
775 while (*code == OP_ALT)
776 {
777 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
778 code += GET(code, 1);
779 }
780 break;
781
782 /*-----------------------------------------------------------------*/
783 case OP_BRAZERO:
784 case OP_BRAMINZERO:
785 ADD_ACTIVE(state_offset + 1, 0);
786 code += 1 + GET(code, 2);
787 while (*code == OP_ALT) code += GET(code, 1);
788 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
789 break;
790
791 /*-----------------------------------------------------------------*/
792 case OP_SKIPZERO:
793 code += 1 + GET(code, 2);
794 while (*code == OP_ALT) code += GET(code, 1);
795 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
796 break;
797
798 /*-----------------------------------------------------------------*/
799 case OP_CIRC:
800 if (ptr == start_subject && (mb->moptions & PCRE2_NOTBOL) == 0)
801 { ADD_ACTIVE(state_offset + 1, 0); }
802 break;
803
804 /*-----------------------------------------------------------------*/
805 case OP_CIRCM:
806 if ((ptr == start_subject && (mb->moptions & PCRE2_NOTBOL) == 0) ||
807 ((ptr != end_subject || (mb->poptions & PCRE2_ALT_CIRCUMFLEX) != 0 )
808 && WAS_NEWLINE(ptr)))
809 { ADD_ACTIVE(state_offset + 1, 0); }
810 break;
811
812 /*-----------------------------------------------------------------*/
813 case OP_EOD:
814 if (ptr >= end_subject)
815 {
816 if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
817 could_continue = TRUE;
818 else { ADD_ACTIVE(state_offset + 1, 0); }
819 }
820 break;
821
822 /*-----------------------------------------------------------------*/
823 case OP_SOD:
824 if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
825 break;
826
827 /*-----------------------------------------------------------------*/
828 case OP_SOM:
829 if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
830 break;
831
832
833 /* ========================================================================== */
834 /* These opcodes inspect the next subject character, and sometimes
835 the previous one as well, but do not have an argument. The variable
836 clen contains the length of the current character and is zero if we are
837 at the end of the subject. */
838
839 /*-----------------------------------------------------------------*/
840 case OP_ANY:
841 if (clen > 0 && !IS_NEWLINE(ptr))
842 {
843 if (ptr + 1 >= mb->end_subject &&
844 (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
845 NLBLOCK->nltype == NLTYPE_FIXED &&
846 NLBLOCK->nllen == 2 &&
847 c == NLBLOCK->nl[0])
848 {
849 could_continue = partial_newline = TRUE;
850 }
851 else
852 {
853 ADD_NEW(state_offset + 1, 0);
854 }
855 }
856 break;
857
858 /*-----------------------------------------------------------------*/
859 case OP_ALLANY:
860 if (clen > 0)
861 { ADD_NEW(state_offset + 1, 0); }
862 break;
863
864 /*-----------------------------------------------------------------*/
865 case OP_EODN:
866 if (clen == 0 && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
867 could_continue = TRUE;
868 else if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - mb->nllen))
869 { ADD_ACTIVE(state_offset + 1, 0); }
870 break;
871
872 /*-----------------------------------------------------------------*/
873 case OP_DOLL:
874 if ((mb->moptions & PCRE2_NOTEOL) == 0)
875 {
876 if (clen == 0 && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
877 could_continue = TRUE;
878 else if (clen == 0 ||
879 ((mb->poptions & PCRE2_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
880 (ptr == end_subject - mb->nllen)
881 ))
882 { ADD_ACTIVE(state_offset + 1, 0); }
883 else if (ptr + 1 >= mb->end_subject &&
884 (mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0 &&
885 NLBLOCK->nltype == NLTYPE_FIXED &&
886 NLBLOCK->nllen == 2 &&
887 c == NLBLOCK->nl[0])
888 {
889 if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
890 {
891 reset_could_continue = TRUE;
892 ADD_NEW_DATA(-(state_offset + 1), 0, 1);
893 }
894 else could_continue = partial_newline = TRUE;
895 }
896 }
897 break;
898
899 /*-----------------------------------------------------------------*/
900 case OP_DOLLM:
901 if ((mb->moptions & PCRE2_NOTEOL) == 0)
902 {
903 if (clen == 0 && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
904 could_continue = TRUE;
905 else if (clen == 0 ||
906 ((mb->poptions & PCRE2_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr)))
907 { ADD_ACTIVE(state_offset + 1, 0); }
908 else if (ptr + 1 >= mb->end_subject &&
909 (mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0 &&
910 NLBLOCK->nltype == NLTYPE_FIXED &&
911 NLBLOCK->nllen == 2 &&
912 c == NLBLOCK->nl[0])
913 {
914 if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
915 {
916 reset_could_continue = TRUE;
917 ADD_NEW_DATA(-(state_offset + 1), 0, 1);
918 }
919 else could_continue = partial_newline = TRUE;
920 }
921 }
922 else if (IS_NEWLINE(ptr))
923 { ADD_ACTIVE(state_offset + 1, 0); }
924 break;
925
926 /*-----------------------------------------------------------------*/
927
928 case OP_DIGIT:
929 case OP_WHITESPACE:
930 case OP_WORDCHAR:
931 if (clen > 0 && c < 256 &&
932 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
933 { ADD_NEW(state_offset + 1, 0); }
934 break;
935
936 /*-----------------------------------------------------------------*/
937 case OP_NOT_DIGIT:
938 case OP_NOT_WHITESPACE:
939 case OP_NOT_WORDCHAR:
940 if (clen > 0 && (c >= 256 ||
941 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
942 { ADD_NEW(state_offset + 1, 0); }
943 break;
944
945 /*-----------------------------------------------------------------*/
946 case OP_WORD_BOUNDARY:
947 case OP_NOT_WORD_BOUNDARY:
948 {
949 int left_word, right_word;
950
951 if (ptr > start_subject)
952 {
953 PCRE2_SPTR temp = ptr - 1;
954 if (temp < mb->start_used_ptr) mb->start_used_ptr = temp;
955 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
956 if (utf) { BACKCHAR(temp); }
957 #endif
958 GETCHARTEST(d, temp);
959 #ifdef SUPPORT_UNICODE
960 if ((mb->poptions & PCRE2_UCP) != 0)
961 {
962 if (d == '_') left_word = TRUE; else
963 {
964 uint32_t cat = UCD_CATEGORY(d);
965 left_word = (cat == ucp_L || cat == ucp_N);
966 }
967 }
968 else
969 #endif
970 left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
971 }
972 else left_word = FALSE;
973
974 if (clen > 0)
975 {
976 if (ptr >= mb->last_used_ptr)
977 {
978 PCRE2_SPTR temp = ptr + 1;
979 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
980 if (utf) { FORWARDCHARTEST(temp, mb->end_subject); }
981 #endif
982 mb->last_used_ptr = temp;
983 }
984 #ifdef SUPPORT_UNICODE
985 if ((mb->poptions & PCRE2_UCP) != 0)
986 {
987 if (c == '_') right_word = TRUE; else
988 {
989 uint32_t cat = UCD_CATEGORY(c);
990 right_word = (cat == ucp_L || cat == ucp_N);
991 }
992 }
993 else
994 #endif
995 right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
996 }
997 else right_word = FALSE;
998
999 if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
1000 { ADD_ACTIVE(state_offset + 1, 0); }
1001 }
1002 break;
1003
1004
1005 /*-----------------------------------------------------------------*/
1006 /* Check the next character by Unicode property. We will get here only
1007 if the support is in the binary; otherwise a compile-time error occurs.
1008 */
1009
1010 #ifdef SUPPORT_UNICODE
1011 case OP_PROP:
1012 case OP_NOTPROP:
1013 if (clen > 0)
1014 {
1015 BOOL OK;
1016 const uint32_t *cp;
1017 const ucd_record * prop = GET_UCD(c);
1018 switch(code[1])
1019 {
1020 case PT_ANY:
1021 OK = TRUE;
1022 break;
1023
1024 case PT_LAMP:
1025 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1026 prop->chartype == ucp_Lt;
1027 break;
1028
1029 case PT_GC:
1030 OK = PRIV(ucp_gentype)[prop->chartype] == code[2];
1031 break;
1032
1033 case PT_PC:
1034 OK = prop->chartype == code[2];
1035 break;
1036
1037 case PT_SC:
1038 OK = prop->script == code[2];
1039 break;
1040
1041 /* These are specials for combination cases. */
1042
1043 case PT_ALNUM:
1044 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1045 PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1046 break;
1047
1048 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1049 which means that Perl space and POSIX space are now identical. PCRE
1050 was changed at release 8.34. */
1051
1052 case PT_SPACE: /* Perl space */
1053 case PT_PXSPACE: /* POSIX space */
1054 switch(c)
1055 {
1056 HSPACE_CASES:
1057 VSPACE_CASES:
1058 OK = TRUE;
1059 break;
1060
1061 default:
1062 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1063 break;
1064 }
1065 break;
1066
1067 case PT_WORD:
1068 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1069 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1070 c == CHAR_UNDERSCORE;
1071 break;
1072
1073 case PT_CLIST:
1074 cp = PRIV(ucd_caseless_sets) + code[2];
1075 for (;;)
1076 {
1077 if (c < *cp) { OK = FALSE; break; }
1078 if (c == *cp++) { OK = TRUE; break; }
1079 }
1080 break;
1081
1082 case PT_UCNC:
1083 OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1084 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1085 c >= 0xe000;
1086 break;
1087
1088 /* Should never occur, but keep compilers from grumbling. */
1089
1090 default:
1091 OK = codevalue != OP_PROP;
1092 break;
1093 }
1094
1095 if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
1096 }
1097 break;
1098 #endif
1099
1100
1101
1102 /* ========================================================================== */
1103 /* These opcodes likewise inspect the subject character, but have an
1104 argument that is not a data character. It is one of these opcodes:
1105 OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE,
1106 OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
1107
1108 case OP_TYPEPLUS:
1109 case OP_TYPEMINPLUS:
1110 case OP_TYPEPOSPLUS:
1111 count = current_state->count; /* Already matched */
1112 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1113 if (clen > 0)
1114 {
1115 if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1116 (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1117 NLBLOCK->nltype == NLTYPE_FIXED &&
1118 NLBLOCK->nllen == 2 &&
1119 c == NLBLOCK->nl[0])
1120 {
1121 could_continue = partial_newline = TRUE;
1122 }
1123 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1124 (c < 256 &&
1125 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1126 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1127 {
1128 if (count > 0 && codevalue == OP_TYPEPOSPLUS)
1129 {
1130 active_count--; /* Remove non-match possibility */
1131 next_active_state--;
1132 }
1133 count++;
1134 ADD_NEW(state_offset, count);
1135 }
1136 }
1137 break;
1138
1139 /*-----------------------------------------------------------------*/
1140 case OP_TYPEQUERY:
1141 case OP_TYPEMINQUERY:
1142 case OP_TYPEPOSQUERY:
1143 ADD_ACTIVE(state_offset + 2, 0);
1144 if (clen > 0)
1145 {
1146 if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1147 (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1148 NLBLOCK->nltype == NLTYPE_FIXED &&
1149 NLBLOCK->nllen == 2 &&
1150 c == NLBLOCK->nl[0])
1151 {
1152 could_continue = partial_newline = TRUE;
1153 }
1154 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1155 (c < 256 &&
1156 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1157 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1158 {
1159 if (codevalue == OP_TYPEPOSQUERY)
1160 {
1161 active_count--; /* Remove non-match possibility */
1162 next_active_state--;
1163 }
1164 ADD_NEW(state_offset + 2, 0);
1165 }
1166 }
1167 break;
1168
1169 /*-----------------------------------------------------------------*/
1170 case OP_TYPESTAR:
1171 case OP_TYPEMINSTAR:
1172 case OP_TYPEPOSSTAR:
1173 ADD_ACTIVE(state_offset + 2, 0);
1174 if (clen > 0)
1175 {
1176 if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1177 (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1178 NLBLOCK->nltype == NLTYPE_FIXED &&
1179 NLBLOCK->nllen == 2 &&
1180 c == NLBLOCK->nl[0])
1181 {
1182 could_continue = partial_newline = TRUE;
1183 }
1184 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1185 (c < 256 &&
1186 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1187 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1188 {
1189 if (codevalue == OP_TYPEPOSSTAR)
1190 {
1191 active_count--; /* Remove non-match possibility */
1192 next_active_state--;
1193 }
1194 ADD_NEW(state_offset, 0);
1195 }
1196 }
1197 break;
1198
1199 /*-----------------------------------------------------------------*/
1200 case OP_TYPEEXACT:
1201 count = current_state->count; /* Number already matched */
1202 if (clen > 0)
1203 {
1204 if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1205 (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1206 NLBLOCK->nltype == NLTYPE_FIXED &&
1207 NLBLOCK->nllen == 2 &&
1208 c == NLBLOCK->nl[0])
1209 {
1210 could_continue = partial_newline = TRUE;
1211 }
1212 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1213 (c < 256 &&
1214 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1215 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1216 {
1217 if (++count >= (int)GET2(code, 1))
1218 { ADD_NEW(state_offset + 1 + IMM2_SIZE + 1, 0); }
1219 else
1220 { ADD_NEW(state_offset, count); }
1221 }
1222 }
1223 break;
1224
1225 /*-----------------------------------------------------------------*/
1226 case OP_TYPEUPTO:
1227 case OP_TYPEMINUPTO:
1228 case OP_TYPEPOSUPTO:
1229 ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0);
1230 count = current_state->count; /* Number already matched */
1231 if (clen > 0)
1232 {
1233 if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1234 (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1235 NLBLOCK->nltype == NLTYPE_FIXED &&
1236 NLBLOCK->nllen == 2 &&
1237 c == NLBLOCK->nl[0])
1238 {
1239 could_continue = partial_newline = TRUE;
1240 }
1241 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1242 (c < 256 &&
1243 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1244 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1245 {
1246 if (codevalue == OP_TYPEPOSUPTO)
1247 {
1248 active_count--; /* Remove non-match possibility */
1249 next_active_state--;
1250 }
1251 if (++count >= (int)GET2(code, 1))
1252 { ADD_NEW(state_offset + 2 + IMM2_SIZE, 0); }
1253 else
1254 { ADD_NEW(state_offset, count); }
1255 }
1256 }
1257 break;
1258
1259 /* ========================================================================== */
1260 /* These are virtual opcodes that are used when something like
1261 OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, OP_EXTUNI, or a horizontal
1262 or vertical whitespace opcode as its argument. It keeps the code above
1263 fast for the other cases. The argument is in the d variable. */
1264
1265 #ifdef SUPPORT_UNICODE
1266 case OP_PROP_EXTRA + OP_TYPEPLUS:
1267 case OP_PROP_EXTRA + OP_TYPEMINPLUS:
1268 case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
1269 count = current_state->count; /* Already matched */
1270 if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
1271 if (clen > 0)
1272 {
1273 BOOL OK;
1274 const uint32_t *cp;
1275 const ucd_record * prop = GET_UCD(c);
1276 switch(code[2])
1277 {
1278 case PT_ANY:
1279 OK = TRUE;
1280 break;
1281
1282 case PT_LAMP:
1283 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1284 prop->chartype == ucp_Lt;
1285 break;
1286
1287 case PT_GC:
1288 OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1289 break;
1290
1291 case PT_PC:
1292 OK = prop->chartype == code[3];
1293 break;
1294
1295 case PT_SC:
1296 OK = prop->script == code[3];
1297 break;
1298
1299 /* These are specials for combination cases. */
1300
1301 case PT_ALNUM:
1302 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1303 PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1304 break;
1305
1306 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1307 which means that Perl space and POSIX space are now identical. PCRE
1308 was changed at release 8.34. */
1309
1310 case PT_SPACE: /* Perl space */
1311 case PT_PXSPACE: /* POSIX space */
1312 switch(c)
1313 {
1314 HSPACE_CASES:
1315 VSPACE_CASES:
1316 OK = TRUE;
1317 break;
1318
1319 default:
1320 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1321 break;
1322 }
1323 break;
1324
1325 case PT_WORD:
1326 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1327 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1328 c == CHAR_UNDERSCORE;
1329 break;
1330
1331 case PT_CLIST:
1332 cp = PRIV(ucd_caseless_sets) + code[3];
1333 for (;;)
1334 {
1335 if (c < *cp) { OK = FALSE; break; }
1336 if (c == *cp++) { OK = TRUE; break; }
1337 }
1338 break;
1339
1340 case PT_UCNC:
1341 OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1342 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1343 c >= 0xe000;
1344 break;
1345
1346 /* Should never occur, but keep compilers from grumbling. */
1347
1348 default:
1349 OK = codevalue != OP_PROP;
1350 break;
1351 }
1352
1353 if (OK == (d == OP_PROP))
1354 {
1355 if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1356 {
1357 active_count--; /* Remove non-match possibility */
1358 next_active_state--;
1359 }
1360 count++;
1361 ADD_NEW(state_offset, count);
1362 }
1363 }
1364 break;
1365
1366 /*-----------------------------------------------------------------*/
1367 case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1368 case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1369 case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1370 count = current_state->count; /* Already matched */
1371 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1372 if (clen > 0)
1373 {
1374 uint32_t lgb, rgb;
1375 PCRE2_SPTR nptr = ptr + clen;
1376 int ncount = 0;
1377 if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1378 {
1379 active_count--; /* Remove non-match possibility */
1380 next_active_state--;
1381 }
1382 lgb = UCD_GRAPHBREAK(c);
1383 while (nptr < end_subject)
1384 {
1385 dlen = 1;
1386 if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
1387 rgb = UCD_GRAPHBREAK(d);
1388 if ((PRIV(ucp_gbtable)[lgb] & (1u << rgb)) == 0) break;
1389 ncount++;
1390 lgb = rgb;
1391 nptr += dlen;
1392 }
1393 count++;
1394 ADD_NEW_DATA(-state_offset, count, ncount);
1395 }
1396 break;
1397 #endif
1398
1399 /*-----------------------------------------------------------------*/
1400 case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1401 case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1402 case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1403 count = current_state->count; /* Already matched */
1404 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1405 if (clen > 0)
1406 {
1407 int ncount = 0;
1408 switch (c)
1409 {
1410 case CHAR_VT:
1411 case CHAR_FF:
1412 case CHAR_NEL:
1413 #ifndef EBCDIC
1414 case 0x2028:
1415 case 0x2029:
1416 #endif /* Not EBCDIC */
1417 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
1418 goto ANYNL01;
1419
1420 case CHAR_CR:
1421 if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
1422 /* Fall through */
1423
1424 ANYNL01:
1425 case CHAR_LF:
1426 if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1427 {
1428 active_count--; /* Remove non-match possibility */
1429 next_active_state--;
1430 }
1431 count++;
1432 ADD_NEW_DATA(-state_offset, count, ncount);
1433 break;
1434
1435 default:
1436 break;
1437 }
1438 }
1439 break;
1440
1441 /*-----------------------------------------------------------------*/
1442 case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1443 case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1444 case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1445 count = current_state->count; /* Already matched */
1446 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1447 if (clen > 0)
1448 {
1449 BOOL OK;
1450 switch (c)
1451 {
1452 VSPACE_CASES:
1453 OK = TRUE;
1454 break;
1455
1456 default:
1457 OK = FALSE;
1458 break;
1459 }
1460
1461 if (OK == (d == OP_VSPACE))
1462 {
1463 if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1464 {
1465 active_count--; /* Remove non-match possibility */
1466 next_active_state--;
1467 }
1468 count++;
1469 ADD_NEW_DATA(-state_offset, count, 0);
1470 }
1471 }
1472 break;
1473
1474 /*-----------------------------------------------------------------*/
1475 case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1476 case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1477 case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1478 count = current_state->count; /* Already matched */
1479 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1480 if (clen > 0)
1481 {
1482 BOOL OK;
1483 switch (c)
1484 {
1485 HSPACE_CASES:
1486 OK = TRUE;
1487 break;
1488
1489 default:
1490 OK = FALSE;
1491 break;
1492 }
1493
1494 if (OK == (d == OP_HSPACE))
1495 {
1496 if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1497 {
1498 active_count--; /* Remove non-match possibility */
1499 next_active_state--;
1500 }
1501 count++;
1502 ADD_NEW_DATA(-state_offset, count, 0);
1503 }
1504 }
1505 break;
1506
1507 /*-----------------------------------------------------------------*/
1508 #ifdef SUPPORT_UNICODE
1509 case OP_PROP_EXTRA + OP_TYPEQUERY:
1510 case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1511 case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1512 count = 4;
1513 goto QS1;
1514
1515 case OP_PROP_EXTRA + OP_TYPESTAR:
1516 case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1517 case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1518 count = 0;
1519
1520 QS1:
1521
1522 ADD_ACTIVE(state_offset + 4, 0);
1523 if (clen > 0)
1524 {
1525 BOOL OK;
1526 const uint32_t *cp;
1527 const ucd_record * prop = GET_UCD(c);
1528 switch(code[2])
1529 {
1530 case PT_ANY:
1531 OK = TRUE;
1532 break;
1533
1534 case PT_LAMP:
1535 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1536 prop->chartype == ucp_Lt;
1537 break;
1538
1539 case PT_GC:
1540 OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1541 break;
1542
1543 case PT_PC:
1544 OK = prop->chartype == code[3];
1545 break;
1546
1547 case PT_SC:
1548 OK = prop->script == code[3];
1549 break;
1550
1551 /* These are specials for combination cases. */
1552
1553 case PT_ALNUM:
1554 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1555 PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1556 break;
1557
1558 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1559 which means that Perl space and POSIX space are now identical. PCRE
1560 was changed at release 8.34. */
1561
1562 case PT_SPACE: /* Perl space */
1563 case PT_PXSPACE: /* POSIX space */
1564 switch(c)
1565 {
1566 HSPACE_CASES:
1567 VSPACE_CASES:
1568 OK = TRUE;
1569 break;
1570
1571 default:
1572 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1573 break;
1574 }
1575 break;
1576
1577 case PT_WORD:
1578 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1579 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1580 c == CHAR_UNDERSCORE;
1581 break;
1582
1583 case PT_CLIST:
1584 cp = PRIV(ucd_caseless_sets) + code[3];
1585 for (;;)
1586 {
1587 if (c < *cp) { OK = FALSE; break; }
1588 if (c == *cp++) { OK = TRUE; break; }
1589 }
1590 break;
1591
1592 case PT_UCNC:
1593 OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1594 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1595 c >= 0xe000;
1596 break;
1597
1598 /* Should never occur, but keep compilers from grumbling. */
1599
1600 default:
1601 OK = codevalue != OP_PROP;
1602 break;
1603 }
1604
1605 if (OK == (d == OP_PROP))
1606 {
1607 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1608 codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1609 {
1610 active_count--; /* Remove non-match possibility */
1611 next_active_state--;
1612 }
1613 ADD_NEW(state_offset + count, 0);
1614 }
1615 }
1616 break;
1617
1618 /*-----------------------------------------------------------------*/
1619 case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1620 case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1621 case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1622 count = 2;
1623 goto QS2;
1624
1625 case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1626 case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1627 case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1628 count = 0;
1629
1630 QS2:
1631
1632 ADD_ACTIVE(state_offset + 2, 0);
1633 if (clen > 0)
1634 {
1635 uint32_t lgb, rgb;
1636 PCRE2_SPTR nptr = ptr + clen;
1637 int ncount = 0;
1638 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1639 codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1640 {
1641 active_count--; /* Remove non-match possibility */
1642 next_active_state--;
1643 }
1644 lgb = UCD_GRAPHBREAK(c);
1645 while (nptr < end_subject)
1646 {
1647 dlen = 1;
1648 if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
1649 rgb = UCD_GRAPHBREAK(d);
1650 if ((PRIV(ucp_gbtable)[lgb] & (1u << rgb)) == 0) break;
1651 ncount++;
1652 lgb = rgb;
1653 nptr += dlen;
1654 }
1655 ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1656 }
1657 break;
1658 #endif
1659
1660 /*-----------------------------------------------------------------*/
1661 case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1662 case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1663 case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1664 count = 2;
1665 goto QS3;
1666
1667 case OP_ANYNL_EXTRA + OP_TYPESTAR:
1668 case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1669 case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1670 count = 0;
1671
1672 QS3:
1673 ADD_ACTIVE(state_offset + 2, 0);
1674 if (clen > 0)
1675 {
1676 int ncount = 0;
1677 switch (c)
1678 {
1679 case CHAR_VT:
1680 case CHAR_FF:
1681 case CHAR_NEL:
1682 #ifndef EBCDIC
1683 case 0x2028:
1684 case 0x2029:
1685 #endif /* Not EBCDIC */
1686 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
1687 goto ANYNL02;
1688
1689 case CHAR_CR:
1690 if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
1691 /* Fall through */
1692
1693 ANYNL02:
1694 case CHAR_LF:
1695 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1696 codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1697 {
1698 active_count--; /* Remove non-match possibility */
1699 next_active_state--;
1700 }
1701 ADD_NEW_DATA(-(state_offset + (int)count), 0, ncount);
1702 break;
1703
1704 default:
1705 break;
1706 }
1707 }
1708 break;
1709
1710 /*-----------------------------------------------------------------*/
1711 case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1712 case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1713 case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1714 count = 2;
1715 goto QS4;
1716
1717 case OP_VSPACE_EXTRA + OP_TYPESTAR:
1718 case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1719 case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1720 count = 0;
1721
1722 QS4:
1723 ADD_ACTIVE(state_offset + 2, 0);
1724 if (clen > 0)
1725 {
1726 BOOL OK;
1727 switch (c)
1728 {
1729 VSPACE_CASES:
1730 OK = TRUE;
1731 break;
1732
1733 default:
1734 OK = FALSE;
1735 break;
1736 }
1737 if (OK == (d == OP_VSPACE))
1738 {
1739 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1740 codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1741 {
1742 active_count--; /* Remove non-match possibility */
1743 next_active_state--;
1744 }
1745 ADD_NEW_DATA(-(state_offset + (int)count), 0, 0);
1746 }
1747 }
1748 break;
1749
1750 /*-----------------------------------------------------------------*/
1751 case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1752 case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1753 case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1754 count = 2;
1755 goto QS5;
1756
1757 case OP_HSPACE_EXTRA + OP_TYPESTAR:
1758 case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1759 case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1760 count = 0;
1761
1762 QS5:
1763 ADD_ACTIVE(state_offset + 2, 0);
1764 if (clen > 0)
1765 {
1766 BOOL OK;
1767 switch (c)
1768 {
1769 HSPACE_CASES:
1770 OK = TRUE;
1771 break;
1772
1773 default:
1774 OK = FALSE;
1775 break;
1776 }
1777
1778 if (OK == (d == OP_HSPACE))
1779 {
1780 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1781 codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1782 {
1783 active_count--; /* Remove non-match possibility */
1784 next_active_state--;
1785 }
1786 ADD_NEW_DATA(-(state_offset + (int)count), 0, 0);
1787 }
1788 }
1789 break;
1790
1791 /*-----------------------------------------------------------------*/
1792 #ifdef SUPPORT_UNICODE
1793 case OP_PROP_EXTRA + OP_TYPEEXACT:
1794 case OP_PROP_EXTRA + OP_TYPEUPTO:
1795 case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1796 case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1797 if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1798 { ADD_ACTIVE(state_offset + 1 + IMM2_SIZE + 3, 0); }
1799 count = current_state->count; /* Number already matched */
1800 if (clen > 0)
1801 {
1802 BOOL OK;
1803 const uint32_t *cp;
1804 const ucd_record * prop = GET_UCD(c);
1805 switch(code[1 + IMM2_SIZE + 1])
1806 {
1807 case PT_ANY:
1808 OK = TRUE;
1809 break;
1810
1811 case PT_LAMP:
1812 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1813 prop->chartype == ucp_Lt;
1814 break;
1815
1816 case PT_GC:
1817 OK = PRIV(ucp_gentype)[prop->chartype] == code[1 + IMM2_SIZE + 2];
1818 break;
1819
1820 case PT_PC:
1821 OK = prop->chartype == code[1 + IMM2_SIZE + 2];
1822 break;
1823
1824 case PT_SC:
1825 OK = prop->script == code[1 + IMM2_SIZE + 2];
1826 break;
1827
1828 /* These are specials for combination cases. */
1829
1830 case PT_ALNUM:
1831 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1832 PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1833 break;
1834
1835 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1836 which means that Perl space and POSIX space are now identical. PCRE
1837 was changed at release 8.34. */
1838
1839 case PT_SPACE: /* Perl space */
1840 case PT_PXSPACE: /* POSIX space */
1841 switch(c)
1842 {
1843 HSPACE_CASES:
1844 VSPACE_CASES:
1845 OK = TRUE;
1846 break;
1847
1848 default:
1849 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1850 break;
1851 }
1852 break;
1853
1854 case PT_WORD:
1855 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1856 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1857 c == CHAR_UNDERSCORE;
1858 break;
1859
1860 case PT_CLIST:
1861 cp = PRIV(ucd_caseless_sets) + code[1 + IMM2_SIZE + 2];
1862 for (;;)
1863 {
1864 if (c < *cp) { OK = FALSE; break; }
1865 if (c == *cp++) { OK = TRUE; break; }
1866 }
1867 break;
1868
1869 case PT_UCNC:
1870 OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1871 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1872 c >= 0xe000;
1873 break;
1874
1875 /* Should never occur, but keep compilers from grumbling. */
1876
1877 default:
1878 OK = codevalue != OP_PROP;
1879 break;
1880 }
1881
1882 if (OK == (d == OP_PROP))
1883 {
1884 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1885 {
1886 active_count--; /* Remove non-match possibility */
1887 next_active_state--;
1888 }
1889 if (++count >= (int)GET2(code, 1))
1890 { ADD_NEW(state_offset + 1 + IMM2_SIZE + 3, 0); }
1891 else
1892 { ADD_NEW(state_offset, count); }
1893 }
1894 }
1895 break;
1896
1897 /*-----------------------------------------------------------------*/
1898 case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1899 case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1900 case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1901 case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1902 if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1903 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1904 count = current_state->count; /* Number already matched */
1905 if (clen > 0)
1906 {
1907 uint32_t lgb, rgb;
1908 PCRE2_SPTR nptr = ptr + clen;
1909 int ncount = 0;
1910 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1911 {
1912 active_count--; /* Remove non-match possibility */
1913 next_active_state--;
1914 }
1915 lgb = UCD_GRAPHBREAK(c);
1916 while (nptr < end_subject)
1917 {
1918 dlen = 1;
1919 if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
1920 rgb = UCD_GRAPHBREAK(d);
1921 if ((PRIV(ucp_gbtable)[lgb] & (1u << rgb)) == 0) break;
1922 ncount++;
1923 lgb = rgb;
1924 nptr += dlen;
1925 }
1926 if (nptr >= end_subject && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1927 reset_could_continue = TRUE;
1928 if (++count >= (int)GET2(code, 1))
1929 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
1930 else
1931 { ADD_NEW_DATA(-state_offset, count, ncount); }
1932 }
1933 break;
1934 #endif
1935
1936 /*-----------------------------------------------------------------*/
1937 case OP_ANYNL_EXTRA + OP_TYPEEXACT:
1938 case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1939 case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1940 case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1941 if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1942 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1943 count = current_state->count; /* Number already matched */
1944 if (clen > 0)
1945 {
1946 int ncount = 0;
1947 switch (c)
1948 {
1949 case CHAR_VT:
1950 case CHAR_FF:
1951 case CHAR_NEL:
1952 #ifndef EBCDIC
1953 case 0x2028:
1954 case 0x2029:
1955 #endif /* Not EBCDIC */
1956 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
1957 goto ANYNL03;
1958
1959 case CHAR_CR:
1960 if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
1961 /* Fall through */
1962
1963 ANYNL03:
1964 case CHAR_LF:
1965 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1966 {
1967 active_count--; /* Remove non-match possibility */
1968 next_active_state--;
1969 }
1970 if (++count >= (int)GET2(code, 1))
1971 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
1972 else
1973 { ADD_NEW_DATA(-state_offset, count, ncount); }
1974 break;
1975
1976 default:
1977 break;
1978 }
1979 }
1980 break;
1981
1982 /*-----------------------------------------------------------------*/
1983 case OP_VSPACE_EXTRA + OP_TYPEEXACT:
1984 case OP_VSPACE_EXTRA + OP_TYPEUPTO:
1985 case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
1986 case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
1987 if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
1988 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1989 count = current_state->count; /* Number already matched */
1990 if (clen > 0)
1991 {
1992 BOOL OK;
1993 switch (c)
1994 {
1995 VSPACE_CASES:
1996 OK = TRUE;
1997 break;
1998
1999 default:
2000 OK = FALSE;
2001 }
2002
2003 if (OK == (d == OP_VSPACE))
2004 {
2005 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
2006 {
2007 active_count--; /* Remove non-match possibility */
2008 next_active_state--;
2009 }
2010 if (++count >= (int)GET2(code, 1))
2011 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2012 else
2013 { ADD_NEW_DATA(-state_offset, count, 0); }
2014 }
2015 }
2016 break;
2017
2018 /*-----------------------------------------------------------------*/
2019 case OP_HSPACE_EXTRA + OP_TYPEEXACT:
2020 case OP_HSPACE_EXTRA + OP_TYPEUPTO:
2021 case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
2022 case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
2023 if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
2024 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2025 count = current_state->count; /* Number already matched */
2026 if (clen > 0)
2027 {
2028 BOOL OK;
2029 switch (c)
2030 {
2031 HSPACE_CASES:
2032 OK = TRUE;
2033 break;
2034
2035 default:
2036 OK = FALSE;
2037 break;
2038 }
2039
2040 if (OK == (d == OP_HSPACE))
2041 {
2042 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
2043 {
2044 active_count--; /* Remove non-match possibility */
2045 next_active_state--;
2046 }
2047 if (++count >= (int)GET2(code, 1))
2048 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2049 else
2050 { ADD_NEW_DATA(-state_offset, count, 0); }
2051 }
2052 }
2053 break;
2054
2055 /* ========================================================================== */
2056 /* These opcodes are followed by a character that is usually compared
2057 to the current subject character; it is loaded into d. We still get
2058 here even if there is no subject character, because in some cases zero
2059 repetitions are permitted. */
2060
2061 /*-----------------------------------------------------------------*/
2062 case OP_CHAR:
2063 if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
2064 break;
2065
2066 /*-----------------------------------------------------------------*/
2067 case OP_CHARI:
2068 if (clen == 0) break;
2069
2070 #ifdef SUPPORT_UNICODE
2071 if (utf)
2072 {
2073 if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
2074 {
2075 unsigned int othercase;
2076 if (c < 128)
2077 othercase = fcc[c];
2078 else
2079 othercase = UCD_OTHERCASE(c);
2080 if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
2081 }
2082 }
2083 else
2084 #endif /* SUPPORT_UNICODE */
2085 /* Not UTF mode */
2086 {
2087 if (TABLE_GET(c, lcc, c) == TABLE_GET(d, lcc, d))
2088 { ADD_NEW(state_offset + 2, 0); }
2089 }
2090 break;
2091
2092
2093 #ifdef SUPPORT_UNICODE
2094 /*-----------------------------------------------------------------*/
2095 /* This is a tricky one because it can match more than one character.
2096 Find out how many characters to skip, and then set up a negative state
2097 to wait for them to pass before continuing. */
2098
2099 case OP_EXTUNI:
2100 if (clen > 0)
2101 {
2102 uint32_t lgb, rgb;
2103 PCRE2_SPTR nptr = ptr + clen;
2104 int ncount = 0;
2105 lgb = UCD_GRAPHBREAK(c);
2106 while (nptr < end_subject)
2107 {
2108 dlen = 1;
2109 if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
2110 rgb = UCD_GRAPHBREAK(d);
2111 if ((PRIV(ucp_gbtable)[lgb] & (1u << rgb)) == 0) break;
2112 ncount++;
2113 lgb = rgb;
2114 nptr += dlen;
2115 }
2116 if (nptr >= end_subject && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
2117 reset_could_continue = TRUE;
2118 ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
2119 }
2120 break;
2121 #endif
2122
2123 /*-----------------------------------------------------------------*/
2124 /* This is tricky, like EXTUNI, because it too can match more than one
2125 character (when CR is followed by LF). In this case, set up a negative
2126 state to wait for one character to pass before continuing. */
2127
2128 case OP_ANYNL:
2129 if (clen > 0) switch(c)
2130 {
2131 case CHAR_VT:
2132 case CHAR_FF:
2133 case CHAR_NEL:
2134 #ifndef EBCDIC
2135 case 0x2028:
2136 case 0x2029:
2137 #endif /* Not EBCDIC */
2138 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
2139
2140 case CHAR_LF:
2141 ADD_NEW(state_offset + 1, 0);
2142 break;
2143
2144 case CHAR_CR:
2145 if (ptr + 1 >= end_subject)
2146 {
2147 ADD_NEW(state_offset + 1, 0);
2148 if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
2149 reset_could_continue = TRUE;
2150 }
2151 else if (UCHAR21TEST(ptr + 1) == CHAR_LF)
2152 {
2153 ADD_NEW_DATA(-(state_offset + 1), 0, 1);
2154 }
2155 else
2156 {
2157 ADD_NEW(state_offset + 1, 0);
2158 }
2159 break;
2160 }
2161 break;
2162
2163 /*-----------------------------------------------------------------*/
2164 case OP_NOT_VSPACE:
2165 if (clen > 0) switch(c)
2166 {
2167 VSPACE_CASES:
2168 break;
2169
2170 default:
2171 ADD_NEW(state_offset + 1, 0);
2172 break;
2173 }
2174 break;
2175
2176 /*-----------------------------------------------------------------*/
2177 case OP_VSPACE:
2178 if (clen > 0) switch(c)
2179 {
2180 VSPACE_CASES:
2181 ADD_NEW(state_offset + 1, 0);
2182 break;
2183
2184 default:
2185 break;
2186 }
2187 break;
2188
2189 /*-----------------------------------------------------------------*/
2190 case OP_NOT_HSPACE:
2191 if (clen > 0) switch(c)
2192 {
2193 HSPACE_CASES:
2194 break;
2195
2196 default:
2197 ADD_NEW(state_offset + 1, 0);
2198 break;
2199 }
2200 break;
2201
2202 /*-----------------------------------------------------------------*/
2203 case OP_HSPACE:
2204 if (clen > 0) switch(c)
2205 {
2206 HSPACE_CASES:
2207 ADD_NEW(state_offset + 1, 0);
2208 break;
2209
2210 default:
2211 break;
2212 }
2213 break;
2214
2215 /*-----------------------------------------------------------------*/
2216 /* Match a negated single character casefully. */
2217
2218 case OP_NOT:
2219 if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); }
2220 break;
2221
2222 /*-----------------------------------------------------------------*/
2223 /* Match a negated single character caselessly. */
2224
2225 case OP_NOTI:
2226 if (clen > 0)
2227 {
2228 unsigned int otherd;
2229 #ifdef SUPPORT_UNICODE
2230 if (utf && d >= 128)
2231 otherd = UCD_OTHERCASE(d);
2232 else
2233 #endif /* SUPPORT_UNICODE */
2234 otherd = TABLE_GET(d, fcc, d);
2235 if (c != d && c != otherd)
2236 { ADD_NEW(state_offset + dlen + 1, 0); }
2237 }
2238 break;
2239
2240 /*-----------------------------------------------------------------*/
2241 case OP_PLUSI:
2242 case OP_MINPLUSI:
2243 case OP_POSPLUSI:
2244 case OP_NOTPLUSI:
2245 case OP_NOTMINPLUSI:
2246 case OP_NOTPOSPLUSI:
2247 caseless = TRUE;
2248 codevalue -= OP_STARI - OP_STAR;
2249
2250 /* Fall through */
2251 case OP_PLUS:
2252 case OP_MINPLUS:
2253 case OP_POSPLUS:
2254 case OP_NOTPLUS:
2255 case OP_NOTMINPLUS:
2256 case OP_NOTPOSPLUS:
2257 count = current_state->count; /* Already matched */
2258 if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
2259 if (clen > 0)
2260 {
2261 uint32_t otherd = NOTACHAR;
2262 if (caseless)
2263 {
2264 #ifdef SUPPORT_UNICODE
2265 if (utf && d >= 128)
2266 otherd = UCD_OTHERCASE(d);
2267 else
2268 #endif /* SUPPORT_UNICODE */
2269 otherd = TABLE_GET(d, fcc, d);
2270 }
2271 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2272 {
2273 if (count > 0 &&
2274 (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
2275 {
2276 active_count--; /* Remove non-match possibility */
2277 next_active_state--;
2278 }
2279 count++;
2280 ADD_NEW(state_offset, count);
2281 }
2282 }
2283 break;
2284
2285 /*-----------------------------------------------------------------*/
2286 case OP_QUERYI:
2287 case OP_MINQUERYI:
2288 case OP_POSQUERYI:
2289 case OP_NOTQUERYI:
2290 case OP_NOTMINQUERYI:
2291 case OP_NOTPOSQUERYI:
2292 caseless = TRUE;
2293 codevalue -= OP_STARI - OP_STAR;
2294 /* Fall through */
2295 case OP_QUERY:
2296 case OP_MINQUERY:
2297 case OP_POSQUERY:
2298 case OP_NOTQUERY:
2299 case OP_NOTMINQUERY:
2300 case OP_NOTPOSQUERY:
2301 ADD_ACTIVE(state_offset + dlen + 1, 0);
2302 if (clen > 0)
2303 {
2304 uint32_t otherd = NOTACHAR;
2305 if (caseless)
2306 {
2307 #ifdef SUPPORT_UNICODE
2308 if (utf && d >= 128)
2309 otherd = UCD_OTHERCASE(d);
2310 else
2311 #endif /* SUPPORT_UNICODE */
2312 otherd = TABLE_GET(d, fcc, d);
2313 }
2314 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2315 {
2316 if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
2317 {
2318 active_count--; /* Remove non-match possibility */
2319 next_active_state--;
2320 }
2321 ADD_NEW(state_offset + dlen + 1, 0);
2322 }
2323 }
2324 break;
2325
2326 /*-----------------------------------------------------------------*/
2327 case OP_STARI:
2328 case OP_MINSTARI:
2329 case OP_POSSTARI:
2330 case OP_NOTSTARI:
2331 case OP_NOTMINSTARI:
2332 case OP_NOTPOSSTARI:
2333 caseless = TRUE;
2334 codevalue -= OP_STARI - OP_STAR;
2335 /* Fall through */
2336 case OP_STAR:
2337 case OP_MINSTAR:
2338 case OP_POSSTAR:
2339 case OP_NOTSTAR:
2340 case OP_NOTMINSTAR:
2341 case OP_NOTPOSSTAR:
2342 ADD_ACTIVE(state_offset + dlen + 1, 0);
2343 if (clen > 0)
2344 {
2345 uint32_t otherd = NOTACHAR;
2346 if (caseless)
2347 {
2348 #ifdef SUPPORT_UNICODE
2349 if (utf && d >= 128)
2350 otherd = UCD_OTHERCASE(d);
2351 else
2352 #endif /* SUPPORT_UNICODE */
2353 otherd = TABLE_GET(d, fcc, d);
2354 }
2355 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2356 {
2357 if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
2358 {
2359 active_count--; /* Remove non-match possibility */
2360 next_active_state--;
2361 }
2362 ADD_NEW(state_offset, 0);
2363 }
2364 }
2365 break;
2366
2367 /*-----------------------------------------------------------------*/
2368 case OP_EXACTI:
2369 case OP_NOTEXACTI:
2370 caseless = TRUE;
2371 codevalue -= OP_STARI - OP_STAR;
2372 /* Fall through */
2373 case OP_EXACT:
2374 case OP_NOTEXACT:
2375 count = current_state->count; /* Number already matched */
2376 if (clen > 0)
2377 {
2378 uint32_t otherd = NOTACHAR;
2379 if (caseless)
2380 {
2381 #ifdef SUPPORT_UNICODE
2382 if (utf && d >= 128)
2383 otherd = UCD_OTHERCASE(d);
2384 else
2385 #endif /* SUPPORT_UNICODE */
2386 otherd = TABLE_GET(d, fcc, d);
2387 }
2388 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2389 {
2390 if (++count >= (int)GET2(code, 1))
2391 { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2392 else
2393 { ADD_NEW(state_offset, count); }
2394 }
2395 }
2396 break;
2397
2398 /*-----------------------------------------------------------------*/
2399 case OP_UPTOI:
2400 case OP_MINUPTOI:
2401 case OP_POSUPTOI:
2402 case OP_NOTUPTOI:
2403 case OP_NOTMINUPTOI:
2404 case OP_NOTPOSUPTOI:
2405 caseless = TRUE;
2406 codevalue -= OP_STARI - OP_STAR;
2407 /* Fall through */
2408 case OP_UPTO:
2409 case OP_MINUPTO:
2410 case OP_POSUPTO:
2411 case OP_NOTUPTO:
2412 case OP_NOTMINUPTO:
2413 case OP_NOTPOSUPTO:
2414 ADD_ACTIVE(state_offset + dlen + 1 + IMM2_SIZE, 0);
2415 count = current_state->count; /* Number already matched */
2416 if (clen > 0)
2417 {
2418 uint32_t otherd = NOTACHAR;
2419 if (caseless)
2420 {
2421 #ifdef SUPPORT_UNICODE
2422 if (utf && d >= 128)
2423 otherd = UCD_OTHERCASE(d);
2424 else
2425 #endif /* SUPPORT_UNICODE */
2426 otherd = TABLE_GET(d, fcc, d);
2427 }
2428 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2429 {
2430 if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2431 {
2432 active_count--; /* Remove non-match possibility */
2433 next_active_state--;
2434 }
2435 if (++count >= (int)GET2(code, 1))
2436 { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2437 else
2438 { ADD_NEW(state_offset, count); }
2439 }
2440 }
2441 break;
2442
2443
2444 /* ========================================================================== */
2445 /* These are the class-handling opcodes */
2446
2447 case OP_CLASS:
2448 case OP_NCLASS:
2449 case OP_XCLASS:
2450 {
2451 BOOL isinclass = FALSE;
2452 int next_state_offset;
2453 PCRE2_SPTR ecode;
2454
2455 /* For a simple class, there is always just a 32-byte table, and we
2456 can set isinclass from it. */
2457
2458 if (codevalue != OP_XCLASS)
2459 {
2460 ecode = code + 1 + (32 / sizeof(PCRE2_UCHAR));
2461 if (clen > 0)
2462 {
2463 isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2464 ((((uint8_t *)(code + 1))[c/8] & (1 << (c&7))) != 0);
2465 }
2466 }
2467
2468 /* An extended class may have a table or a list of single characters,
2469 ranges, or both, and it may be positive or negative. There's a
2470 function that sorts all this out. */
2471
2472 else
2473 {
2474 ecode = code + GET(code, 1);
2475 if (clen > 0) isinclass = PRIV(xclass)(c, code + 1 + LINK_SIZE, utf);
2476 }
2477
2478 /* At this point, isinclass is set for all kinds of class, and ecode
2479 points to the byte after the end of the class. If there is a
2480 quantifier, this is where it will be. */
2481
2482 next_state_offset = (int)(ecode - start_code);
2483
2484 switch (*ecode)
2485 {
2486 case OP_CRSTAR:
2487 case OP_CRMINSTAR:
2488 case OP_CRPOSSTAR:
2489 ADD_ACTIVE(next_state_offset + 1, 0);
2490 if (isinclass)
2491 {
2492 if (*ecode == OP_CRPOSSTAR)
2493 {
2494 active_count--; /* Remove non-match possibility */
2495 next_active_state--;
2496 }
2497 ADD_NEW(state_offset, 0);
2498 }
2499 break;
2500
2501 case OP_CRPLUS:
2502 case OP_CRMINPLUS:
2503 case OP_CRPOSPLUS:
2504 count = current_state->count; /* Already matched */
2505 if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2506 if (isinclass)
2507 {
2508 if (count > 0 && *ecode == OP_CRPOSPLUS)
2509 {
2510 active_count--; /* Remove non-match possibility */
2511 next_active_state--;
2512 }
2513 count++;
2514 ADD_NEW(state_offset, count);
2515 }
2516 break;
2517
2518 case OP_CRQUERY:
2519 case OP_CRMINQUERY:
2520 case OP_CRPOSQUERY:
2521 ADD_ACTIVE(next_state_offset + 1, 0);
2522 if (isinclass)
2523 {
2524 if (*ecode == OP_CRPOSQUERY)
2525 {
2526 active_count--; /* Remove non-match possibility */
2527 next_active_state--;
2528 }
2529 ADD_NEW(next_state_offset + 1, 0);
2530 }
2531 break;
2532
2533 case OP_CRRANGE:
2534 case OP_CRMINRANGE:
2535 case OP_CRPOSRANGE:
2536 count = current_state->count; /* Already matched */
2537 if (count >= (int)GET2(ecode, 1))
2538 { ADD_ACTIVE(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2539 if (isinclass)
2540 {
2541 int max = (int)GET2(ecode, 1 + IMM2_SIZE);
2542 if (*ecode == OP_CRPOSRANGE)
2543 {
2544 active_count--; /* Remove non-match possibility */
2545 next_active_state--;
2546 }
2547 if (++count >= max && max != 0) /* Max 0 => no limit */
2548 { ADD_NEW(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2549 else
2550 { ADD_NEW(state_offset, count); }
2551 }
2552 break;
2553
2554 default:
2555 if (isinclass) { ADD_NEW(next_state_offset, 0); }
2556 break;
2557 }
2558 }
2559 break;
2560
2561 /* ========================================================================== */
2562 /* These are the opcodes for fancy brackets of various kinds. We have
2563 to use recursion in order to handle them. The "always failing" assertion
2564 (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2565 though the other "backtracking verbs" are not supported. */
2566
2567 case OP_FAIL:
2568 forced_fail++; /* Count FAILs for multiple states */
2569 break;
2570
2571 case OP_ASSERT:
2572 case OP_ASSERT_NOT:
2573 case OP_ASSERTBACK:
2574 case OP_ASSERTBACK_NOT:
2575 {
2576 PCRE2_SPTR endasscode = code + GET(code, 1);
2577 PCRE2_SIZE local_offsets[2];
2578 int rc;
2579 int local_workspace[1000];
2580
2581 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2582
2583 rc = internal_dfa_match(
2584 mb, /* static match data */
2585 code, /* this subexpression's code */
2586 ptr, /* where we currently are */
2587 (PCRE2_SIZE)(ptr - start_subject), /* start offset */
2588 local_offsets, /* offset vector */
2589 sizeof(local_offsets)/sizeof(PCRE2_SIZE), /* size of same */
2590 local_workspace, /* workspace vector */
2591 sizeof(local_workspace)/sizeof(int), /* size of same */
2592 rlevel); /* function recursion level */
2593
2594 if (rc == PCRE2_ERROR_DFA_UITEM) return rc;
2595 if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2596 { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2597 }
2598 break;
2599
2600 /*-----------------------------------------------------------------*/
2601 case OP_COND:
2602 case OP_SCOND:
2603 {
2604 PCRE2_SIZE local_offsets[1000];
2605 int local_workspace[1000];
2606 int codelink = (int)GET(code, 1);
2607 PCRE2_UCHAR condcode;
2608
2609 /* Because of the way auto-callout works during compile, a callout item
2610 is inserted between OP_COND and an assertion condition. This does not
2611 happen for the other conditions. */
2612
2613 if (code[LINK_SIZE + 1] == OP_CALLOUT
2614 || code[LINK_SIZE + 1] == OP_CALLOUT_STR)
2615 {
2616 PCRE2_SIZE callout_length = (code[LINK_SIZE + 1] == OP_CALLOUT)?
2617 (PCRE2_SIZE)PRIV(OP_lengths)[OP_CALLOUT] :
2618 (PCRE2_SIZE)GET(code, 2 + 3*LINK_SIZE);
2619
2620 rrc = 0;
2621 if (mb->callout != NULL)
2622 {
2623 pcre2_callout_block cb;
2624 cb.version = 1;
2625 cb.capture_top = 1;
2626 cb.capture_last = 0;
2627 cb.offset_vector = offsets;
2628 cb.mark = NULL; /* No (*MARK) support */
2629 cb.subject = start_subject;
2630 cb.subject_length = (PCRE2_SIZE)(end_subject - start_subject);
2631 cb.start_match = (PCRE2_SIZE)(current_subject - start_subject);
2632 cb.current_position = (PCRE2_SIZE)(ptr - start_subject);
2633 cb.pattern_position = GET(code, LINK_SIZE + 2);
2634 cb.next_item_length = GET(code, LINK_SIZE + 2 + LINK_SIZE);
2635
2636 if (code[LINK_SIZE + 1] == OP_CALLOUT)
2637 {
2638 cb.callout_number = code[2 + 3*LINK_SIZE];
2639 cb.callout_string_offset = 0;
2640 cb.callout_string = NULL;
2641 cb.callout_string_length = 0;
2642 }
2643 else
2644 {
2645 cb.callout_number = 0;
2646 cb.callout_string_offset = GET(code, 2 + 4*LINK_SIZE);
2647 cb.callout_string = code + (2 + 5*LINK_SIZE) + 1;
2648 cb.callout_string_length =
2649 callout_length - (1 + 4*LINK_SIZE) - 2;
2650 }
2651
2652 if ((rrc = (mb->callout)(&cb, mb->callout_data)) < 0)
2653 return rrc; /* Abandon */
2654 }
2655 if (rrc > 0) break; /* Fail this thread */
2656 code += callout_length; /* Skip callout data */
2657 }
2658
2659 condcode = code[LINK_SIZE+1];
2660
2661 /* Back reference conditions and duplicate named recursion conditions
2662 are not supported */
2663
2664 if (condcode == OP_CREF || condcode == OP_DNCREF ||
2665 condcode == OP_DNRREF)
2666 return PCRE2_ERROR_DFA_UCOND;
2667
2668 /* The DEFINE condition is always false, and the assertion (?!) is
2669 converted to OP_FAIL. */
2670
2671 if (condcode == OP_FALSE || condcode == OP_FAIL)
2672 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2673
2674 /* There is also an always-true condition */
2675
2676 else if (condcode == OP_TRUE)
2677 { ADD_ACTIVE(state_offset + LINK_SIZE + 2 + IMM2_SIZE, 0); }
2678
2679 /* The only supported version of OP_RREF is for the value RREF_ANY,
2680 which means "test if in any recursion". We can't test for specifically
2681 recursed groups. */
2682
2683 else if (condcode == OP_RREF)
2684 {
2685 unsigned int value = GET2(code, LINK_SIZE + 2);
2686 if (value != RREF_ANY) return PCRE2_ERROR_DFA_UCOND;
2687 if (mb->recursive != NULL)
2688 { ADD_ACTIVE(state_offset + LINK_SIZE + 2 + IMM2_SIZE, 0); }
2689 else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2690 }
2691
2692 /* Otherwise, the condition is an assertion */
2693
2694 else
2695 {
2696 int rc;
2697 PCRE2_SPTR asscode = code + LINK_SIZE + 1;
2698 PCRE2_SPTR endasscode = asscode + GET(asscode, 1);
2699
2700 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2701
2702 rc = internal_dfa_match(
2703 mb, /* fixed match data */
2704 asscode, /* this subexpression's code */
2705 ptr, /* where we currently are */
2706 (PCRE2_SIZE)(ptr - start_subject), /* start offset */
2707 local_offsets, /* offset vector */
2708 sizeof(local_offsets)/sizeof(PCRE2_SIZE), /* size of same */
2709 local_workspace, /* workspace vector */
2710 sizeof(local_workspace)/sizeof(int), /* size of same */
2711 rlevel); /* function recursion level */
2712
2713 if (rc == PCRE2_ERROR_DFA_UITEM) return rc;
2714 if ((rc >= 0) ==
2715 (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2716 { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2717 else
2718 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2719 }
2720 }
2721 break;
2722
2723 /*-----------------------------------------------------------------*/
2724 case OP_RECURSE:
2725 {
2726 dfa_recursion_info *ri;
2727 PCRE2_SIZE local_offsets[1000];
2728 int local_workspace[1000];
2729 PCRE2_SPTR callpat = start_code + GET(code, 1);
2730 uint32_t recno = (callpat == mb->start_code)? 0 :
2731 GET2(callpat, 1 + LINK_SIZE);
2732 int rc;
2733
2734 /* Check for repeating a recursion without advancing the subject
2735 pointer. This should catch convoluted mutual recursions. (Some simple
2736 cases are caught at compile time.) */
2737
2738 for (ri = mb->recursive; ri != NULL; ri = ri->prevrec)
2739 if (recno == ri->group_num && ptr == ri->subject_position)
2740 return PCRE2_ERROR_RECURSELOOP;
2741
2742 /* Remember this recursion and where we started it so as to
2743 catch infinite loops. */
2744
2745 new_recursive.group_num = recno;
2746 new_recursive.subject_position = ptr;
2747 new_recursive.prevrec = mb->recursive;
2748 mb->recursive = &new_recursive;
2749
2750 rc = internal_dfa_match(
2751 mb, /* fixed match data */
2752 callpat, /* this subexpression's code */
2753 ptr, /* where we currently are */
2754 (PCRE2_SIZE)(ptr - start_subject), /* start offset */
2755 local_offsets, /* offset vector */
2756 sizeof(local_offsets)/sizeof(PCRE2_SIZE), /* size of same */
2757 local_workspace, /* workspace vector */
2758 sizeof(local_workspace)/sizeof(int), /* size of same */
2759 rlevel); /* function recursion level */
2760
2761 mb->recursive = new_recursive.prevrec; /* Done this recursion */
2762
2763 /* Ran out of internal offsets */
2764
2765 if (rc == 0) return PCRE2_ERROR_DFA_RECURSE;
2766
2767 /* For each successful matched substring, set up the next state with a
2768 count of characters to skip before trying it. Note that the count is in
2769 characters, not bytes. */
2770
2771 if (rc > 0)
2772 {
2773 for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2774 {
2775 PCRE2_SIZE charcount = local_offsets[rc+1] - local_offsets[rc];
2776 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
2777 if (utf)
2778 {
2779 PCRE2_SPTR p = start_subject + local_offsets[rc];
2780 PCRE2_SPTR pp = start_subject + local_offsets[rc+1];
2781 while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--;
2782 }
2783 #endif
2784 if (charcount > 0)
2785 {
2786 ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0,
2787 (int)(charcount - 1));
2788 }
2789 else
2790 {
2791 ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2792 }
2793 }
2794 }
2795 else if (rc != PCRE2_ERROR_NOMATCH) return rc;
2796 }
2797 break;
2798
2799 /*-----------------------------------------------------------------*/
2800 case OP_BRAPOS:
2801 case OP_SBRAPOS:
2802 case OP_CBRAPOS:
2803 case OP_SCBRAPOS:
2804 case OP_BRAPOSZERO:
2805 {
2806 PCRE2_SIZE charcount, matched_count;
2807 PCRE2_SPTR local_ptr = ptr;
2808 BOOL allow_zero;
2809
2810 if (codevalue == OP_BRAPOSZERO)
2811 {
2812 allow_zero = TRUE;
2813 codevalue = *(++code); /* Codevalue will be one of above BRAs */
2814 }
2815 else allow_zero = FALSE;
2816
2817 /* Loop to match the subpattern as many times as possible as if it were
2818 a complete pattern. */
2819
2820 for (matched_count = 0;; matched_count++)
2821 {
2822 PCRE2_SIZE local_offsets[2];
2823 int local_workspace[1000];
2824
2825 int rc = internal_dfa_match(
2826 mb, /* fixed match data */
2827 code, /* this subexpression's code */
2828 local_ptr, /* where we currently are */
2829 (PCRE2_SIZE)(ptr - start_subject), /* start offset */
2830 local_offsets, /* offset vector */
2831 sizeof(local_offsets)/sizeof(PCRE2_SIZE), /* size of same */
2832 local_workspace, /* workspace vector */
2833 sizeof(local_workspace)/sizeof(int), /* size of same */
2834 rlevel); /* function recursion level */
2835
2836 /* Failed to match */
2837
2838 if (rc < 0)
2839 {
2840 if (rc != PCRE2_ERROR_NOMATCH) return rc;
2841 break;
2842 }
2843
2844 /* Matched: break the loop if zero characters matched. */
2845
2846 charcount = local_offsets[1] - local_offsets[0];
2847 if (charcount == 0) break;
2848 local_ptr += charcount; /* Advance temporary position ptr */
2849 }
2850
2851 /* At this point we have matched the subpattern matched_count
2852 times, and local_ptr is pointing to the character after the end of the
2853 last match. */
2854
2855 if (matched_count > 0 || allow_zero)
2856 {
2857 PCRE2_SPTR end_subpattern = code;
2858 int next_state_offset;
2859
2860 do { end_subpattern += GET(end_subpattern, 1); }
2861 while (*end_subpattern == OP_ALT);
2862 next_state_offset =
2863 (int)(end_subpattern - start_code + LINK_SIZE + 1);
2864
2865 /* Optimization: if there are no more active states, and there
2866 are no new states yet set up, then skip over the subject string
2867 right here, to save looping. Otherwise, set up the new state to swing
2868 into action when the end of the matched substring is reached. */
2869
2870 if (i + 1 >= active_count && new_count == 0)
2871 {
2872 ptr = local_ptr;
2873 clen = 0;
2874 ADD_NEW(next_state_offset, 0);
2875 }
2876 else
2877 {
2878 PCRE2_SPTR p = ptr;
2879 PCRE2_SPTR pp = local_ptr;
2880 charcount = (PCRE2_SIZE)(pp - p);
2881 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
2882 if (utf) while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--;
2883 #endif
2884 ADD_NEW_DATA(-next_state_offset, 0, (int)(charcount - 1));
2885 }
2886 }
2887 }
2888 break;
2889
2890 /*-----------------------------------------------------------------*/
2891 case OP_ONCE:
2892 case OP_ONCE_NC:
2893 {
2894 PCRE2_SIZE local_offsets[2];
2895 int local_workspace[1000];
2896
2897 int rc = internal_dfa_match(
2898 mb, /* fixed match data */
2899 code, /* this subexpression's code */
2900 ptr, /* where we currently are */
2901 (PCRE2_SIZE)(ptr - start_subject), /* start offset */
2902 local_offsets, /* offset vector */
2903 sizeof(local_offsets)/sizeof(PCRE2_SIZE), /* size of same */
2904 local_workspace, /* workspace vector */
2905 sizeof(local_workspace)/sizeof(int), /* size of same */
2906 rlevel); /* function recursion level */
2907
2908 if (rc >= 0)
2909 {
2910 PCRE2_SPTR end_subpattern = code;
2911 PCRE2_SIZE charcount = local_offsets[1] - local_offsets[0];
2912 int next_state_offset, repeat_state_offset;
2913
2914 do { end_subpattern += GET(end_subpattern, 1); }
2915 while (*end_subpattern == OP_ALT);
2916 next_state_offset =
2917 (int)(end_subpattern - start_code + LINK_SIZE + 1);
2918
2919 /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2920 arrange for the repeat state also to be added to the relevant list.
2921 Calculate the offset, or set -1 for no repeat. */
2922
2923 repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2924 *end_subpattern == OP_KETRMIN)?
2925 (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1;
2926
2927 /* If we have matched an empty string, add the next state at the
2928 current character pointer. This is important so that the duplicate
2929 checking kicks in, which is what breaks infinite loops that match an
2930 empty string. */
2931
2932 if (charcount == 0)
2933 {
2934 ADD_ACTIVE(next_state_offset, 0);
2935 }
2936
2937 /* Optimization: if there are no more active states, and there
2938 are no new states yet set up, then skip over the subject string
2939 right here, to save looping. Otherwise, set up the new state to swing
2940 into action when the end of the matched substring is reached. */
2941
2942 else if (i + 1 >= active_count && new_count == 0)
2943 {
2944 ptr += charcount;
2945 clen = 0;
2946 ADD_NEW(next_state_offset, 0);
2947
2948 /* If we are adding a repeat state at the new character position,
2949 we must fudge things so that it is the only current state.
2950 Otherwise, it might be a duplicate of one we processed before, and
2951 that would cause it to be skipped. */
2952
2953 if (repeat_state_offset >= 0)
2954 {
2955 next_active_state = active_states;
2956 active_count = 0;
2957 i = -1;
2958 ADD_ACTIVE(repeat_state_offset, 0);
2959 }
2960 }
2961 else
2962 {
2963 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
2964 if (utf)
2965 {
2966 PCRE2_SPTR p = start_subject + local_offsets[0];
2967 PCRE2_SPTR pp = start_subject + local_offsets[1];
2968 while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--;
2969 }
2970 #endif
2971 ADD_NEW_DATA(-next_state_offset, 0, (int)(charcount - 1));
2972 if (repeat_state_offset >= 0)
2973 { ADD_NEW_DATA(-repeat_state_offset, 0, (int)(charcount - 1)); }
2974 }
2975 }
2976 else if (rc != PCRE2_ERROR_NOMATCH) return rc;
2977 }
2978 break;
2979
2980
2981 /* ========================================================================== */
2982 /* Handle callouts */
2983
2984 case OP_CALLOUT:
2985 case OP_CALLOUT_STR:
2986 {
2987 unsigned int callout_length = (*code == OP_CALLOUT)
2988 ? PRIV(OP_lengths)[OP_CALLOUT] : GET(code, 1 + 2*LINK_SIZE);
2989 rrc = 0;
2990
2991 if (mb->callout != NULL)
2992 {
2993 pcre2_callout_block cb;
2994 cb.version = 1;
2995 cb.capture_top = 1;
2996 cb.capture_last = 0;
2997 cb.offset_vector = offsets;
2998 cb.mark = NULL; /* No (*MARK) support */
2999 cb.subject = start_subject;
3000 cb.subject_length = (PCRE2_SIZE)(end_subject - start_subject);
3001 cb.start_match = (PCRE2_SIZE)(current_subject - start_subject);
3002 cb.current_position = (PCRE2_SIZE)(ptr - start_subject);
3003 cb.pattern_position = GET(code, 1);
3004 cb.next_item_length = GET(code, 1 + LINK_SIZE);
3005
3006 if (*code == OP_CALLOUT)
3007 {
3008 cb.callout_number = code[1 + 2*LINK_SIZE];
3009 cb.callout_string_offset = 0;
3010 cb.callout_string = NULL;
3011 cb.callout_string_length = 0;
3012 }
3013 else
3014 {
3015 cb.callout_number = 0;
3016 cb.callout_string_offset = GET(code, 1 + 3*LINK_SIZE);
3017 cb.callout_string = code + (1 + 4*LINK_SIZE) + 1;
3018 cb.callout_string_length =
3019 callout_length - (1 + 4*LINK_SIZE) - 2;
3020 }
3021
3022 if ((rrc = (mb->callout)(&cb, mb->callout_data)) < 0)
3023 return rrc; /* Abandon */
3024 }
3025 if (rrc == 0)
3026 { ADD_ACTIVE(state_offset + (int)callout_length, 0); }
3027 }
3028 break;
3029
3030
3031 /* ========================================================================== */
3032 default: /* Unsupported opcode */
3033 return PCRE2_ERROR_DFA_UITEM;
3034 }
3035
3036 NEXT_ACTIVE_STATE: continue;
3037
3038 } /* End of loop scanning active states */
3039
3040 /* We have finished the processing at the current subject character. If no
3041 new states have been set for the next character, we have found all the
3042 matches that we are going to find. If we are at the top level and partial
3043 matching has been requested, check for appropriate conditions.
3044
3045 The "forced_fail" variable counts the number of (*F) encountered for the
3046 character. If it is equal to the original active_count (saved in
3047 workspace[1]) it means that (*F) was found on every active state. In this
3048 case we don't want to give a partial match.
3049
3050 The "could_continue" variable is true if a state could have continued but
3051 for the fact that the end of the subject was reached. */
3052
3053 if (new_count <= 0)
3054 {
3055 if (rlevel == 1 && /* Top level, and */
3056 could_continue && /* Some could go on, and */
3057 forced_fail != workspace[1] && /* Not all forced fail & */
3058 ( /* either... */
3059 (mb->moptions & PCRE2_PARTIAL_HARD) != 0 /* Hard partial */
3060 || /* or... */
3061 ((mb->moptions & PCRE2_PARTIAL_SOFT) != 0 && /* Soft partial and */
3062 match_count < 0) /* no matches */
3063 ) && /* And... */
3064 (
3065 partial_newline || /* Either partial NL */
3066 ( /* or ... */
3067 ptr >= end_subject && /* End of subject and */
3068 ptr > mb->start_used_ptr) /* Inspected non-empty string */
3069 )
3070 )
3071 match_count = PCRE2_ERROR_PARTIAL;
3072 break; /* In effect, "return", but see the comment below */
3073 }
3074
3075 /* One or more states are active for the next character. */
3076
3077 ptr += clen; /* Advance to next subject character */
3078 } /* Loop to move along the subject string */
3079
3080 /* Control gets here from "break" a few lines above. We do it this way because
3081 if we use "return" above, we have compiler trouble. Some compilers warn if
3082 there's nothing here because they think the function doesn't return a value. On
3083 the other hand, if we put a dummy statement here, some more clever compilers
3084 complain that it can't be reached. Sigh. */
3085
3086 return match_count;
3087 }
3088
3089
3090
3091 /*************************************************
3092 * Match a pattern using the DFA algorithm *
3093 *************************************************/
3094
3095 /* This function matches a compiled pattern to a subject string, using the
3096 alternate matching algorithm that finds all matches at once.
3097
3098 Arguments:
3099 code points to the compiled pattern
3100 subject subject string
3101 length length of subject string
3102 start_offset where to start matching in the subject
3103 options option bits
3104 match_data points to a match data structure
3105 mcontext points to a match context
3106 workspace pointer to workspace
3107 wscount size of workspace
3108
3109 Returns: > 0 => number of match offset pairs placed in offsets
3110 = 0 => offsets overflowed; longest matches are present
3111 -1 => failed to match
3112 < -1 => some kind of unexpected problem
3113 */
3114
3115 PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
pcre2_dfa_match(const pcre2_code * code,PCRE2_SPTR subject,PCRE2_SIZE length,PCRE2_SIZE start_offset,uint32_t options,pcre2_match_data * match_data,pcre2_match_context * mcontext,int * workspace,size_t wscount)3116 pcre2_dfa_match(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length,
3117 PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data,
3118 pcre2_match_context *mcontext, int *workspace, size_t wscount)
3119 {
3120 const pcre2_real_code *re = (const pcre2_real_code *)code;
3121
3122 PCRE2_SPTR start_match;
3123 PCRE2_SPTR end_subject;
3124 PCRE2_SPTR bumpalong_limit;
3125 PCRE2_SPTR req_cu_ptr;
3126
3127 BOOL utf, anchored, startline, firstline;
3128
3129 BOOL has_first_cu = FALSE;
3130 BOOL has_req_cu = FALSE;
3131 PCRE2_UCHAR first_cu = 0;
3132 PCRE2_UCHAR first_cu2 = 0;
3133 PCRE2_UCHAR req_cu = 0;
3134 PCRE2_UCHAR req_cu2 = 0;
3135
3136 const uint8_t *start_bits = NULL;
3137
3138 /* We need to have mb pointing to a match block, because the IS_NEWLINE macro
3139 is used below, and it expects NLBLOCK to be defined as a pointer. */
3140
3141 dfa_match_block actual_match_block;
3142 dfa_match_block *mb = &actual_match_block;
3143
3144 /* A length equal to PCRE2_ZERO_TERMINATED implies a zero-terminated
3145 subject string. */
3146
3147 if (length == PCRE2_ZERO_TERMINATED) length = PRIV(strlen)(subject);
3148
3149 /* Plausibility checks */
3150
3151 if ((options & ~PUBLIC_DFA_MATCH_OPTIONS) != 0) return PCRE2_ERROR_BADOPTION;
3152 if (re == NULL || subject == NULL || workspace == NULL || match_data == NULL)
3153 return PCRE2_ERROR_NULL;
3154 if (wscount < 20) return PCRE2_ERROR_DFA_WSSIZE;
3155 if (start_offset > length) return PCRE2_ERROR_BADOFFSET;
3156
3157 /* Check that the first field in the block is the magic number. If it is not,
3158 return with PCRE2_ERROR_BADMAGIC. */
3159
3160 if (re->magic_number != MAGIC_NUMBER) return PCRE2_ERROR_BADMAGIC;
3161
3162 /* Check the code unit width. */
3163
3164 if ((re->flags & PCRE2_MODE_MASK) != PCRE2_CODE_UNIT_WIDTH/8)
3165 return PCRE2_ERROR_BADMODE;
3166
3167 /* PCRE2_NOTEMPTY and PCRE2_NOTEMPTY_ATSTART are match-time flags in the
3168 options variable for this function. Users of PCRE2 who are not calling the
3169 function directly would like to have a way of setting these flags, in the same
3170 way that they can set pcre2_compile() flags like PCRE2_NO_AUTOPOSSESS with
3171 constructions like (*NO_AUTOPOSSESS). To enable this, (*NOTEMPTY) and
3172 (*NOTEMPTY_ATSTART) set bits in the pattern's "flag" function which can now be
3173 transferred to the options for this function. The bits are guaranteed to be
3174 adjacent, but do not have the same values. This bit of Boolean trickery assumes
3175 that the match-time bits are not more significant than the flag bits. If by
3176 accident this is not the case, a compile-time division by zero error will
3177 occur. */
3178
3179 #define FF (PCRE2_NOTEMPTY_SET|PCRE2_NE_ATST_SET)
3180 #define OO (PCRE2_NOTEMPTY|PCRE2_NOTEMPTY_ATSTART)
3181 options |= (re->flags & FF) / ((FF & (~FF+1)) / (OO & (~OO+1)));
3182 #undef FF
3183 #undef OO
3184
3185 /* If restarting after a partial match, do some sanity checks on the contents
3186 of the workspace. */
3187
3188 if ((options & PCRE2_DFA_RESTART) != 0)
3189 {
3190 if ((workspace[0] & (-2)) != 0 || workspace[1] < 1 ||
3191 workspace[1] > (int)((wscount - 2)/INTS_PER_STATEBLOCK))
3192 return PCRE2_ERROR_DFA_BADRESTART;
3193 }
3194
3195 /* Set some local values */
3196
3197 utf = (re->overall_options & PCRE2_UTF) != 0;
3198 start_match = subject + start_offset;
3199 end_subject = subject + length;
3200 req_cu_ptr = start_match - 1;
3201 anchored = (options & (PCRE2_ANCHORED|PCRE2_DFA_RESTART)) != 0 ||
3202 (re->overall_options & PCRE2_ANCHORED) != 0;
3203
3204 /* The "must be at the start of a line" flags are used in a loop when finding
3205 where to start. */
3206
3207 startline = (re->flags & PCRE2_STARTLINE) != 0;
3208 firstline = (re->overall_options & PCRE2_FIRSTLINE) != 0;
3209 bumpalong_limit = end_subject;
3210
3211 /* Get data from the match context, if present, and fill in the fields in the
3212 match block. It is an error to set an offset limit without setting the flag at
3213 compile time. */
3214
3215 if (mcontext == NULL)
3216 {
3217 mb->callout = NULL;
3218 mb->memctl = re->memctl;
3219 }
3220 else
3221 {
3222 if (mcontext->offset_limit != PCRE2_UNSET)
3223 {
3224 if ((re->overall_options & PCRE2_USE_OFFSET_LIMIT) == 0)
3225 return PCRE2_ERROR_BADOFFSETLIMIT;
3226 bumpalong_limit = subject + mcontext->offset_limit;
3227 }
3228 mb->callout = mcontext->callout;
3229 mb->callout_data = mcontext->callout_data;
3230 mb->memctl = mcontext->memctl;
3231 }
3232
3233 mb->start_code = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)) +
3234 re->name_count * re->name_entry_size;
3235 mb->tables = re->tables;
3236 mb->start_subject = subject;
3237 mb->end_subject = end_subject;
3238 mb->start_offset = start_offset;
3239 mb->moptions = options;
3240 mb->poptions = re->overall_options;
3241
3242 /* Process the \R and newline settings. */
3243
3244 mb->bsr_convention = re->bsr_convention;
3245 mb->nltype = NLTYPE_FIXED;
3246 switch(re->newline_convention)
3247 {
3248 case PCRE2_NEWLINE_CR:
3249 mb->nllen = 1;
3250 mb->nl[0] = CHAR_CR;
3251 break;
3252
3253 case PCRE2_NEWLINE_LF:
3254 mb->nllen = 1;
3255 mb->nl[0] = CHAR_NL;
3256 break;
3257
3258 case PCRE2_NEWLINE_CRLF:
3259 mb->nllen = 2;
3260 mb->nl[0] = CHAR_CR;
3261 mb->nl[1] = CHAR_NL;
3262 break;
3263
3264 case PCRE2_NEWLINE_ANY:
3265 mb->nltype = NLTYPE_ANY;
3266 break;
3267
3268 case PCRE2_NEWLINE_ANYCRLF:
3269 mb->nltype = NLTYPE_ANYCRLF;
3270 break;
3271
3272 default: return PCRE2_ERROR_INTERNAL;
3273 }
3274
3275 /* Check a UTF string for validity if required. For 8-bit and 16-bit strings,
3276 we must also check that a starting offset does not point into the middle of a
3277 multiunit character. We check only the portion of the subject that is going to
3278 be inspected during matching - from the offset minus the maximum back reference
3279 to the given length. This saves time when a small part of a large subject is
3280 being matched by the use of a starting offset. Note that the maximum lookbehind
3281 is a number of characters, not code units. */
3282
3283 #ifdef SUPPORT_UNICODE
3284 if (utf && (options & PCRE2_NO_UTF_CHECK) == 0)
3285 {
3286 PCRE2_SPTR check_subject = start_match; /* start_match includes offset */
3287
3288 if (start_offset > 0)
3289 {
3290 #if PCRE2_CODE_UNIT_WIDTH != 32
3291 unsigned int i;
3292 if (start_match < end_subject && NOT_FIRSTCU(*start_match))
3293 return PCRE2_ERROR_BADUTFOFFSET;
3294 for (i = re->max_lookbehind; i > 0 && check_subject > subject; i--)
3295 {
3296 check_subject--;
3297 while (check_subject > subject &&
3298 #if PCRE2_CODE_UNIT_WIDTH == 8
3299 (*check_subject & 0xc0) == 0x80)
3300 #else /* 16-bit */
3301 (*check_subject & 0xfc00) == 0xdc00)
3302 #endif /* PCRE2_CODE_UNIT_WIDTH == 8 */
3303 check_subject--;
3304 }
3305 #else /* In the 32-bit library, one code unit equals one character. */
3306 check_subject -= re->max_lookbehind;
3307 if (check_subject < subject) check_subject = subject;
3308 #endif /* PCRE2_CODE_UNIT_WIDTH != 32 */
3309 }
3310
3311 /* Validate the relevant portion of the subject. After an error, adjust the
3312 offset to be an absolute offset in the whole string. */
3313
3314 match_data->rc = PRIV(valid_utf)(check_subject,
3315 length - (PCRE2_SIZE)(check_subject - subject), &(match_data->startchar));
3316 if (match_data->rc != 0)
3317 {
3318 match_data->startchar += (PCRE2_SIZE)(check_subject - subject);
3319 return match_data->rc;
3320 }
3321 }
3322 #endif /* SUPPORT_UNICODE */
3323
3324 /* Set up the first code unit to match, if available. The first_codeunit value
3325 is never set for an anchored regular expression, but the anchoring may be
3326 forced at run time, so we have to test for anchoring. The first code unit may
3327 be unset for an unanchored pattern, of course. If there's no first code unit
3328 there may be a bitmap of possible first characters. */
3329
3330 if (!anchored)
3331 {
3332 if ((re->flags & PCRE2_FIRSTSET) != 0)
3333 {
3334 has_first_cu = TRUE;
3335 first_cu = first_cu2 = (PCRE2_UCHAR)(re->first_codeunit);
3336 if ((re->flags & PCRE2_FIRSTCASELESS) != 0)
3337 {
3338 first_cu2 = TABLE_GET(first_cu, mb->tables + fcc_offset, first_cu);
3339 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
3340 if (utf && first_cu > 127)
3341 first_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(first_cu);
3342 #endif
3343 }
3344 }
3345 else
3346 if (!startline && (re->flags & PCRE2_FIRSTMAPSET) != 0)
3347 start_bits = re->start_bitmap;
3348 }
3349
3350 /* For anchored or unanchored matches, there may be a "last known required
3351 character" set. */
3352
3353 if ((re->flags & PCRE2_LASTSET) != 0)
3354 {
3355 has_req_cu = TRUE;
3356 req_cu = req_cu2 = (PCRE2_UCHAR)(re->last_codeunit);
3357 if ((re->flags & PCRE2_LASTCASELESS) != 0)
3358 {
3359 req_cu2 = TABLE_GET(req_cu, mb->tables + fcc_offset, req_cu);
3360 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
3361 if (utf && req_cu > 127) req_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(req_cu);
3362 #endif
3363 }
3364 }
3365
3366 /* Fill in fields that are always returned in the match data. */
3367
3368 match_data->code = re;
3369 match_data->subject = subject;
3370 match_data->mark = NULL;
3371 match_data->matchedby = PCRE2_MATCHEDBY_DFA_INTERPRETER;
3372
3373 /* Call the main matching function, looping for a non-anchored regex after a
3374 failed match. If not restarting, perform certain optimizations at the start of
3375 a match. */
3376
3377 for (;;)
3378 {
3379 int rc;
3380
3381 /* ----------------- Start of match optimizations ---------------- */
3382
3383 /* There are some optimizations that avoid running the match if a known
3384 starting point is not found, or if a known later code unit is not present.
3385 However, there is an option (settable at compile time) that disables
3386 these, for testing and for ensuring that all callouts do actually occur.
3387 The optimizations must also be avoided when restarting a DFA match. */
3388
3389 if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0 &&
3390 (options & PCRE2_DFA_RESTART) == 0)
3391 {
3392 PCRE2_SPTR save_end_subject = end_subject;
3393
3394 /* If firstline is TRUE, the start of the match is constrained to the first
3395 line of a multiline string. That is, the match must be before or at the
3396 first newline. Implement this by temporarily adjusting end_subject so that
3397 we stop the optimization scans at a newline. If the match fails at the
3398 newline, later code breaks this loop. */
3399
3400 if (firstline)
3401 {
3402 PCRE2_SPTR t = start_match;
3403 #ifdef SUPPORT_UNICODE
3404 if (utf)
3405 {
3406 while (t < mb->end_subject && !IS_NEWLINE(t))
3407 {
3408 t++;
3409 ACROSSCHAR(t < end_subject, *t, t++);
3410 }
3411 }
3412 else
3413 #endif
3414 while (t < mb->end_subject && !IS_NEWLINE(t)) t++;
3415 end_subject = t;
3416 }
3417
3418 /* Advance to a unique first code unit if there is one. */
3419
3420 if (has_first_cu)
3421 {
3422 PCRE2_UCHAR smc;
3423 if (first_cu != first_cu2)
3424 while (start_match < end_subject &&
3425 (smc = UCHAR21TEST(start_match)) != first_cu && smc != first_cu2)
3426 start_match++;
3427 else
3428 while (start_match < end_subject && UCHAR21TEST(start_match) != first_cu)
3429 start_match++;
3430 }
3431
3432 /* Or to just after a linebreak for a multiline match */
3433
3434 else if (startline)
3435 {
3436 if (start_match > mb->start_subject + start_offset)
3437 {
3438 #ifdef SUPPORT_UNICODE
3439 if (utf)
3440 {
3441 while (start_match < end_subject && !WAS_NEWLINE(start_match))
3442 {
3443 start_match++;
3444 ACROSSCHAR(start_match < end_subject, *start_match,
3445 start_match++);
3446 }
3447 }
3448 else
3449 #endif
3450 while (start_match < end_subject && !WAS_NEWLINE(start_match))
3451 start_match++;
3452
3453 /* If we have just passed a CR and the newline option is ANY or
3454 ANYCRLF, and we are now at a LF, advance the match position by one more
3455 code unit. */
3456
3457 if (start_match[-1] == CHAR_CR &&
3458 (mb->nltype == NLTYPE_ANY || mb->nltype == NLTYPE_ANYCRLF) &&
3459 start_match < end_subject &&
3460 UCHAR21TEST(start_match) == CHAR_NL)
3461 start_match++;
3462 }
3463 }
3464
3465 /* Or to a non-unique first code unit if any have been identified. The
3466 bitmap contains only 256 bits. When code units are 16 or 32 bits wide, all
3467 code units greater than 254 set the 255 bit. */
3468
3469 else if (start_bits != NULL)
3470 {
3471 while (start_match < end_subject)
3472 {
3473 register uint32_t c = UCHAR21TEST(start_match);
3474 #if PCRE2_CODE_UNIT_WIDTH != 8
3475 if (c > 255) c = 255;
3476 #endif
3477 if ((start_bits[c/8] & (1 << (c&7))) != 0) break;
3478 start_match++;
3479 }
3480 }
3481
3482 /* Restore fudged end_subject */
3483
3484 end_subject = save_end_subject;
3485
3486 /* The following two optimizations are disabled for partial matching. */
3487
3488 if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0)
3489 {
3490 /* The minimum matching length is a lower bound; no actual string of that
3491 length may actually match the pattern. Although the value is, strictly,
3492 in characters, we treat it as code units to avoid spending too much time
3493 in this optimization. */
3494
3495 if (end_subject - start_match < re->minlength) return PCRE2_ERROR_NOMATCH;
3496
3497 /* If req_cu is set, we know that that code unit must appear in the
3498 subject for the match to succeed. If the first code unit is set, req_cu
3499 must be later in the subject; otherwise the test starts at the match
3500 point. This optimization can save a huge amount of backtracking in
3501 patterns with nested unlimited repeats that aren't going to match.
3502 Writing separate code for cased/caseless versions makes it go faster, as
3503 does using an autoincrement and backing off on a match.
3504
3505 HOWEVER: when the subject string is very, very long, searching to its end
3506 can take a long time, and give bad performance on quite ordinary
3507 patterns. This showed up when somebody was matching something like
3508 /^\d+C/ on a 32-megabyte string... so we don't do this when the string is
3509 sufficiently long. */
3510
3511 if (has_req_cu && end_subject - start_match < REQ_CU_MAX)
3512 {
3513 register PCRE2_SPTR p = start_match + (has_first_cu? 1:0);
3514
3515 /* We don't need to repeat the search if we haven't yet reached the
3516 place we found it at last time. */
3517
3518 if (p > req_cu_ptr)
3519 {
3520 if (req_cu != req_cu2)
3521 {
3522 while (p < end_subject)
3523 {
3524 register uint32_t pp = UCHAR21INCTEST(p);
3525 if (pp == req_cu || pp == req_cu2) { p--; break; }
3526 }
3527 }
3528 else
3529 {
3530 while (p < end_subject)
3531 {
3532 if (UCHAR21INCTEST(p) == req_cu) { p--; break; }
3533 }
3534 }
3535
3536 /* If we can't find the required code unit, break the matching loop,
3537 forcing a match failure. */
3538
3539 if (p >= end_subject) break;
3540
3541 /* If we have found the required code unit, save the point where we
3542 found it, so that we don't search again next time round the loop if
3543 the start hasn't passed this code unit yet. */
3544
3545 req_cu_ptr = p;
3546 }
3547 }
3548 }
3549 }
3550
3551 /* ------------ End of start of match optimizations ------------ */
3552
3553 /* Give no match if we have passed the bumpalong limit. */
3554
3555 if (start_match > bumpalong_limit) break;
3556
3557 /* OK, now we can do the business */
3558
3559 mb->start_used_ptr = start_match;
3560 mb->last_used_ptr = start_match;
3561 mb->recursive = NULL;
3562
3563 rc = internal_dfa_match(
3564 mb, /* fixed match data */
3565 mb->start_code, /* this subexpression's code */
3566 start_match, /* where we currently are */
3567 start_offset, /* start offset in subject */
3568 match_data->ovector, /* offset vector */
3569 (uint32_t)match_data->oveccount * 2, /* actual size of same */
3570 workspace, /* workspace vector */
3571 (int)wscount, /* size of same */
3572 0); /* function recurse level */
3573
3574 /* Anything other than "no match" means we are done, always; otherwise, carry
3575 on only if not anchored. */
3576
3577 if (rc != PCRE2_ERROR_NOMATCH || anchored)
3578 {
3579 if (rc == PCRE2_ERROR_PARTIAL && match_data->oveccount > 0)
3580 {
3581 match_data->ovector[0] = (PCRE2_SIZE)(start_match - subject);
3582 match_data->ovector[1] = (PCRE2_SIZE)(end_subject - subject);
3583 }
3584 match_data->leftchar = (PCRE2_SIZE)(mb->start_used_ptr - subject);
3585 match_data->rightchar = (PCRE2_SIZE)( mb->last_used_ptr - subject);
3586 match_data->startchar = (PCRE2_SIZE)(start_match - subject);
3587 match_data->rc = rc;
3588 return rc;
3589 }
3590
3591 /* Advance to the next subject character unless we are at the end of a line
3592 and firstline is set. */
3593
3594 if (firstline && IS_NEWLINE(start_match)) break;
3595 start_match++;
3596 #ifdef SUPPORT_UNICODE
3597 if (utf)
3598 {
3599 ACROSSCHAR(start_match < end_subject, *start_match,
3600 start_match++);
3601 }
3602 #endif
3603 if (start_match > end_subject) break;
3604
3605 /* If we have just passed a CR and we are now at a LF, and the pattern does
3606 not contain any explicit matches for \r or \n, and the newline option is CRLF
3607 or ANY or ANYCRLF, advance the match position by one more character. */
3608
3609 if (UCHAR21TEST(start_match - 1) == CHAR_CR &&
3610 start_match < end_subject &&
3611 UCHAR21TEST(start_match) == CHAR_NL &&
3612 (re->flags & PCRE2_HASCRORLF) == 0 &&
3613 (mb->nltype == NLTYPE_ANY ||
3614 mb->nltype == NLTYPE_ANYCRLF ||
3615 mb->nllen == 2))
3616 start_match++;
3617
3618 } /* "Bumpalong" loop */
3619
3620
3621 return PCRE2_ERROR_NOMATCH;
3622 }
3623
3624 /* End of pcre2_dfa_match.c */
3625