1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Original API code Copyright (c) 1997-2012 University of Cambridge
10 New API code Copyright (c) 2016-2018 University of Cambridge
11
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
15
16 * Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
18
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
22
23 * Neither the name of the University of Cambridge nor the names of its
24 contributors may be used to endorse or promote products derived from
25 this software without specific prior written permission.
26
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
39 */
40
41
42 /* This module contains the external function pcre2_dfa_match(), which is an
43 alternative matching function that uses a sort of DFA algorithm (not a true
44 FSM). This is NOT Perl-compatible, but it has advantages in certain
45 applications. */
46
47
48 /* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
49 the performance of his patterns greatly. I could not use it as it stood, as it
50 was not thread safe, and made assumptions about pattern sizes. Also, it caused
51 test 7 to loop, and test 9 to crash with a segfault.
52
53 The issue is the check for duplicate states, which is done by a simple linear
54 search up the state list. (Grep for "duplicate" below to find the code.) For
55 many patterns, there will never be many states active at one time, so a simple
56 linear search is fine. In patterns that have many active states, it might be a
57 bottleneck. The suggested code used an indexing scheme to remember which states
58 had previously been used for each character, and avoided the linear search when
59 it knew there was no chance of a duplicate. This was implemented when adding
60 states to the state lists.
61
62 I wrote some thread-safe, not-limited code to try something similar at the time
63 of checking for duplicates (instead of when adding states), using index vectors
64 on the stack. It did give a 13% improvement with one specially constructed
65 pattern for certain subject strings, but on other strings and on many of the
66 simpler patterns in the test suite it did worse. The major problem, I think,
67 was the extra time to initialize the index. This had to be done for each call
68 of internal_dfa_match(). (The supplied patch used a static vector, initialized
69 only once - I suspect this was the cause of the problems with the tests.)
70
71 Overall, I concluded that the gains in some cases did not outweigh the losses
72 in others, so I abandoned this code. */
73
74
75 #ifdef HAVE_CONFIG_H
76 #include "config.h"
77 #endif
78
79 #define NLBLOCK mb /* Block containing newline information */
80 #define PSSTART start_subject /* Field containing processed string start */
81 #define PSEND end_subject /* Field containing processed string end */
82
83 #include "pcre2_internal.h"
84
/* Bit mask formed by OR-ing together the match-time option bits listed below.
NOTE(review): the name suggests this is the set of options that the public
pcre2_dfa_match() function accepts; the code that tests against this mask is
not in this part of the file - confirm there. */

#define PUBLIC_DFA_MATCH_OPTIONS \
  (PCRE2_ANCHORED|PCRE2_ENDANCHORED|PCRE2_NOTBOL|PCRE2_NOTEOL|PCRE2_NOTEMPTY| \
   PCRE2_NOTEMPTY_ATSTART|PCRE2_NO_UTF_CHECK|PCRE2_PARTIAL_HARD| \
   PCRE2_PARTIAL_SOFT|PCRE2_DFA_SHORTEST|PCRE2_DFA_RESTART)
89
90
91 /*************************************************
92 * Code parameters and static tables *
93 *************************************************/
94
95 /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
96 into others, under special conditions. A gap of 20 between the blocks should be
97 enough. The resulting opcodes don't have to be less than 256 because they are
98 never stored, so we push them well clear of the normal opcodes. */
99
/* Each offset is added to the opcode value in internal_dfa_match() when the
argument of a type-repeat opcode (OP_TYPESTAR onwards) is the item named in
the comment, so that the main switch can dispatch on a distinct value. */

#define OP_PROP_EXTRA       300   /* Argument is OP_PROP or OP_NOTPROP */
#define OP_EXTUNI_EXTRA     320   /* Argument is OP_EXTUNI */
#define OP_ANYNL_EXTRA      340   /* Argument is OP_ANYNL */
#define OP_HSPACE_EXTRA     360   /* Argument is OP_HSPACE or OP_NOT_HSPACE */
#define OP_VSPACE_EXTRA     380   /* Argument is OP_VSPACE or OP_NOT_VSPACE */
105
106
107 /* This table identifies those opcodes that are followed immediately by a
108 character that is to be tested in some way. This makes it possible to
109 centralize the loading of these characters. In the case of Type * etc, the
110 "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
111 small value. Non-zero values in the table are the offsets from the opcode where
112 the character is to be found. ***NOTE*** If the start of this table is
113 modified, the three tables that follow must also be modified. */
114
/* Indexed by opcode value. A zero entry means the opcode has no inline
character or type argument to load; a non-zero entry is the offset from the
opcode at which internal_dfa_match() reads it. The table length is checked
against OP_TABLE_LENGTH at compile time by the dummy case labels at the top of
the main switch in internal_dfa_match(). */

static const uint8_t coptable[] = {
  0,                             /* End                                    */
  0, 0, 0, 0, 0,                 /* \A, \G, \K, \B, \b                     */
  0, 0, 0, 0, 0, 0,              /* \D, \d, \S, \s, \W, \w                 */
  0, 0, 0,                       /* Any, AllAny, Anybyte                   */
  0, 0,                          /* \P, \p                                 */
  0, 0, 0, 0, 0,                 /* \R, \H, \h, \V, \v                     */
  0,                             /* \X                                     */
  0, 0, 0, 0, 0, 0,              /* \Z, \z, $, $M, ^, ^M                   */
  1,                             /* Char                                   */
  1,                             /* Chari                                  */
  1,                             /* not                                    */
  1,                             /* noti                                   */
  /* Positive single-char repeats                                          */
  1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
  1+IMM2_SIZE, 1+IMM2_SIZE,      /* upto, minupto                          */
  1+IMM2_SIZE,                   /* exact                                  */
  1, 1, 1, 1+IMM2_SIZE,          /* *+, ++, ?+, upto+                      */
  1, 1, 1, 1, 1, 1,              /* *I, *?I, +I, +?I, ?I, ??I              */
  1+IMM2_SIZE, 1+IMM2_SIZE,      /* upto I, minupto I                      */
  1+IMM2_SIZE,                   /* exact I                                */
  1, 1, 1, 1+IMM2_SIZE,          /* *+I, ++I, ?+I, upto+I                  */
  /* Negative single-char repeats - only for chars < 256                   */
  1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
  1+IMM2_SIZE, 1+IMM2_SIZE,      /* NOT upto, minupto                      */
  1+IMM2_SIZE,                   /* NOT exact                              */
  1, 1, 1, 1+IMM2_SIZE,          /* NOT *+, ++, ?+, upto+                  */
  1, 1, 1, 1, 1, 1,              /* NOT *I, *?I, +I, +?I, ?I, ??I          */
  1+IMM2_SIZE, 1+IMM2_SIZE,      /* NOT upto I, minupto I                  */
  1+IMM2_SIZE,                   /* NOT exact I                            */
  1, 1, 1, 1+IMM2_SIZE,          /* NOT *+I, ++I, ?+I, upto+I              */
  /* Positive type repeats                                                 */
  1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
  1+IMM2_SIZE, 1+IMM2_SIZE,      /* Type upto, minupto                     */
  1+IMM2_SIZE,                   /* Type exact                             */
  1, 1, 1, 1+IMM2_SIZE,          /* Type *+, ++, ?+, upto+                 */
  /* Character class & ref repeats                                         */
  0, 0, 0, 0, 0, 0,              /* *, *?, +, +?, ?, ??                    */
  0, 0,                          /* CRRANGE, CRMINRANGE                    */
  0, 0, 0, 0,                    /* Possessive *+, ++, ?+, CRPOSRANGE      */
  0,                             /* CLASS                                  */
  0,                             /* NCLASS                                 */
  0,                             /* XCLASS - variable length               */
  0,                             /* REF                                    */
  0,                             /* REFI                                   */
  0,                             /* DNREF                                  */
  0,                             /* DNREFI                                 */
  0,                             /* RECURSE                                */
  0,                             /* CALLOUT                                */
  0,                             /* CALLOUT_STR                            */
  0,                             /* Alt                                    */
  0,                             /* Ket                                    */
  0,                             /* KetRmax                                */
  0,                             /* KetRmin                                */
  0,                             /* KetRpos                                */
  0,                             /* Reverse                                */
  0,                             /* Assert                                 */
  0,                             /* Assert not                             */
  0,                             /* Assert behind                          */
  0,                             /* Assert behind not                      */
  0,                             /* ONCE                                   */
  0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND       */
  0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND  */
  0, 0,                          /* CREF, DNCREF                           */
  0, 0,                          /* RREF, DNRREF                           */
  0, 0,                          /* FALSE, TRUE                            */
  0, 0, 0,                       /* BRAZERO, BRAMINZERO, BRAPOSZERO        */
  0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG                 */
  0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG         */
  0, 0,                          /* COMMIT, COMMIT_ARG                     */
  0, 0, 0,                       /* FAIL, ACCEPT, ASSERT_ACCEPT            */
  0, 0, 0                        /* CLOSE, SKIPZERO, DEFINE                */
};
188
189 /* This table identifies those opcodes that inspect a character. It is used to
190 remember the fact that a character could have been inspected when the end of
191 the subject is reached. ***NOTE*** If the start of this table is modified, the
192 two tables that follow must also be modified. */
193
/* Indexed by opcode value. When a state whose opcode has a non-zero entry is
processed at the end of the subject (no current character), the main loop in
internal_dfa_match() sets its could_continue flag. The table length is checked
against OP_TABLE_LENGTH at compile time by the dummy case labels at the top of
the main switch in internal_dfa_match(). */

static const uint8_t poptable[] = {
  0,                             /* End                                    */
  0, 0, 0, 1, 1,                 /* \A, \G, \K, \B, \b                     */
  1, 1, 1, 1, 1, 1,              /* \D, \d, \S, \s, \W, \w                 */
  1, 1, 1,                       /* Any, AllAny, Anybyte                   */
  1, 1,                          /* \P, \p                                 */
  1, 1, 1, 1, 1,                 /* \R, \H, \h, \V, \v                     */
  1,                             /* \X                                     */
  0, 0, 0, 0, 0, 0,              /* \Z, \z, $, $M, ^, ^M                   */
  1,                             /* Char                                   */
  1,                             /* Chari                                  */
  1,                             /* not                                    */
  1,                             /* noti                                   */
  /* Positive single-char repeats                                          */
  1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
  1, 1, 1,                       /* upto, minupto, exact                   */
  1, 1, 1, 1,                    /* *+, ++, ?+, upto+                      */
  1, 1, 1, 1, 1, 1,              /* *I, *?I, +I, +?I, ?I, ??I              */
  1, 1, 1,                       /* upto I, minupto I, exact I             */
  1, 1, 1, 1,                    /* *+I, ++I, ?+I, upto+I                  */
  /* Negative single-char repeats - only for chars < 256                   */
  1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
  1, 1, 1,                       /* NOT upto, minupto, exact               */
  1, 1, 1, 1,                    /* NOT *+, ++, ?+, upto+                  */
  1, 1, 1, 1, 1, 1,              /* NOT *I, *?I, +I, +?I, ?I, ??I          */
  1, 1, 1,                       /* NOT upto I, minupto I, exact I         */
  1, 1, 1, 1,                    /* NOT *+I, ++I, ?+I, upto+I              */
  /* Positive type repeats                                                 */
  1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
  1, 1, 1,                       /* Type upto, minupto, exact              */
  1, 1, 1, 1,                    /* Type *+, ++, ?+, upto+                 */
  /* Character class & ref repeats                                         */
  1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
  1, 1,                          /* CRRANGE, CRMINRANGE                    */
  1, 1, 1, 1,                    /* Possessive *+, ++, ?+, CRPOSRANGE      */
  1,                             /* CLASS                                  */
  1,                             /* NCLASS                                 */
  1,                             /* XCLASS - variable length               */
  0,                             /* REF                                    */
  0,                             /* REFI                                   */
  0,                             /* DNREF                                  */
  0,                             /* DNREFI                                 */
  0,                             /* RECURSE                                */
  0,                             /* CALLOUT                                */
  0,                             /* CALLOUT_STR                            */
  0,                             /* Alt                                    */
  0,                             /* Ket                                    */
  0,                             /* KetRmax                                */
  0,                             /* KetRmin                                */
  0,                             /* KetRpos                                */
  0,                             /* Reverse                                */
  0,                             /* Assert                                 */
  0,                             /* Assert not                             */
  0,                             /* Assert behind                          */
  0,                             /* Assert behind not                      */
  0,                             /* ONCE                                   */
  0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND       */
  0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND  */
  0, 0,                          /* CREF, DNCREF                           */
  0, 0,                          /* RREF, DNRREF                           */
  0, 0,                          /* FALSE, TRUE                            */
  0, 0, 0,                       /* BRAZERO, BRAMINZERO, BRAPOSZERO        */
  0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG                 */
  0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG         */
  0, 0,                          /* COMMIT, COMMIT_ARG                     */
  0, 0, 0,                       /* FAIL, ACCEPT, ASSERT_ACCEPT            */
  0, 0, 0                        /* CLOSE, SKIPZERO, DEFINE                */
};
262
263 /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
264 and \w */
265
/* The 14 entries line up with the first 14 entries of coptable and poptable:
End, \A, \G, \K, \B, \b, then \D, \d, \S, \s, \W, \w, then OP_ANY and
OP_ALLANY. Here both members of each negated/positive pair carry the same
ctype bit. NOTE(review): the code that reads this table is outside this part
of the file; presumably it is indexed by the type opcode - confirm there. */

static const uint8_t toptable1[] = {
  0, 0, 0, 0, 0, 0,              /* End, \A, \G, \K, \B, \b */
  ctype_digit, ctype_digit,      /* \D, \d */
  ctype_space, ctype_space,      /* \S, \s */
  ctype_word,  ctype_word,       /* \W, \w */
  0, 0                           /* OP_ANY, OP_ALLANY */
};
273
/* Same entry order as toptable1. Here only the first (negated) member of each
pair - \D, \S, \W - carries the ctype bit, and the OP_ANY and OP_ALLANY entries
are 1. NOTE(review): the code that combines this table with toptable1 is
outside this part of the file - confirm the exact test there. */

static const uint8_t toptable2[] = {
  0, 0, 0, 0, 0, 0,              /* End, \A, \G, \K, \B, \b */
  ctype_digit, 0,                /* \D, \d */
  ctype_space, 0,                /* \S, \s */
  ctype_word,  0,                /* \W, \w */
  1, 1                           /* OP_ANY, OP_ALLANY */
};
281
282
283 /* Structure for holding data about a particular state, which is in effect the
284 current data for an active path through the match tree. It must consist
285 entirely of ints because the working vector we are passed, and which we put
286 these structures in, is a vector of ints. */
287
/* One entry in the active or new state list. The lists are laid out directly
in the int workspace vector, so every member must be an int. */

typedef struct stateblock {
  int offset;                     /* Offset to opcode from the start of the
                                     compiled code. A negative value means
                                     "do not enter state -offset until the
                                     characters counted in data have been
                                     skipped". */
  int count;                      /* Count for repeats */
  int data;                       /* Some use extra data; for a negative
                                     offset it is the number of characters
                                     still to skip */
} stateblock;

/* Number of workspace ints occupied by one stateblock. */

#define INTS_PER_STATEBLOCK  (int)(sizeof(stateblock)/sizeof(int))
295
296
297 /* Before version 10.32 the recursive calls of internal_dfa_match() were passed
298 local working space and output vectors that were created on the stack. This has
299 caused issues for some patterns, especially in small-stack environments such as
300 Windows. A new scheme is now in use which sets up a vector on the stack, but if
301 this is too small, heap memory is used, up to the heap_limit. The main
302 parameters are all numbers of ints because the workspace is a vector of ints.
303
304 The size of the starting stack vector, DFA_START_RWS_SIZE, is in bytes, and is
305 defined in pcre2_internal.h so as to be available to pcre2test when it is
306 finding the minimum heap requirement for a match. */
307
/* Number of ints occupied by one PCRE2_SIZE ovector element. */

#define OVEC_UNIT  (sizeof(PCRE2_SIZE)/sizeof(int))

/* All of the following are counts of ints. DFA_START_RWS_SIZE is a byte count,
so it is divided by sizeof(int) here. The two ovector sizes are 1000 and 2
PCRE2_SIZE elements respectively, converted to ints by OVEC_UNIT. */

#define RWS_BASE_SIZE   (DFA_START_RWS_SIZE/sizeof(int))  /* Stack vector */
#define RWS_RSIZE       1000                    /* Work size for recursion */
#define RWS_OVEC_RSIZE  (1000*OVEC_UNIT)        /* Ovector for recursion */
#define RWS_OVEC_OSIZE  (2*OVEC_UNIT)           /* Ovector in other cases */
314
315 /* This structure is at the start of each workspace block. */
316
/* Header placed at the start of each workspace block. The blocks form a
singly linked list; more_workspace() follows or extends the list. */

typedef struct RWS_anchor {
  struct RWS_anchor *next;        /* Next block in the chain, or NULL */
  unsigned int size;              /* Number of ints in the whole block; the
                                     anchor itself is counted, as free is
                                     initialized to size - RWS_ANCHOR_SIZE */
  unsigned int free;              /* Number of ints still available */
} RWS_anchor;

/* Number of ints occupied by the anchor itself. */

#define RWS_ANCHOR_SIZE  (sizeof(RWS_anchor)/sizeof(int))
324
325
326
327 /*************************************************
328 * Process a callout *
329 *************************************************/
330
331 /* This function is called to perform a callout.
332
333 Arguments:
334 code current code pointer
335 offsets points to current capture offsets
336 current_subject start of current subject match
337 ptr current position in subject
338 mb the match block
339 extracode extra code offset when called from condition
340 lengthptr where to return the callout length
341
342 Returns: the return from the callout
343 */
344
345 static int
do_callout(PCRE2_SPTR code,PCRE2_SIZE * offsets,PCRE2_SPTR current_subject,PCRE2_SPTR ptr,dfa_match_block * mb,PCRE2_SIZE extracode,PCRE2_SIZE * lengthptr)346 do_callout(PCRE2_SPTR code, PCRE2_SIZE *offsets, PCRE2_SPTR current_subject,
347 PCRE2_SPTR ptr, dfa_match_block *mb, PCRE2_SIZE extracode,
348 PCRE2_SIZE *lengthptr)
349 {
350 pcre2_callout_block *cb = mb->cb;
351
352 *lengthptr = (code[extracode] == OP_CALLOUT)?
353 (PCRE2_SIZE)PRIV(OP_lengths)[OP_CALLOUT] :
354 (PCRE2_SIZE)GET(code, 1 + 2*LINK_SIZE + extracode);
355
356 if (mb->callout == NULL) return 0; /* No callout provided */
357
358 /* Fixed fields in the callout block are set once and for all at the start of
359 matching. */
360
361 cb->offset_vector = offsets;
362 cb->start_match = (PCRE2_SIZE)(current_subject - mb->start_subject);
363 cb->current_position = (PCRE2_SIZE)(ptr - mb->start_subject);
364 cb->pattern_position = GET(code, 1 + extracode);
365 cb->next_item_length = GET(code, 1 + LINK_SIZE + extracode);
366
367 if (code[extracode] == OP_CALLOUT)
368 {
369 cb->callout_number = code[1 + 2*LINK_SIZE + extracode];
370 cb->callout_string_offset = 0;
371 cb->callout_string = NULL;
372 cb->callout_string_length = 0;
373 }
374 else
375 {
376 cb->callout_number = 0;
377 cb->callout_string_offset = GET(code, 1 + 3*LINK_SIZE + extracode);
378 cb->callout_string = code + (1 + 4*LINK_SIZE + extracode) + 1;
379 cb->callout_string_length = *lengthptr - (1 + 4*LINK_SIZE) - 2;
380 }
381
382 return (mb->callout)(cb, mb->callout_data);
383 }
384
385
386
387 /*************************************************
388 * Expand local workspace memory *
389 *************************************************/
390
391 /* This function is called when internal_dfa_match() is about to be called
392 recursively and there is insufficient working space left in the current
393 workspace block. If there's an existing next block, use it; otherwise get a new
394 block unless the heap limit is reached.
395
396 Arguments:
397 rwsptr pointer to block pointer (updated)
398 ovecsize space needed for an ovector
399 mb the match block
400
401 Returns: 0 rwsptr has been updated
402 !0 an error code
403 */
404
405 static int
more_workspace(RWS_anchor ** rwsptr,unsigned int ovecsize,dfa_match_block * mb)406 more_workspace(RWS_anchor **rwsptr, unsigned int ovecsize, dfa_match_block *mb)
407 {
408 RWS_anchor *rws = *rwsptr;
409 RWS_anchor *new;
410
411 if (rws->next != NULL)
412 {
413 new = rws->next;
414 }
415
416 /* All sizes are in units of sizeof(int), except for mb->heaplimit, which is in
417 kibibytes. */
418
419 else
420 {
421 unsigned int newsize = rws->size * 2;
422 unsigned int heapleft = (unsigned int)
423 (((1024/sizeof(int))*mb->heap_limit - mb->heap_used));
424 if (newsize > heapleft) newsize = heapleft;
425 if (newsize < RWS_RSIZE + ovecsize + RWS_ANCHOR_SIZE)
426 return PCRE2_ERROR_HEAPLIMIT;
427 new = mb->memctl.malloc(newsize*sizeof(int), mb->memctl.memory_data);
428 if (new == NULL) return PCRE2_ERROR_NOMEMORY;
429 mb->heap_used += newsize;
430 new->next = NULL;
431 new->size = newsize;
432 rws->next = new;
433 }
434
435 new->free = new->size - RWS_ANCHOR_SIZE;
436 *rwsptr = new;
437 return 0;
438 }
439
440
441
442 /*************************************************
443 * Match a Regular Expression - DFA engine *
444 *************************************************/
445
446 /* This internal function applies a compiled pattern to a subject string,
447 starting at a given point, using a DFA engine. This function is called from the
448 external one, possibly multiple times if the pattern is not anchored. The
449 function calls itself recursively for some kinds of subpattern.
450
451 Arguments:
452 mb the match_data block with fixed information
453 this_start_code the opening bracket of this subexpression's code
454 current_subject where we currently are in the subject string
455 start_offset start offset in the subject string
456 offsets vector to contain the matching string offsets
457 offsetcount size of same
458 workspace vector of workspace
459 wscount size of same
460 rlevel function call recursion level
461
462 Returns: > 0 => number of match offset pairs placed in offsets
463 = 0 => offsets overflowed; longest matches are present
464 -1 => failed to match
465 < -1 => some kind of unexpected problem
466
467 The following macros are used for adding states to the two state vectors (one
468 for the current character, one for the following character). */
469
/* Each macro appends one stateblock to the relevant list. The list counter is
always incremented; if the list already held wscount entries, the enclosing
function returns PCRE2_ERROR_DFA_WSSIZE without storing anything. The _DATA
variants also set the data field; the plain variants leave it untouched. */

#define ADD_ACTIVE(x,y) \
  do \
    { \
    if (active_count++ >= wscount) return PCRE2_ERROR_DFA_WSSIZE; \
    next_active_state->offset = (x); \
    next_active_state->count  = (y); \
    next_active_state++; \
    } \
  while (0)

#define ADD_ACTIVE_DATA(x,y,z) \
  do \
    { \
    if (active_count++ >= wscount) return PCRE2_ERROR_DFA_WSSIZE; \
    next_active_state->offset = (x); \
    next_active_state->count  = (y); \
    next_active_state->data   = (z); \
    next_active_state++; \
    } \
  while (0)

#define ADD_NEW(x,y) \
  do \
    { \
    if (new_count++ >= wscount) return PCRE2_ERROR_DFA_WSSIZE; \
    next_new_state->offset = (x); \
    next_new_state->count  = (y); \
    next_new_state++; \
    } \
  while (0)

#define ADD_NEW_DATA(x,y,z) \
  do \
    { \
    if (new_count++ >= wscount) return PCRE2_ERROR_DFA_WSSIZE; \
    next_new_state->offset = (x); \
    next_new_state->count  = (y); \
    next_new_state->data   = (z); \
    next_new_state++; \
    } \
  while (0)
507
508 /* And now, here is the code */
509
510 static int
internal_dfa_match(dfa_match_block * mb,PCRE2_SPTR this_start_code,PCRE2_SPTR current_subject,PCRE2_SIZE start_offset,PCRE2_SIZE * offsets,uint32_t offsetcount,int * workspace,int wscount,uint32_t rlevel,int * RWS)511 internal_dfa_match(
512 dfa_match_block *mb,
513 PCRE2_SPTR this_start_code,
514 PCRE2_SPTR current_subject,
515 PCRE2_SIZE start_offset,
516 PCRE2_SIZE *offsets,
517 uint32_t offsetcount,
518 int *workspace,
519 int wscount,
520 uint32_t rlevel,
521 int *RWS)
522 {
523 stateblock *active_states, *new_states, *temp_states;
524 stateblock *next_active_state, *next_new_state;
525 const uint8_t *ctypes, *lcc, *fcc;
526 PCRE2_SPTR ptr;
527 PCRE2_SPTR end_code;
528 dfa_recursion_info new_recursive;
529 int active_count, new_count, match_count;
530
531 /* Some fields in the mb block are frequently referenced, so we load them into
532 independent variables in the hope that this will perform better. */
533
534 PCRE2_SPTR start_subject = mb->start_subject;
535 PCRE2_SPTR end_subject = mb->end_subject;
536 PCRE2_SPTR start_code = mb->start_code;
537
538 #ifdef SUPPORT_UNICODE
539 BOOL utf = (mb->poptions & PCRE2_UTF) != 0;
540 #else
541 BOOL utf = FALSE;
542 #endif
543
544 BOOL reset_could_continue = FALSE;
545
546 if (mb->match_call_count++ >= mb->match_limit) return PCRE2_ERROR_MATCHLIMIT;
547 if (rlevel++ > mb->match_limit_depth) return PCRE2_ERROR_DEPTHLIMIT;
548 offsetcount &= (uint32_t)(-2); /* Round down */
549
550 wscount -= 2;
551 wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
552 (2 * INTS_PER_STATEBLOCK);
553
554 ctypes = mb->tables + ctypes_offset;
555 lcc = mb->tables + lcc_offset;
556 fcc = mb->tables + fcc_offset;
557
558 match_count = PCRE2_ERROR_NOMATCH; /* A negative number */
559
560 active_states = (stateblock *)(workspace + 2);
561 next_new_state = new_states = active_states + wscount;
562 new_count = 0;
563
564 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
565 the alternative states onto the list, and find out where the end is. This
makes it possible to use this function recursively, when we want to stop at a
567 matching internal ket rather than at the end.
568
569 If we are dealing with a backward assertion we have to find out the maximum
570 amount to move back, and set up each alternative appropriately. */
571
572 if (*this_start_code == OP_ASSERTBACK || *this_start_code == OP_ASSERTBACK_NOT)
573 {
574 size_t max_back = 0;
575 size_t gone_back;
576
577 end_code = this_start_code;
578 do
579 {
580 size_t back = (size_t)GET(end_code, 2+LINK_SIZE);
581 if (back > max_back) max_back = back;
582 end_code += GET(end_code, 1);
583 }
584 while (*end_code == OP_ALT);
585
586 /* If we can't go back the amount required for the longest lookbehind
587 pattern, go back as far as we can; some alternatives may still be viable. */
588
589 #ifdef SUPPORT_UNICODE
590 /* In character mode we have to step back character by character */
591
592 if (utf)
593 {
594 for (gone_back = 0; gone_back < max_back; gone_back++)
595 {
596 if (current_subject <= start_subject) break;
597 current_subject--;
598 ACROSSCHAR(current_subject > start_subject, current_subject,
599 current_subject--);
600 }
601 }
602 else
603 #endif
604
605 /* In byte-mode we can do this quickly. */
606
607 {
608 size_t current_offset = (size_t)(current_subject - start_subject);
609 gone_back = (current_offset < max_back)? current_offset : max_back;
610 current_subject -= gone_back;
611 }
612
613 /* Save the earliest consulted character */
614
615 if (current_subject < mb->start_used_ptr)
616 mb->start_used_ptr = current_subject;
617
618 /* Now we can process the individual branches. There will be an OP_REVERSE at
619 the start of each branch, except when the length of the branch is zero. */
620
621 end_code = this_start_code;
622 do
623 {
624 uint32_t revlen = (end_code[1+LINK_SIZE] == OP_REVERSE)? 1 + LINK_SIZE : 0;
625 size_t back = (revlen == 0)? 0 : (size_t)GET(end_code, 2+LINK_SIZE);
626 if (back <= gone_back)
627 {
628 int bstate = (int)(end_code - start_code + 1 + LINK_SIZE + revlen);
629 ADD_NEW_DATA(-bstate, 0, (int)(gone_back - back));
630 }
631 end_code += GET(end_code, 1);
632 }
633 while (*end_code == OP_ALT);
634 }
635
636 /* This is the code for a "normal" subpattern (not a backward assertion). The
637 start of a whole pattern is always one of these. If we are at the top level,
638 we may be asked to restart matching from the same point that we reached for a
639 previous partial match. We still have to scan through the top-level branches to
640 find the end state. */
641
642 else
643 {
644 end_code = this_start_code;
645
646 /* Restarting */
647
648 if (rlevel == 1 && (mb->moptions & PCRE2_DFA_RESTART) != 0)
649 {
650 do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
651 new_count = workspace[1];
652 if (!workspace[0])
653 memcpy(new_states, active_states, (size_t)new_count * sizeof(stateblock));
654 }
655
656 /* Not restarting */
657
658 else
659 {
660 int length = 1 + LINK_SIZE +
661 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
662 *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
663 ? IMM2_SIZE:0);
664 do
665 {
666 ADD_NEW((int)(end_code - start_code + length), 0);
667 end_code += GET(end_code, 1);
668 length = 1 + LINK_SIZE;
669 }
670 while (*end_code == OP_ALT);
671 }
672 }
673
674 workspace[0] = 0; /* Bit indicating which vector is current */
675
676 /* Loop for scanning the subject */
677
678 ptr = current_subject;
679 for (;;)
680 {
681 int i, j;
682 int clen, dlen;
683 uint32_t c, d;
684 int forced_fail = 0;
685 BOOL partial_newline = FALSE;
686 BOOL could_continue = reset_could_continue;
687 reset_could_continue = FALSE;
688
689 if (ptr > mb->last_used_ptr) mb->last_used_ptr = ptr;
690
691 /* Make the new state list into the active state list and empty the
692 new state list. */
693
694 temp_states = active_states;
695 active_states = new_states;
696 new_states = temp_states;
697 active_count = new_count;
698 new_count = 0;
699
700 workspace[0] ^= 1; /* Remember for the restarting feature */
701 workspace[1] = active_count;
702
703 /* Set the pointers for adding new states */
704
705 next_active_state = active_states + active_count;
706 next_new_state = new_states;
707
708 /* Load the current character from the subject outside the loop, as many
709 different states may want to look at it, and we assume that at least one
710 will. */
711
712 if (ptr < end_subject)
713 {
714 clen = 1; /* Number of data items in the character */
715 #ifdef SUPPORT_UNICODE
716 GETCHARLENTEST(c, ptr, clen);
717 #else
718 c = *ptr;
719 #endif /* SUPPORT_UNICODE */
720 }
721 else
722 {
723 clen = 0; /* This indicates the end of the subject */
724 c = NOTACHAR; /* This value should never actually be used */
725 }
726
727 /* Scan up the active states and act on each one. The result of an action
728 may be to add more states to the currently active list (e.g. on hitting a
729 parenthesis) or it may be to put states on the new list, for considering
730 when we move the character pointer on. */
731
732 for (i = 0; i < active_count; i++)
733 {
734 stateblock *current_state = active_states + i;
735 BOOL caseless = FALSE;
736 PCRE2_SPTR code;
737 uint32_t codevalue;
738 int state_offset = current_state->offset;
739 int rrc;
740 int count;
741
742 /* A negative offset is a special case meaning "hold off going to this
743 (negated) state until the number of characters in the data field have
744 been skipped". If the could_continue flag was passed over from a previous
    state, arrange for it to be passed on. */
746
747 if (state_offset < 0)
748 {
749 if (current_state->data > 0)
750 {
751 ADD_NEW_DATA(state_offset, current_state->count,
752 current_state->data - 1);
753 if (could_continue) reset_could_continue = TRUE;
754 continue;
755 }
756 else
757 {
758 current_state->offset = state_offset = -state_offset;
759 }
760 }
761
762 /* Check for a duplicate state with the same count, and skip if found.
763 See the note at the head of this module about the possibility of improving
764 performance here. */
765
766 for (j = 0; j < i; j++)
767 {
768 if (active_states[j].offset == state_offset &&
769 active_states[j].count == current_state->count)
770 goto NEXT_ACTIVE_STATE;
771 }
772
773 /* The state offset is the offset to the opcode */
774
775 code = start_code + state_offset;
776 codevalue = *code;
777
778 /* If this opcode inspects a character, but we are at the end of the
779 subject, remember the fact for use when testing for a partial match. */
780
781 if (clen == 0 && poptable[codevalue] != 0)
782 could_continue = TRUE;
783
784 /* If this opcode is followed by an inline character, load it. It is
785 tempting to test for the presence of a subject character here, but that
786 is wrong, because sometimes zero repetitions of the subject are
787 permitted.
788
789 We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
790 argument that is not a data character - but is always one byte long because
791 the values are small. We have to take special action to deal with \P, \p,
792 \H, \h, \V, \v and \X in this case. To keep the other cases fast, convert
793 these ones to new opcodes. */
794
795 if (coptable[codevalue] > 0)
796 {
797 dlen = 1;
798 #ifdef SUPPORT_UNICODE
799 if (utf) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
800 #endif /* SUPPORT_UNICODE */
801 d = code[coptable[codevalue]];
802 if (codevalue >= OP_TYPESTAR)
803 {
804 switch(d)
805 {
806 case OP_ANYBYTE: return PCRE2_ERROR_DFA_UITEM;
807 case OP_NOTPROP:
808 case OP_PROP: codevalue += OP_PROP_EXTRA; break;
809 case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
810 case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
811 case OP_NOT_HSPACE:
812 case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
813 case OP_NOT_VSPACE:
814 case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
815 default: break;
816 }
817 }
818 }
819 else
820 {
821 dlen = 0; /* Not strictly necessary, but compilers moan */
822 d = NOTACHAR; /* if these variables are not set. */
823 }
824
825
826 /* Now process the individual opcodes */
827
828 switch (codevalue)
829 {
830 /* ========================================================================== */
831 /* These cases are never obeyed. This is a fudge that causes a compile-
832 time error if the vectors coptable or poptable, which are indexed by
833 opcode, are not the correct length. It seems to be the only way to do
834 such a check at compile time, as the sizeof() operator does not work
835 in the C preprocessor. */
836
837 case OP_TABLE_LENGTH:
838 case OP_TABLE_LENGTH +
839 ((sizeof(coptable) == OP_TABLE_LENGTH) &&
840 (sizeof(poptable) == OP_TABLE_LENGTH)):
841 return 0;
842
843 /* ========================================================================== */
844 /* Reached a closing bracket. If not at the end of the pattern, carry
845 on with the next opcode. For repeating opcodes, also add the repeat
846 state. Note that KETRPOS will always be encountered at the end of the
847 subpattern, because the possessive subpattern repeats are always handled
848 using recursive calls. Thus, it never adds any new states.
849
850 At the end of the (sub)pattern, unless we have an empty string and
851 PCRE2_NOTEMPTY is set, or PCRE2_NOTEMPTY_ATSTART is set and we are at the
852 start of the subject, save the match data, shifting up all previous
853 matches so we always have the longest first. */
854
855 case OP_KET:
856 case OP_KETRMIN:
857 case OP_KETRMAX:
858 case OP_KETRPOS:
859 if (code != end_code)
860 {
861 ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
862 if (codevalue != OP_KET)
863 {
864 ADD_ACTIVE(state_offset - (int)GET(code, 1), 0);
865 }
866 }
867 else
868 {
869 if (ptr > current_subject ||
870 ((mb->moptions & PCRE2_NOTEMPTY) == 0 &&
871 ((mb->moptions & PCRE2_NOTEMPTY_ATSTART) == 0 ||
872 current_subject > start_subject + mb->start_offset)))
873 {
874 if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
875 else if (match_count > 0 && ++match_count * 2 > (int)offsetcount)
876 match_count = 0;
877 count = ((match_count == 0)? (int)offsetcount : match_count * 2) - 2;
878 if (count > 0) (void)memmove(offsets + 2, offsets,
879 (size_t)count * sizeof(PCRE2_SIZE));
880 if (offsetcount >= 2)
881 {
882 offsets[0] = (PCRE2_SIZE)(current_subject - start_subject);
883 offsets[1] = (PCRE2_SIZE)(ptr - start_subject);
884 }
885 if ((mb->moptions & PCRE2_DFA_SHORTEST) != 0) return match_count;
886 }
887 }
888 break;
889
890 /* ========================================================================== */
891 /* These opcodes add to the current list of states without looking
892 at the current character. */
893
894 /*-----------------------------------------------------------------*/
895 case OP_ALT:
896 do { code += GET(code, 1); } while (*code == OP_ALT);
897 ADD_ACTIVE((int)(code - start_code), 0);
898 break;
899
900 /*-----------------------------------------------------------------*/
901 case OP_BRA:
902 case OP_SBRA:
903 do
904 {
905 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
906 code += GET(code, 1);
907 }
908 while (*code == OP_ALT);
909 break;
910
911 /*-----------------------------------------------------------------*/
912 case OP_CBRA:
913 case OP_SCBRA:
914 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE + IMM2_SIZE), 0);
915 code += GET(code, 1);
916 while (*code == OP_ALT)
917 {
918 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
919 code += GET(code, 1);
920 }
921 break;
922
923 /*-----------------------------------------------------------------*/
924 case OP_BRAZERO:
925 case OP_BRAMINZERO:
926 ADD_ACTIVE(state_offset + 1, 0);
927 code += 1 + GET(code, 2);
928 while (*code == OP_ALT) code += GET(code, 1);
929 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
930 break;
931
932 /*-----------------------------------------------------------------*/
933 case OP_SKIPZERO:
934 code += 1 + GET(code, 2);
935 while (*code == OP_ALT) code += GET(code, 1);
936 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
937 break;
938
939 /*-----------------------------------------------------------------*/
940 case OP_CIRC:
941 if (ptr == start_subject && (mb->moptions & PCRE2_NOTBOL) == 0)
942 { ADD_ACTIVE(state_offset + 1, 0); }
943 break;
944
945 /*-----------------------------------------------------------------*/
946 case OP_CIRCM:
947 if ((ptr == start_subject && (mb->moptions & PCRE2_NOTBOL) == 0) ||
948 ((ptr != end_subject || (mb->poptions & PCRE2_ALT_CIRCUMFLEX) != 0 )
949 && WAS_NEWLINE(ptr)))
950 { ADD_ACTIVE(state_offset + 1, 0); }
951 break;
952
953 /*-----------------------------------------------------------------*/
954 case OP_EOD:
955 if (ptr >= end_subject)
956 {
957 if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
958 could_continue = TRUE;
959 else { ADD_ACTIVE(state_offset + 1, 0); }
960 }
961 break;
962
963 /*-----------------------------------------------------------------*/
964 case OP_SOD:
965 if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
966 break;
967
968 /*-----------------------------------------------------------------*/
969 case OP_SOM:
970 if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
971 break;
972
973
974 /* ========================================================================== */
975 /* These opcodes inspect the next subject character, and sometimes
976 the previous one as well, but do not have an argument. The variable
977 clen contains the length of the current character and is zero if we are
978 at the end of the subject. */
979
980 /*-----------------------------------------------------------------*/
981 case OP_ANY:
982 if (clen > 0 && !IS_NEWLINE(ptr))
983 {
984 if (ptr + 1 >= mb->end_subject &&
985 (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
986 NLBLOCK->nltype == NLTYPE_FIXED &&
987 NLBLOCK->nllen == 2 &&
988 c == NLBLOCK->nl[0])
989 {
990 could_continue = partial_newline = TRUE;
991 }
992 else
993 {
994 ADD_NEW(state_offset + 1, 0);
995 }
996 }
997 break;
998
999 /*-----------------------------------------------------------------*/
1000 case OP_ALLANY:
1001 if (clen > 0)
1002 { ADD_NEW(state_offset + 1, 0); }
1003 break;
1004
1005 /*-----------------------------------------------------------------*/
1006 case OP_EODN:
1007 if (clen == 0 && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1008 could_continue = TRUE;
1009 else if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - mb->nllen))
1010 { ADD_ACTIVE(state_offset + 1, 0); }
1011 break;
1012
1013 /*-----------------------------------------------------------------*/
1014 case OP_DOLL:
1015 if ((mb->moptions & PCRE2_NOTEOL) == 0)
1016 {
1017 if (clen == 0 && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1018 could_continue = TRUE;
1019 else if (clen == 0 ||
1020 ((mb->poptions & PCRE2_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
1021 (ptr == end_subject - mb->nllen)
1022 ))
1023 { ADD_ACTIVE(state_offset + 1, 0); }
1024 else if (ptr + 1 >= mb->end_subject &&
1025 (mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0 &&
1026 NLBLOCK->nltype == NLTYPE_FIXED &&
1027 NLBLOCK->nllen == 2 &&
1028 c == NLBLOCK->nl[0])
1029 {
1030 if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1031 {
1032 reset_could_continue = TRUE;
1033 ADD_NEW_DATA(-(state_offset + 1), 0, 1);
1034 }
1035 else could_continue = partial_newline = TRUE;
1036 }
1037 }
1038 break;
1039
1040 /*-----------------------------------------------------------------*/
1041 case OP_DOLLM:
1042 if ((mb->moptions & PCRE2_NOTEOL) == 0)
1043 {
1044 if (clen == 0 && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1045 could_continue = TRUE;
1046 else if (clen == 0 ||
1047 ((mb->poptions & PCRE2_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr)))
1048 { ADD_ACTIVE(state_offset + 1, 0); }
1049 else if (ptr + 1 >= mb->end_subject &&
1050 (mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0 &&
1051 NLBLOCK->nltype == NLTYPE_FIXED &&
1052 NLBLOCK->nllen == 2 &&
1053 c == NLBLOCK->nl[0])
1054 {
1055 if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1056 {
1057 reset_could_continue = TRUE;
1058 ADD_NEW_DATA(-(state_offset + 1), 0, 1);
1059 }
1060 else could_continue = partial_newline = TRUE;
1061 }
1062 }
1063 else if (IS_NEWLINE(ptr))
1064 { ADD_ACTIVE(state_offset + 1, 0); }
1065 break;
1066
1067 /*-----------------------------------------------------------------*/
1068
1069 case OP_DIGIT:
1070 case OP_WHITESPACE:
1071 case OP_WORDCHAR:
1072 if (clen > 0 && c < 256 &&
1073 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
1074 { ADD_NEW(state_offset + 1, 0); }
1075 break;
1076
1077 /*-----------------------------------------------------------------*/
1078 case OP_NOT_DIGIT:
1079 case OP_NOT_WHITESPACE:
1080 case OP_NOT_WORDCHAR:
1081 if (clen > 0 && (c >= 256 ||
1082 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
1083 { ADD_NEW(state_offset + 1, 0); }
1084 break;
1085
1086 /*-----------------------------------------------------------------*/
1087 case OP_WORD_BOUNDARY:
1088 case OP_NOT_WORD_BOUNDARY:
1089 {
1090 int left_word, right_word;
1091
1092 if (ptr > start_subject)
1093 {
1094 PCRE2_SPTR temp = ptr - 1;
1095 if (temp < mb->start_used_ptr) mb->start_used_ptr = temp;
1096 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1097 if (utf) { BACKCHAR(temp); }
1098 #endif
1099 GETCHARTEST(d, temp);
1100 #ifdef SUPPORT_UNICODE
1101 if ((mb->poptions & PCRE2_UCP) != 0)
1102 {
1103 if (d == '_') left_word = TRUE; else
1104 {
1105 uint32_t cat = UCD_CATEGORY(d);
1106 left_word = (cat == ucp_L || cat == ucp_N);
1107 }
1108 }
1109 else
1110 #endif
1111 left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
1112 }
1113 else left_word = FALSE;
1114
1115 if (clen > 0)
1116 {
1117 if (ptr >= mb->last_used_ptr)
1118 {
1119 PCRE2_SPTR temp = ptr + 1;
1120 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1121 if (utf) { FORWARDCHARTEST(temp, mb->end_subject); }
1122 #endif
1123 mb->last_used_ptr = temp;
1124 }
1125 #ifdef SUPPORT_UNICODE
1126 if ((mb->poptions & PCRE2_UCP) != 0)
1127 {
1128 if (c == '_') right_word = TRUE; else
1129 {
1130 uint32_t cat = UCD_CATEGORY(c);
1131 right_word = (cat == ucp_L || cat == ucp_N);
1132 }
1133 }
1134 else
1135 #endif
1136 right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
1137 }
1138 else right_word = FALSE;
1139
1140 if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
1141 { ADD_ACTIVE(state_offset + 1, 0); }
1142 }
1143 break;
1144
1145
1146 /*-----------------------------------------------------------------*/
1147 /* Check the next character by Unicode property. We will get here only
1148 if the support is in the binary; otherwise a compile-time error occurs.
1149 */
1150
1151 #ifdef SUPPORT_UNICODE
1152 case OP_PROP:
1153 case OP_NOTPROP:
1154 if (clen > 0)
1155 {
1156 BOOL OK;
1157 const uint32_t *cp;
1158 const ucd_record * prop = GET_UCD(c);
1159 switch(code[1])
1160 {
1161 case PT_ANY:
1162 OK = TRUE;
1163 break;
1164
1165 case PT_LAMP:
1166 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1167 prop->chartype == ucp_Lt;
1168 break;
1169
1170 case PT_GC:
1171 OK = PRIV(ucp_gentype)[prop->chartype] == code[2];
1172 break;
1173
1174 case PT_PC:
1175 OK = prop->chartype == code[2];
1176 break;
1177
1178 case PT_SC:
1179 OK = prop->script == code[2];
1180 break;
1181
1182 /* These are specials for combination cases. */
1183
1184 case PT_ALNUM:
1185 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1186 PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1187 break;
1188
1189 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1190 which means that Perl space and POSIX space are now identical. PCRE
1191 was changed at release 8.34. */
1192
1193 case PT_SPACE: /* Perl space */
1194 case PT_PXSPACE: /* POSIX space */
1195 switch(c)
1196 {
1197 HSPACE_CASES:
1198 VSPACE_CASES:
1199 OK = TRUE;
1200 break;
1201
1202 default:
1203 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1204 break;
1205 }
1206 break;
1207
1208 case PT_WORD:
1209 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1210 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1211 c == CHAR_UNDERSCORE;
1212 break;
1213
1214 case PT_CLIST:
1215 cp = PRIV(ucd_caseless_sets) + code[2];
1216 for (;;)
1217 {
1218 if (c < *cp) { OK = FALSE; break; }
1219 if (c == *cp++) { OK = TRUE; break; }
1220 }
1221 break;
1222
1223 case PT_UCNC:
1224 OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1225 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1226 c >= 0xe000;
1227 break;
1228
1229 /* Should never occur, but keep compilers from grumbling. */
1230
1231 default:
1232 OK = codevalue != OP_PROP;
1233 break;
1234 }
1235
1236 if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
1237 }
1238 break;
1239 #endif
1240
1241
1242
1243 /* ========================================================================== */
/* These opcodes likewise inspect the subject character, but have an
argument that is not a data character. It is one of these opcodes:
OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE,
OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
1248
1249 case OP_TYPEPLUS:
1250 case OP_TYPEMINPLUS:
1251 case OP_TYPEPOSPLUS:
1252 count = current_state->count; /* Already matched */
1253 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1254 if (clen > 0)
1255 {
1256 if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1257 (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1258 NLBLOCK->nltype == NLTYPE_FIXED &&
1259 NLBLOCK->nllen == 2 &&
1260 c == NLBLOCK->nl[0])
1261 {
1262 could_continue = partial_newline = TRUE;
1263 }
1264 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1265 (c < 256 &&
1266 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1267 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1268 {
1269 if (count > 0 && codevalue == OP_TYPEPOSPLUS)
1270 {
1271 active_count--; /* Remove non-match possibility */
1272 next_active_state--;
1273 }
1274 count++;
1275 ADD_NEW(state_offset, count);
1276 }
1277 }
1278 break;
1279
1280 /*-----------------------------------------------------------------*/
1281 case OP_TYPEQUERY:
1282 case OP_TYPEMINQUERY:
1283 case OP_TYPEPOSQUERY:
1284 ADD_ACTIVE(state_offset + 2, 0);
1285 if (clen > 0)
1286 {
1287 if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1288 (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1289 NLBLOCK->nltype == NLTYPE_FIXED &&
1290 NLBLOCK->nllen == 2 &&
1291 c == NLBLOCK->nl[0])
1292 {
1293 could_continue = partial_newline = TRUE;
1294 }
1295 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1296 (c < 256 &&
1297 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1298 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1299 {
1300 if (codevalue == OP_TYPEPOSQUERY)
1301 {
1302 active_count--; /* Remove non-match possibility */
1303 next_active_state--;
1304 }
1305 ADD_NEW(state_offset + 2, 0);
1306 }
1307 }
1308 break;
1309
1310 /*-----------------------------------------------------------------*/
1311 case OP_TYPESTAR:
1312 case OP_TYPEMINSTAR:
1313 case OP_TYPEPOSSTAR:
1314 ADD_ACTIVE(state_offset + 2, 0);
1315 if (clen > 0)
1316 {
1317 if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1318 (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1319 NLBLOCK->nltype == NLTYPE_FIXED &&
1320 NLBLOCK->nllen == 2 &&
1321 c == NLBLOCK->nl[0])
1322 {
1323 could_continue = partial_newline = TRUE;
1324 }
1325 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1326 (c < 256 &&
1327 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1328 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1329 {
1330 if (codevalue == OP_TYPEPOSSTAR)
1331 {
1332 active_count--; /* Remove non-match possibility */
1333 next_active_state--;
1334 }
1335 ADD_NEW(state_offset, 0);
1336 }
1337 }
1338 break;
1339
1340 /*-----------------------------------------------------------------*/
1341 case OP_TYPEEXACT:
1342 count = current_state->count; /* Number already matched */
1343 if (clen > 0)
1344 {
1345 if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1346 (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1347 NLBLOCK->nltype == NLTYPE_FIXED &&
1348 NLBLOCK->nllen == 2 &&
1349 c == NLBLOCK->nl[0])
1350 {
1351 could_continue = partial_newline = TRUE;
1352 }
1353 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1354 (c < 256 &&
1355 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1356 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1357 {
1358 if (++count >= (int)GET2(code, 1))
1359 { ADD_NEW(state_offset + 1 + IMM2_SIZE + 1, 0); }
1360 else
1361 { ADD_NEW(state_offset, count); }
1362 }
1363 }
1364 break;
1365
1366 /*-----------------------------------------------------------------*/
1367 case OP_TYPEUPTO:
1368 case OP_TYPEMINUPTO:
1369 case OP_TYPEPOSUPTO:
1370 ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0);
1371 count = current_state->count; /* Number already matched */
1372 if (clen > 0)
1373 {
1374 if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1375 (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1376 NLBLOCK->nltype == NLTYPE_FIXED &&
1377 NLBLOCK->nllen == 2 &&
1378 c == NLBLOCK->nl[0])
1379 {
1380 could_continue = partial_newline = TRUE;
1381 }
1382 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1383 (c < 256 &&
1384 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1385 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1386 {
1387 if (codevalue == OP_TYPEPOSUPTO)
1388 {
1389 active_count--; /* Remove non-match possibility */
1390 next_active_state--;
1391 }
1392 if (++count >= (int)GET2(code, 1))
1393 { ADD_NEW(state_offset + 2 + IMM2_SIZE, 0); }
1394 else
1395 { ADD_NEW(state_offset, count); }
1396 }
1397 }
1398 break;
1399
1400 /* ========================================================================== */
/* These are virtual opcodes that are used when something like
OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, OP_EXTUNI, or one of the
horizontal/vertical whitespace opcodes (OP_HSPACE, OP_VSPACE, or a negated
form) as its argument. Handling them separately keeps the code above fast for
the other cases. The argument is in the d variable. */
1405
1406 #ifdef SUPPORT_UNICODE
1407 case OP_PROP_EXTRA + OP_TYPEPLUS:
1408 case OP_PROP_EXTRA + OP_TYPEMINPLUS:
1409 case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
1410 count = current_state->count; /* Already matched */
1411 if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
1412 if (clen > 0)
1413 {
1414 BOOL OK;
1415 const uint32_t *cp;
1416 const ucd_record * prop = GET_UCD(c);
1417 switch(code[2])
1418 {
1419 case PT_ANY:
1420 OK = TRUE;
1421 break;
1422
1423 case PT_LAMP:
1424 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1425 prop->chartype == ucp_Lt;
1426 break;
1427
1428 case PT_GC:
1429 OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1430 break;
1431
1432 case PT_PC:
1433 OK = prop->chartype == code[3];
1434 break;
1435
1436 case PT_SC:
1437 OK = prop->script == code[3];
1438 break;
1439
1440 /* These are specials for combination cases. */
1441
1442 case PT_ALNUM:
1443 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1444 PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1445 break;
1446
1447 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1448 which means that Perl space and POSIX space are now identical. PCRE
1449 was changed at release 8.34. */
1450
1451 case PT_SPACE: /* Perl space */
1452 case PT_PXSPACE: /* POSIX space */
1453 switch(c)
1454 {
1455 HSPACE_CASES:
1456 VSPACE_CASES:
1457 OK = TRUE;
1458 break;
1459
1460 default:
1461 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1462 break;
1463 }
1464 break;
1465
1466 case PT_WORD:
1467 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1468 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1469 c == CHAR_UNDERSCORE;
1470 break;
1471
1472 case PT_CLIST:
1473 cp = PRIV(ucd_caseless_sets) + code[3];
1474 for (;;)
1475 {
1476 if (c < *cp) { OK = FALSE; break; }
1477 if (c == *cp++) { OK = TRUE; break; }
1478 }
1479 break;
1480
1481 case PT_UCNC:
1482 OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1483 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1484 c >= 0xe000;
1485 break;
1486
1487 /* Should never occur, but keep compilers from grumbling. */
1488
1489 default:
1490 OK = codevalue != OP_PROP;
1491 break;
1492 }
1493
1494 if (OK == (d == OP_PROP))
1495 {
1496 if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1497 {
1498 active_count--; /* Remove non-match possibility */
1499 next_active_state--;
1500 }
1501 count++;
1502 ADD_NEW(state_offset, count);
1503 }
1504 }
1505 break;
1506
1507 /*-----------------------------------------------------------------*/
1508 case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1509 case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1510 case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1511 count = current_state->count; /* Already matched */
1512 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1513 if (clen > 0)
1514 {
1515 int ncount = 0;
1516 if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1517 {
1518 active_count--; /* Remove non-match possibility */
1519 next_active_state--;
1520 }
1521 (void)PRIV(extuni)(c, ptr + clen, mb->start_subject, end_subject, utf,
1522 &ncount);
1523 count++;
1524 ADD_NEW_DATA(-state_offset, count, ncount);
1525 }
1526 break;
1527 #endif
1528
1529 /*-----------------------------------------------------------------*/
1530 case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1531 case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1532 case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1533 count = current_state->count; /* Already matched */
1534 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1535 if (clen > 0)
1536 {
1537 int ncount = 0;
1538 switch (c)
1539 {
1540 case CHAR_VT:
1541 case CHAR_FF:
1542 case CHAR_NEL:
1543 #ifndef EBCDIC
1544 case 0x2028:
1545 case 0x2029:
1546 #endif /* Not EBCDIC */
1547 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
1548 goto ANYNL01;
1549
1550 case CHAR_CR:
1551 if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
1552 /* Fall through */
1553
1554 ANYNL01:
1555 case CHAR_LF:
1556 if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1557 {
1558 active_count--; /* Remove non-match possibility */
1559 next_active_state--;
1560 }
1561 count++;
1562 ADD_NEW_DATA(-state_offset, count, ncount);
1563 break;
1564
1565 default:
1566 break;
1567 }
1568 }
1569 break;
1570
1571 /*-----------------------------------------------------------------*/
1572 case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1573 case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1574 case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1575 count = current_state->count; /* Already matched */
1576 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1577 if (clen > 0)
1578 {
1579 BOOL OK;
1580 switch (c)
1581 {
1582 VSPACE_CASES:
1583 OK = TRUE;
1584 break;
1585
1586 default:
1587 OK = FALSE;
1588 break;
1589 }
1590
1591 if (OK == (d == OP_VSPACE))
1592 {
1593 if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1594 {
1595 active_count--; /* Remove non-match possibility */
1596 next_active_state--;
1597 }
1598 count++;
1599 ADD_NEW_DATA(-state_offset, count, 0);
1600 }
1601 }
1602 break;
1603
1604 /*-----------------------------------------------------------------*/
1605 case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1606 case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1607 case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1608 count = current_state->count; /* Already matched */
1609 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1610 if (clen > 0)
1611 {
1612 BOOL OK;
1613 switch (c)
1614 {
1615 HSPACE_CASES:
1616 OK = TRUE;
1617 break;
1618
1619 default:
1620 OK = FALSE;
1621 break;
1622 }
1623
1624 if (OK == (d == OP_HSPACE))
1625 {
1626 if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1627 {
1628 active_count--; /* Remove non-match possibility */
1629 next_active_state--;
1630 }
1631 count++;
1632 ADD_NEW_DATA(-state_offset, count, 0);
1633 }
1634 }
1635 break;
1636
1637 /*-----------------------------------------------------------------*/
1638 #ifdef SUPPORT_UNICODE
1639 case OP_PROP_EXTRA + OP_TYPEQUERY:
1640 case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1641 case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1642 count = 4;
1643 goto QS1;
1644
1645 case OP_PROP_EXTRA + OP_TYPESTAR:
1646 case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1647 case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1648 count = 0;
1649
1650 QS1:
1651
1652 ADD_ACTIVE(state_offset + 4, 0);
1653 if (clen > 0)
1654 {
1655 BOOL OK;
1656 const uint32_t *cp;
1657 const ucd_record * prop = GET_UCD(c);
1658 switch(code[2])
1659 {
1660 case PT_ANY:
1661 OK = TRUE;
1662 break;
1663
1664 case PT_LAMP:
1665 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1666 prop->chartype == ucp_Lt;
1667 break;
1668
1669 case PT_GC:
1670 OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1671 break;
1672
1673 case PT_PC:
1674 OK = prop->chartype == code[3];
1675 break;
1676
1677 case PT_SC:
1678 OK = prop->script == code[3];
1679 break;
1680
1681 /* These are specials for combination cases. */
1682
1683 case PT_ALNUM:
1684 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1685 PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1686 break;
1687
1688 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1689 which means that Perl space and POSIX space are now identical. PCRE
1690 was changed at release 8.34. */
1691
1692 case PT_SPACE: /* Perl space */
1693 case PT_PXSPACE: /* POSIX space */
1694 switch(c)
1695 {
1696 HSPACE_CASES:
1697 VSPACE_CASES:
1698 OK = TRUE;
1699 break;
1700
1701 default:
1702 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1703 break;
1704 }
1705 break;
1706
1707 case PT_WORD:
1708 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1709 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1710 c == CHAR_UNDERSCORE;
1711 break;
1712
1713 case PT_CLIST:
1714 cp = PRIV(ucd_caseless_sets) + code[3];
1715 for (;;)
1716 {
1717 if (c < *cp) { OK = FALSE; break; }
1718 if (c == *cp++) { OK = TRUE; break; }
1719 }
1720 break;
1721
1722 case PT_UCNC:
1723 OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1724 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1725 c >= 0xe000;
1726 break;
1727
1728 /* Should never occur, but keep compilers from grumbling. */
1729
1730 default:
1731 OK = codevalue != OP_PROP;
1732 break;
1733 }
1734
1735 if (OK == (d == OP_PROP))
1736 {
1737 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1738 codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1739 {
1740 active_count--; /* Remove non-match possibility */
1741 next_active_state--;
1742 }
1743 ADD_NEW(state_offset + count, 0);
1744 }
1745 }
1746 break;
1747
1748 /*-----------------------------------------------------------------*/
1749 case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1750 case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1751 case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1752 count = 2;
1753 goto QS2;
1754
1755 case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1756 case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1757 case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1758 count = 0;
1759
1760 QS2:
1761
1762 ADD_ACTIVE(state_offset + 2, 0);
1763 if (clen > 0)
1764 {
1765 int ncount = 0;
1766 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1767 codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1768 {
1769 active_count--; /* Remove non-match possibility */
1770 next_active_state--;
1771 }
1772 (void)PRIV(extuni)(c, ptr + clen, mb->start_subject, end_subject, utf,
1773 &ncount);
1774 ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1775 }
1776 break;
1777 #endif
1778
1779 /*-----------------------------------------------------------------*/
1780 case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1781 case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1782 case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1783 count = 2;
1784 goto QS3;
1785
1786 case OP_ANYNL_EXTRA + OP_TYPESTAR:
1787 case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1788 case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1789 count = 0;
1790
1791 QS3:
1792 ADD_ACTIVE(state_offset + 2, 0);
1793 if (clen > 0)
1794 {
1795 int ncount = 0;
1796 switch (c)
1797 {
1798 case CHAR_VT:
1799 case CHAR_FF:
1800 case CHAR_NEL:
1801 #ifndef EBCDIC
1802 case 0x2028:
1803 case 0x2029:
1804 #endif /* Not EBCDIC */
1805 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
1806 goto ANYNL02;
1807
1808 case CHAR_CR:
1809 if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
1810 /* Fall through */
1811
1812 ANYNL02:
1813 case CHAR_LF:
1814 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1815 codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1816 {
1817 active_count--; /* Remove non-match possibility */
1818 next_active_state--;
1819 }
1820 ADD_NEW_DATA(-(state_offset + (int)count), 0, ncount);
1821 break;
1822
1823 default:
1824 break;
1825 }
1826 }
1827 break;
1828
1829 /*-----------------------------------------------------------------*/
1830 case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1831 case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1832 case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1833 count = 2;
1834 goto QS4;
1835
1836 case OP_VSPACE_EXTRA + OP_TYPESTAR:
1837 case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1838 case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1839 count = 0;
1840
1841 QS4:
1842 ADD_ACTIVE(state_offset + 2, 0);
1843 if (clen > 0)
1844 {
1845 BOOL OK;
1846 switch (c)
1847 {
1848 VSPACE_CASES:
1849 OK = TRUE;
1850 break;
1851
1852 default:
1853 OK = FALSE;
1854 break;
1855 }
1856 if (OK == (d == OP_VSPACE))
1857 {
1858 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1859 codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1860 {
1861 active_count--; /* Remove non-match possibility */
1862 next_active_state--;
1863 }
1864 ADD_NEW_DATA(-(state_offset + (int)count), 0, 0);
1865 }
1866 }
1867 break;
1868
1869 /*-----------------------------------------------------------------*/
1870 case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1871 case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1872 case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1873 count = 2;
1874 goto QS5;
1875
1876 case OP_HSPACE_EXTRA + OP_TYPESTAR:
1877 case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1878 case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1879 count = 0;
1880
1881 QS5:
1882 ADD_ACTIVE(state_offset + 2, 0);
1883 if (clen > 0)
1884 {
1885 BOOL OK;
1886 switch (c)
1887 {
1888 HSPACE_CASES:
1889 OK = TRUE;
1890 break;
1891
1892 default:
1893 OK = FALSE;
1894 break;
1895 }
1896
1897 if (OK == (d == OP_HSPACE))
1898 {
1899 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1900 codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1901 {
1902 active_count--; /* Remove non-match possibility */
1903 next_active_state--;
1904 }
1905 ADD_NEW_DATA(-(state_offset + (int)count), 0, 0);
1906 }
1907 }
1908 break;
1909
1910 /*-----------------------------------------------------------------*/
1911 #ifdef SUPPORT_UNICODE
1912 case OP_PROP_EXTRA + OP_TYPEEXACT:
1913 case OP_PROP_EXTRA + OP_TYPEUPTO:
1914 case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1915 case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1916 if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1917 { ADD_ACTIVE(state_offset + 1 + IMM2_SIZE + 3, 0); }
1918 count = current_state->count; /* Number already matched */
1919 if (clen > 0)
1920 {
1921 BOOL OK;
1922 const uint32_t *cp;
1923 const ucd_record * prop = GET_UCD(c);
1924 switch(code[1 + IMM2_SIZE + 1])
1925 {
1926 case PT_ANY:
1927 OK = TRUE;
1928 break;
1929
1930 case PT_LAMP:
1931 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1932 prop->chartype == ucp_Lt;
1933 break;
1934
1935 case PT_GC:
1936 OK = PRIV(ucp_gentype)[prop->chartype] == code[1 + IMM2_SIZE + 2];
1937 break;
1938
1939 case PT_PC:
1940 OK = prop->chartype == code[1 + IMM2_SIZE + 2];
1941 break;
1942
1943 case PT_SC:
1944 OK = prop->script == code[1 + IMM2_SIZE + 2];
1945 break;
1946
1947 /* These are specials for combination cases. */
1948
1949 case PT_ALNUM:
1950 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1951 PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1952 break;
1953
1954 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1955 which means that Perl space and POSIX space are now identical. PCRE
1956 was changed at release 8.34. */
1957
1958 case PT_SPACE: /* Perl space */
1959 case PT_PXSPACE: /* POSIX space */
1960 switch(c)
1961 {
1962 HSPACE_CASES:
1963 VSPACE_CASES:
1964 OK = TRUE;
1965 break;
1966
1967 default:
1968 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1969 break;
1970 }
1971 break;
1972
1973 case PT_WORD:
1974 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1975 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1976 c == CHAR_UNDERSCORE;
1977 break;
1978
1979 case PT_CLIST:
1980 cp = PRIV(ucd_caseless_sets) + code[1 + IMM2_SIZE + 2];
1981 for (;;)
1982 {
1983 if (c < *cp) { OK = FALSE; break; }
1984 if (c == *cp++) { OK = TRUE; break; }
1985 }
1986 break;
1987
1988 case PT_UCNC:
1989 OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1990 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1991 c >= 0xe000;
1992 break;
1993
1994 /* Should never occur, but keep compilers from grumbling. */
1995
1996 default:
1997 OK = codevalue != OP_PROP;
1998 break;
1999 }
2000
2001 if (OK == (d == OP_PROP))
2002 {
2003 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
2004 {
2005 active_count--; /* Remove non-match possibility */
2006 next_active_state--;
2007 }
2008 if (++count >= (int)GET2(code, 1))
2009 { ADD_NEW(state_offset + 1 + IMM2_SIZE + 3, 0); }
2010 else
2011 { ADD_NEW(state_offset, count); }
2012 }
2013 }
2014 break;
2015
2016 /*-----------------------------------------------------------------*/
2017 case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
2018 case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
2019 case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
2020 case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
2021 if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
2022 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2023 count = current_state->count; /* Number already matched */
2024 if (clen > 0)
2025 {
2026 PCRE2_SPTR nptr;
2027 int ncount = 0;
2028 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
2029 {
2030 active_count--; /* Remove non-match possibility */
2031 next_active_state--;
2032 }
2033 nptr = PRIV(extuni)(c, ptr + clen, mb->start_subject, end_subject, utf,
2034 &ncount);
2035 if (nptr >= end_subject && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
2036 reset_could_continue = TRUE;
2037 if (++count >= (int)GET2(code, 1))
2038 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
2039 else
2040 { ADD_NEW_DATA(-state_offset, count, ncount); }
2041 }
2042 break;
2043 #endif
2044
2045 /*-----------------------------------------------------------------*/
2046 case OP_ANYNL_EXTRA + OP_TYPEEXACT:
2047 case OP_ANYNL_EXTRA + OP_TYPEUPTO:
2048 case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
2049 case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
2050 if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
2051 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2052 count = current_state->count; /* Number already matched */
2053 if (clen > 0)
2054 {
2055 int ncount = 0;
2056 switch (c)
2057 {
2058 case CHAR_VT:
2059 case CHAR_FF:
2060 case CHAR_NEL:
2061 #ifndef EBCDIC
2062 case 0x2028:
2063 case 0x2029:
2064 #endif /* Not EBCDIC */
2065 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
2066 goto ANYNL03;
2067
2068 case CHAR_CR:
2069 if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
2070 /* Fall through */
2071
2072 ANYNL03:
2073 case CHAR_LF:
2074 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
2075 {
2076 active_count--; /* Remove non-match possibility */
2077 next_active_state--;
2078 }
2079 if (++count >= (int)GET2(code, 1))
2080 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
2081 else
2082 { ADD_NEW_DATA(-state_offset, count, ncount); }
2083 break;
2084
2085 default:
2086 break;
2087 }
2088 }
2089 break;
2090
2091 /*-----------------------------------------------------------------*/
2092 case OP_VSPACE_EXTRA + OP_TYPEEXACT:
2093 case OP_VSPACE_EXTRA + OP_TYPEUPTO:
2094 case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
2095 case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
2096 if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
2097 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2098 count = current_state->count; /* Number already matched */
2099 if (clen > 0)
2100 {
2101 BOOL OK;
2102 switch (c)
2103 {
2104 VSPACE_CASES:
2105 OK = TRUE;
2106 break;
2107
2108 default:
2109 OK = FALSE;
2110 }
2111
2112 if (OK == (d == OP_VSPACE))
2113 {
2114 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
2115 {
2116 active_count--; /* Remove non-match possibility */
2117 next_active_state--;
2118 }
2119 if (++count >= (int)GET2(code, 1))
2120 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2121 else
2122 { ADD_NEW_DATA(-state_offset, count, 0); }
2123 }
2124 }
2125 break;
2126
2127 /*-----------------------------------------------------------------*/
2128 case OP_HSPACE_EXTRA + OP_TYPEEXACT:
2129 case OP_HSPACE_EXTRA + OP_TYPEUPTO:
2130 case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
2131 case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
2132 if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
2133 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2134 count = current_state->count; /* Number already matched */
2135 if (clen > 0)
2136 {
2137 BOOL OK;
2138 switch (c)
2139 {
2140 HSPACE_CASES:
2141 OK = TRUE;
2142 break;
2143
2144 default:
2145 OK = FALSE;
2146 break;
2147 }
2148
2149 if (OK == (d == OP_HSPACE))
2150 {
2151 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
2152 {
2153 active_count--; /* Remove non-match possibility */
2154 next_active_state--;
2155 }
2156 if (++count >= (int)GET2(code, 1))
2157 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2158 else
2159 { ADD_NEW_DATA(-state_offset, count, 0); }
2160 }
2161 }
2162 break;
2163
2164 /* ========================================================================== */
2165 /* These opcodes are followed by a character that is usually compared
2166 to the current subject character; it is loaded into d. We still get
2167 here even if there is no subject character, because in some cases zero
2168 repetitions are permitted. */
2169
2170 /*-----------------------------------------------------------------*/
2171 case OP_CHAR:
2172 if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
2173 break;
2174
2175 /*-----------------------------------------------------------------*/
2176 case OP_CHARI:
2177 if (clen == 0) break;
2178
2179 #ifdef SUPPORT_UNICODE
2180 if (utf)
2181 {
2182 if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
2183 {
2184 unsigned int othercase;
2185 if (c < 128)
2186 othercase = fcc[c];
2187 else
2188 othercase = UCD_OTHERCASE(c);
2189 if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
2190 }
2191 }
2192 else
2193 #endif /* SUPPORT_UNICODE */
2194 /* Not UTF mode */
2195 {
2196 if (TABLE_GET(c, lcc, c) == TABLE_GET(d, lcc, d))
2197 { ADD_NEW(state_offset + 2, 0); }
2198 }
2199 break;
2200
2201
2202 #ifdef SUPPORT_UNICODE
2203 /*-----------------------------------------------------------------*/
2204 /* This is a tricky one because it can match more than one character.
2205 Find out how many characters to skip, and then set up a negative state
2206 to wait for them to pass before continuing. */
2207
2208 case OP_EXTUNI:
2209 if (clen > 0)
2210 {
2211 int ncount = 0;
2212 PCRE2_SPTR nptr = PRIV(extuni)(c, ptr + clen, mb->start_subject,
2213 end_subject, utf, &ncount);
2214 if (nptr >= end_subject && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
2215 reset_could_continue = TRUE;
2216 ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
2217 }
2218 break;
2219 #endif
2220
2221 /*-----------------------------------------------------------------*/
2222 /* This is tricky like EXTUNI because it too can match more than one
2223 character (when CR is followed by LF). In this case, set up a negative
2224 state to wait for one character to pass before continuing. */
2225
2226 case OP_ANYNL:
2227 if (clen > 0) switch(c)
2228 {
2229 case CHAR_VT:
2230 case CHAR_FF:
2231 case CHAR_NEL:
2232 #ifndef EBCDIC
2233 case 0x2028:
2234 case 0x2029:
2235 #endif /* Not EBCDIC */
2236 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
2237 /* Fall through */
2238
2239 case CHAR_LF:
2240 ADD_NEW(state_offset + 1, 0);
2241 break;
2242
2243 case CHAR_CR:
2244 if (ptr + 1 >= end_subject)
2245 {
2246 ADD_NEW(state_offset + 1, 0);
2247 if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
2248 reset_could_continue = TRUE;
2249 }
2250 else if (UCHAR21TEST(ptr + 1) == CHAR_LF)
2251 {
2252 ADD_NEW_DATA(-(state_offset + 1), 0, 1);
2253 }
2254 else
2255 {
2256 ADD_NEW(state_offset + 1, 0);
2257 }
2258 break;
2259 }
2260 break;
2261
2262 /*-----------------------------------------------------------------*/
2263 case OP_NOT_VSPACE:
2264 if (clen > 0) switch(c)
2265 {
2266 VSPACE_CASES:
2267 break;
2268
2269 default:
2270 ADD_NEW(state_offset + 1, 0);
2271 break;
2272 }
2273 break;
2274
2275 /*-----------------------------------------------------------------*/
2276 case OP_VSPACE:
2277 if (clen > 0) switch(c)
2278 {
2279 VSPACE_CASES:
2280 ADD_NEW(state_offset + 1, 0);
2281 break;
2282
2283 default:
2284 break;
2285 }
2286 break;
2287
2288 /*-----------------------------------------------------------------*/
2289 case OP_NOT_HSPACE:
2290 if (clen > 0) switch(c)
2291 {
2292 HSPACE_CASES:
2293 break;
2294
2295 default:
2296 ADD_NEW(state_offset + 1, 0);
2297 break;
2298 }
2299 break;
2300
2301 /*-----------------------------------------------------------------*/
2302 case OP_HSPACE:
2303 if (clen > 0) switch(c)
2304 {
2305 HSPACE_CASES:
2306 ADD_NEW(state_offset + 1, 0);
2307 break;
2308
2309 default:
2310 break;
2311 }
2312 break;
2313
2314 /*-----------------------------------------------------------------*/
2315 /* Match a negated single character casefully. */
2316
2317 case OP_NOT:
2318 if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); }
2319 break;
2320
2321 /*-----------------------------------------------------------------*/
2322 /* Match a negated single character caselessly. */
2323
2324 case OP_NOTI:
2325 if (clen > 0)
2326 {
2327 uint32_t otherd;
2328 #ifdef SUPPORT_UNICODE
2329 if (utf && d >= 128)
2330 otherd = UCD_OTHERCASE(d);
2331 else
2332 #endif /* SUPPORT_UNICODE */
2333 otherd = TABLE_GET(d, fcc, d);
2334 if (c != d && c != otherd)
2335 { ADD_NEW(state_offset + dlen + 1, 0); }
2336 }
2337 break;
2338
2339 /*-----------------------------------------------------------------*/
2340 case OP_PLUSI:
2341 case OP_MINPLUSI:
2342 case OP_POSPLUSI:
2343 case OP_NOTPLUSI:
2344 case OP_NOTMINPLUSI:
2345 case OP_NOTPOSPLUSI:
2346 caseless = TRUE;
2347 codevalue -= OP_STARI - OP_STAR;
2348
2349 /* Fall through */
2350 case OP_PLUS:
2351 case OP_MINPLUS:
2352 case OP_POSPLUS:
2353 case OP_NOTPLUS:
2354 case OP_NOTMINPLUS:
2355 case OP_NOTPOSPLUS:
2356 count = current_state->count; /* Already matched */
2357 if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
2358 if (clen > 0)
2359 {
2360 uint32_t otherd = NOTACHAR;
2361 if (caseless)
2362 {
2363 #ifdef SUPPORT_UNICODE
2364 if (utf && d >= 128)
2365 otherd = UCD_OTHERCASE(d);
2366 else
2367 #endif /* SUPPORT_UNICODE */
2368 otherd = TABLE_GET(d, fcc, d);
2369 }
2370 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2371 {
2372 if (count > 0 &&
2373 (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
2374 {
2375 active_count--; /* Remove non-match possibility */
2376 next_active_state--;
2377 }
2378 count++;
2379 ADD_NEW(state_offset, count);
2380 }
2381 }
2382 break;
2383
2384 /*-----------------------------------------------------------------*/
2385 case OP_QUERYI:
2386 case OP_MINQUERYI:
2387 case OP_POSQUERYI:
2388 case OP_NOTQUERYI:
2389 case OP_NOTMINQUERYI:
2390 case OP_NOTPOSQUERYI:
2391 caseless = TRUE;
2392 codevalue -= OP_STARI - OP_STAR;
2393 /* Fall through */
2394 case OP_QUERY:
2395 case OP_MINQUERY:
2396 case OP_POSQUERY:
2397 case OP_NOTQUERY:
2398 case OP_NOTMINQUERY:
2399 case OP_NOTPOSQUERY:
2400 ADD_ACTIVE(state_offset + dlen + 1, 0);
2401 if (clen > 0)
2402 {
2403 uint32_t otherd = NOTACHAR;
2404 if (caseless)
2405 {
2406 #ifdef SUPPORT_UNICODE
2407 if (utf && d >= 128)
2408 otherd = UCD_OTHERCASE(d);
2409 else
2410 #endif /* SUPPORT_UNICODE */
2411 otherd = TABLE_GET(d, fcc, d);
2412 }
2413 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2414 {
2415 if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
2416 {
2417 active_count--; /* Remove non-match possibility */
2418 next_active_state--;
2419 }
2420 ADD_NEW(state_offset + dlen + 1, 0);
2421 }
2422 }
2423 break;
2424
2425 /*-----------------------------------------------------------------*/
2426 case OP_STARI:
2427 case OP_MINSTARI:
2428 case OP_POSSTARI:
2429 case OP_NOTSTARI:
2430 case OP_NOTMINSTARI:
2431 case OP_NOTPOSSTARI:
2432 caseless = TRUE;
2433 codevalue -= OP_STARI - OP_STAR;
2434 /* Fall through */
2435 case OP_STAR:
2436 case OP_MINSTAR:
2437 case OP_POSSTAR:
2438 case OP_NOTSTAR:
2439 case OP_NOTMINSTAR:
2440 case OP_NOTPOSSTAR:
2441 ADD_ACTIVE(state_offset + dlen + 1, 0);
2442 if (clen > 0)
2443 {
2444 uint32_t otherd = NOTACHAR;
2445 if (caseless)
2446 {
2447 #ifdef SUPPORT_UNICODE
2448 if (utf && d >= 128)
2449 otherd = UCD_OTHERCASE(d);
2450 else
2451 #endif /* SUPPORT_UNICODE */
2452 otherd = TABLE_GET(d, fcc, d);
2453 }
2454 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2455 {
2456 if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
2457 {
2458 active_count--; /* Remove non-match possibility */
2459 next_active_state--;
2460 }
2461 ADD_NEW(state_offset, 0);
2462 }
2463 }
2464 break;
2465
2466 /*-----------------------------------------------------------------*/
2467 case OP_EXACTI:
2468 case OP_NOTEXACTI:
2469 caseless = TRUE;
2470 codevalue -= OP_STARI - OP_STAR;
2471 /* Fall through */
2472 case OP_EXACT:
2473 case OP_NOTEXACT:
2474 count = current_state->count; /* Number already matched */
2475 if (clen > 0)
2476 {
2477 uint32_t otherd = NOTACHAR;
2478 if (caseless)
2479 {
2480 #ifdef SUPPORT_UNICODE
2481 if (utf && d >= 128)
2482 otherd = UCD_OTHERCASE(d);
2483 else
2484 #endif /* SUPPORT_UNICODE */
2485 otherd = TABLE_GET(d, fcc, d);
2486 }
2487 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2488 {
2489 if (++count >= (int)GET2(code, 1))
2490 { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2491 else
2492 { ADD_NEW(state_offset, count); }
2493 }
2494 }
2495 break;
2496
2497 /*-----------------------------------------------------------------*/
2498 case OP_UPTOI:
2499 case OP_MINUPTOI:
2500 case OP_POSUPTOI:
2501 case OP_NOTUPTOI:
2502 case OP_NOTMINUPTOI:
2503 case OP_NOTPOSUPTOI:
2504 caseless = TRUE;
2505 codevalue -= OP_STARI - OP_STAR;
2506 /* Fall through */
2507 case OP_UPTO:
2508 case OP_MINUPTO:
2509 case OP_POSUPTO:
2510 case OP_NOTUPTO:
2511 case OP_NOTMINUPTO:
2512 case OP_NOTPOSUPTO:
2513 ADD_ACTIVE(state_offset + dlen + 1 + IMM2_SIZE, 0);
2514 count = current_state->count; /* Number already matched */
2515 if (clen > 0)
2516 {
2517 uint32_t otherd = NOTACHAR;
2518 if (caseless)
2519 {
2520 #ifdef SUPPORT_UNICODE
2521 if (utf && d >= 128)
2522 otherd = UCD_OTHERCASE(d);
2523 else
2524 #endif /* SUPPORT_UNICODE */
2525 otherd = TABLE_GET(d, fcc, d);
2526 }
2527 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2528 {
2529 if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2530 {
2531 active_count--; /* Remove non-match possibility */
2532 next_active_state--;
2533 }
2534 if (++count >= (int)GET2(code, 1))
2535 { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2536 else
2537 { ADD_NEW(state_offset, count); }
2538 }
2539 }
2540 break;
2541
2542
2543 /* ========================================================================== */
2544 /* These are the class-handling opcodes */
2545
2546 case OP_CLASS:
2547 case OP_NCLASS:
2548 case OP_XCLASS:
2549 {
2550 BOOL isinclass = FALSE;
2551 int next_state_offset;
2552 PCRE2_SPTR ecode;
2553
2554 /* For a simple class, there is always just a 32-byte table, and we
2555 can set isinclass from it. */
2556
2557 if (codevalue != OP_XCLASS)
2558 {
2559 ecode = code + 1 + (32 / sizeof(PCRE2_UCHAR));
2560 if (clen > 0)
2561 {
2562 isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2563 ((((uint8_t *)(code + 1))[c/8] & (1 << (c&7))) != 0);
2564 }
2565 }
2566
2567 /* An extended class may have a table or a list of single characters,
2568 ranges, or both, and it may be positive or negative. There's a
2569 function that sorts all this out. */
2570
2571 else
2572 {
2573 ecode = code + GET(code, 1);
2574 if (clen > 0) isinclass = PRIV(xclass)(c, code + 1 + LINK_SIZE, utf);
2575 }
2576
2577 /* At this point, isinclass is set for all kinds of class, and ecode
2578 points to the byte after the end of the class. If there is a
2579 quantifier, this is where it will be. */
2580
2581 next_state_offset = (int)(ecode - start_code);
2582
2583 switch (*ecode)
2584 {
2585 case OP_CRSTAR:
2586 case OP_CRMINSTAR:
2587 case OP_CRPOSSTAR:
2588 ADD_ACTIVE(next_state_offset + 1, 0);
2589 if (isinclass)
2590 {
2591 if (*ecode == OP_CRPOSSTAR)
2592 {
2593 active_count--; /* Remove non-match possibility */
2594 next_active_state--;
2595 }
2596 ADD_NEW(state_offset, 0);
2597 }
2598 break;
2599
2600 case OP_CRPLUS:
2601 case OP_CRMINPLUS:
2602 case OP_CRPOSPLUS:
2603 count = current_state->count; /* Already matched */
2604 if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2605 if (isinclass)
2606 {
2607 if (count > 0 && *ecode == OP_CRPOSPLUS)
2608 {
2609 active_count--; /* Remove non-match possibility */
2610 next_active_state--;
2611 }
2612 count++;
2613 ADD_NEW(state_offset, count);
2614 }
2615 break;
2616
2617 case OP_CRQUERY:
2618 case OP_CRMINQUERY:
2619 case OP_CRPOSQUERY:
2620 ADD_ACTIVE(next_state_offset + 1, 0);
2621 if (isinclass)
2622 {
2623 if (*ecode == OP_CRPOSQUERY)
2624 {
2625 active_count--; /* Remove non-match possibility */
2626 next_active_state--;
2627 }
2628 ADD_NEW(next_state_offset + 1, 0);
2629 }
2630 break;
2631
2632 case OP_CRRANGE:
2633 case OP_CRMINRANGE:
2634 case OP_CRPOSRANGE:
2635 count = current_state->count; /* Already matched */
2636 if (count >= (int)GET2(ecode, 1))
2637 { ADD_ACTIVE(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2638 if (isinclass)
2639 {
2640 int max = (int)GET2(ecode, 1 + IMM2_SIZE);
2641
2642 if (*ecode == OP_CRPOSRANGE && count >= (int)GET2(ecode, 1))
2643 {
2644 active_count--; /* Remove non-match possibility */
2645 next_active_state--;
2646 }
2647
2648 if (++count >= max && max != 0) /* Max 0 => no limit */
2649 { ADD_NEW(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2650 else
2651 { ADD_NEW(state_offset, count); }
2652 }
2653 break;
2654
2655 default:
2656 if (isinclass) { ADD_NEW(next_state_offset, 0); }
2657 break;
2658 }
2659 }
2660 break;
2661
2662 /* ========================================================================== */
2663 /* These are the opcodes for fancy brackets of various kinds. We have
2664 to use recursion in order to handle them. The "always failing" assertion
2665 (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2666 though the other "backtracking verbs" are not supported. */
2667
2668 case OP_FAIL:
2669 forced_fail++; /* Count FAILs for multiple states */
2670 break;
2671
2672 case OP_ASSERT:
2673 case OP_ASSERT_NOT:
2674 case OP_ASSERTBACK:
2675 case OP_ASSERTBACK_NOT:
2676 {
2677 int rc;
2678 int *local_workspace;
2679 PCRE2_SIZE *local_offsets;
2680 PCRE2_SPTR endasscode = code + GET(code, 1);
2681 RWS_anchor *rws = (RWS_anchor *)RWS;
2682
2683 if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE)
2684 {
2685 rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb);
2686 if (rc != 0) return rc;
2687 RWS = (int *)rws;
2688 }
2689
2690 local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
2691 local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE;
2692 rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE;
2693
2694 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2695
2696 rc = internal_dfa_match(
2697 mb, /* static match data */
2698 code, /* this subexpression's code */
2699 ptr, /* where we currently are */
2700 (PCRE2_SIZE)(ptr - start_subject), /* start offset */
2701 local_offsets, /* offset vector */
2702 RWS_OVEC_OSIZE/OVEC_UNIT, /* size of same */
2703 local_workspace, /* workspace vector */
2704 RWS_RSIZE, /* size of same */
2705 rlevel, /* function recursion level */
2706 RWS); /* recursion workspace */
2707
2708 rws->free += RWS_RSIZE + RWS_OVEC_OSIZE;
2709
2710 if (rc < 0 && rc != PCRE2_ERROR_NOMATCH) return rc;
2711 if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2712 { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2713 }
2714 break;
2715
2716 /*-----------------------------------------------------------------*/
2717 case OP_COND:
2718 case OP_SCOND:
2719 {
2720 int codelink = (int)GET(code, 1);
2721 PCRE2_UCHAR condcode;
2722
2723 /* Because of the way auto-callout works during compile, a callout item
2724 is inserted between OP_COND and an assertion condition. This does not
2725 happen for the other conditions. */
2726
2727 if (code[LINK_SIZE + 1] == OP_CALLOUT
2728 || code[LINK_SIZE + 1] == OP_CALLOUT_STR)
2729 {
2730 PCRE2_SIZE callout_length;
2731 rrc = do_callout(code, offsets, current_subject, ptr, mb,
2732 1 + LINK_SIZE, &callout_length);
2733 if (rrc < 0) return rrc; /* Abandon */
2734 if (rrc > 0) break; /* Fail this thread */
2735 code += callout_length; /* Skip callout data */
2736 }
2737
2738 condcode = code[LINK_SIZE+1];
2739
2740 /* Back reference conditions and duplicate named recursion conditions
2741 are not supported */
2742
2743 if (condcode == OP_CREF || condcode == OP_DNCREF ||
2744 condcode == OP_DNRREF)
2745 return PCRE2_ERROR_DFA_UCOND;
2746
2747 /* The DEFINE condition is always false, and the assertion (?!) is
2748 converted to OP_FAIL. */
2749
2750 if (condcode == OP_FALSE || condcode == OP_FAIL)
2751 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2752
2753 /* There is also an always-true condition */
2754
2755 else if (condcode == OP_TRUE)
2756 { ADD_ACTIVE(state_offset + LINK_SIZE + 2 + IMM2_SIZE, 0); }
2757
2758 /* The only supported version of OP_RREF is for the value RREF_ANY,
2759 which means "test if in any recursion". We can't test for specifically
2760 recursed groups. */
2761
2762 else if (condcode == OP_RREF)
2763 {
2764 unsigned int value = GET2(code, LINK_SIZE + 2);
2765 if (value != RREF_ANY) return PCRE2_ERROR_DFA_UCOND;
2766 if (mb->recursive != NULL)
2767 { ADD_ACTIVE(state_offset + LINK_SIZE + 2 + IMM2_SIZE, 0); }
2768 else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2769 }
2770
2771 /* Otherwise, the condition is an assertion */
2772
2773 else
2774 {
2775 int rc;
2776 int *local_workspace;
2777 PCRE2_SIZE *local_offsets;
2778 PCRE2_SPTR asscode = code + LINK_SIZE + 1;
2779 PCRE2_SPTR endasscode = asscode + GET(asscode, 1);
2780 RWS_anchor *rws = (RWS_anchor *)RWS;
2781
2782 if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE)
2783 {
2784 rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb);
2785 if (rc != 0) return rc;
2786 RWS = (int *)rws;
2787 }
2788
2789 local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
2790 local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE;
2791 rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE;
2792
2793 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2794
2795 rc = internal_dfa_match(
2796 mb, /* fixed match data */
2797 asscode, /* this subexpression's code */
2798 ptr, /* where we currently are */
2799 (PCRE2_SIZE)(ptr - start_subject), /* start offset */
2800 local_offsets, /* offset vector */
2801 RWS_OVEC_OSIZE/OVEC_UNIT, /* size of same */
2802 local_workspace, /* workspace vector */
2803 RWS_RSIZE, /* size of same */
2804 rlevel, /* function recursion level */
2805 RWS); /* recursion workspace */
2806
2807 rws->free += RWS_RSIZE + RWS_OVEC_OSIZE;
2808
2809 if (rc < 0 && rc != PCRE2_ERROR_NOMATCH) return rc;
2810 if ((rc >= 0) ==
2811 (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2812 { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2813 else
2814 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2815 }
2816 }
2817 break;
2818
2819 /*-----------------------------------------------------------------*/
2820 case OP_RECURSE:
2821 {
2822 int rc;
2823 int *local_workspace;
2824 PCRE2_SIZE *local_offsets;
2825 RWS_anchor *rws = (RWS_anchor *)RWS;
2826 dfa_recursion_info *ri;
2827 PCRE2_SPTR callpat = start_code + GET(code, 1);
2828 uint32_t recno = (callpat == mb->start_code)? 0 :
2829 GET2(callpat, 1 + LINK_SIZE);
2830
2831 if (rws->free < RWS_RSIZE + RWS_OVEC_RSIZE)
2832 {
2833 rc = more_workspace(&rws, RWS_OVEC_RSIZE, mb);
2834 if (rc != 0) return rc;
2835 RWS = (int *)rws;
2836 }
2837
2838 local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
2839 local_workspace = ((int *)local_offsets) + RWS_OVEC_RSIZE;
2840 rws->free -= RWS_RSIZE + RWS_OVEC_RSIZE;
2841
2842 /* Check for repeating a recursion without advancing the subject
2843 pointer. This should catch convoluted mutual recursions. (Some simple
2844 cases are caught at compile time.) */
2845
2846 for (ri = mb->recursive; ri != NULL; ri = ri->prevrec)
2847 if (recno == ri->group_num && ptr == ri->subject_position)
2848 return PCRE2_ERROR_RECURSELOOP;
2849
2850 /* Remember this recursion and where we started it so as to
2851 catch infinite loops. */
2852
2853 new_recursive.group_num = recno;
2854 new_recursive.subject_position = ptr;
2855 new_recursive.prevrec = mb->recursive;
2856 mb->recursive = &new_recursive;
2857
2858 rc = internal_dfa_match(
2859 mb, /* fixed match data */
2860 callpat, /* this subexpression's code */
2861 ptr, /* where we currently are */
2862 (PCRE2_SIZE)(ptr - start_subject), /* start offset */
2863 local_offsets, /* offset vector */
2864 RWS_OVEC_RSIZE/OVEC_UNIT, /* size of same */
2865 local_workspace, /* workspace vector */
2866 RWS_RSIZE, /* size of same */
2867 rlevel, /* function recursion level */
2868 RWS); /* recursion workspace */
2869
2870 rws->free += RWS_RSIZE + RWS_OVEC_RSIZE;
2871 mb->recursive = new_recursive.prevrec; /* Done this recursion */
2872
2873 /* Ran out of internal offsets */
2874
2875 if (rc == 0) return PCRE2_ERROR_DFA_RECURSE;
2876
2877 /* For each successful matched substring, set up the next state with a
2878 count of characters to skip before trying it. Note that the count is in
2879 characters, not bytes. */
2880
2881 if (rc > 0)
2882 {
2883 for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2884 {
2885 PCRE2_SIZE charcount = local_offsets[rc+1] - local_offsets[rc];
2886 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
2887 if (utf)
2888 {
2889 PCRE2_SPTR p = start_subject + local_offsets[rc];
2890 PCRE2_SPTR pp = start_subject + local_offsets[rc+1];
2891 while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--;
2892 }
2893 #endif
2894 if (charcount > 0)
2895 {
2896 ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0,
2897 (int)(charcount - 1));
2898 }
2899 else
2900 {
2901 ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2902 }
2903 }
2904 }
2905 else if (rc != PCRE2_ERROR_NOMATCH) return rc;
2906 }
2907 break;
2908
2909 /*-----------------------------------------------------------------*/
2910 case OP_BRAPOS:
2911 case OP_SBRAPOS:
2912 case OP_CBRAPOS:
2913 case OP_SCBRAPOS:
2914 case OP_BRAPOSZERO:
2915 {
2916 int rc;
2917 int *local_workspace;
2918 PCRE2_SIZE *local_offsets;
2919 PCRE2_SIZE charcount, matched_count;
2920 PCRE2_SPTR local_ptr = ptr;
2921 RWS_anchor *rws = (RWS_anchor *)RWS;
2922 BOOL allow_zero;
2923
2924 if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE)
2925 {
2926 rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb);
2927 if (rc != 0) return rc;
2928 RWS = (int *)rws;
2929 }
2930
2931 local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
2932 local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE;
2933 rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE;
2934
2935 if (codevalue == OP_BRAPOSZERO)
2936 {
2937 allow_zero = TRUE;
2938 codevalue = *(++code); /* Codevalue will be one of above BRAs */
2939 }
2940 else allow_zero = FALSE;
2941
2942 /* Loop to match the subpattern as many times as possible as if it were
2943 a complete pattern. */
2944
2945 for (matched_count = 0;; matched_count++)
2946 {
2947 rc = internal_dfa_match(
2948 mb, /* fixed match data */
2949 code, /* this subexpression's code */
2950 local_ptr, /* where we currently are */
2951 (PCRE2_SIZE)(ptr - start_subject), /* start offset */
2952 local_offsets, /* offset vector */
2953 RWS_OVEC_OSIZE/OVEC_UNIT, /* size of same */
2954 local_workspace, /* workspace vector */
2955 RWS_RSIZE, /* size of same */
2956 rlevel, /* function recursion level */
2957 RWS); /* recursion workspace */
2958
2959 /* Failed to match */
2960
2961 if (rc < 0)
2962 {
2963 if (rc != PCRE2_ERROR_NOMATCH) return rc;
2964 break;
2965 }
2966
2967 /* Matched: break the loop if zero characters matched. */
2968
2969 charcount = local_offsets[1] - local_offsets[0];
2970 if (charcount == 0) break;
2971 local_ptr += charcount; /* Advance temporary position ptr */
2972 }
2973
2974 rws->free += RWS_RSIZE + RWS_OVEC_OSIZE;
2975
2976 /* At this point we have matched the subpattern matched_count
2977 times, and local_ptr is pointing to the character after the end of the
2978 last match. */
2979
2980 if (matched_count > 0 || allow_zero)
2981 {
2982 PCRE2_SPTR end_subpattern = code;
2983 int next_state_offset;
2984
2985 do { end_subpattern += GET(end_subpattern, 1); }
2986 while (*end_subpattern == OP_ALT);
2987 next_state_offset =
2988 (int)(end_subpattern - start_code + LINK_SIZE + 1);
2989
2990 /* Optimization: if there are no more active states, and there
2991 are no new states yet set up, then skip over the subject string
2992 right here, to save looping. Otherwise, set up the new state to swing
2993 into action when the end of the matched substring is reached. */
2994
2995 if (i + 1 >= active_count && new_count == 0)
2996 {
2997 ptr = local_ptr;
2998 clen = 0;
2999 ADD_NEW(next_state_offset, 0);
3000 }
3001 else
3002 {
3003 PCRE2_SPTR p = ptr;
3004 PCRE2_SPTR pp = local_ptr;
3005 charcount = (PCRE2_SIZE)(pp - p);
3006 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
3007 if (utf) while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--;
3008 #endif
3009 ADD_NEW_DATA(-next_state_offset, 0, (int)(charcount - 1));
3010 }
3011 }
3012 }
3013 break;
3014
3015 /*-----------------------------------------------------------------*/
3016 case OP_ONCE:
3017 {
3018 int rc;
3019 int *local_workspace;
3020 PCRE2_SIZE *local_offsets;
3021 RWS_anchor *rws = (RWS_anchor *)RWS;
3022
3023 if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE)
3024 {
3025 rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb);
3026 if (rc != 0) return rc;
3027 RWS = (int *)rws;
3028 }
3029
3030 local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
3031 local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE;
3032 rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE;
3033
3034 rc = internal_dfa_match(
3035 mb, /* fixed match data */
3036 code, /* this subexpression's code */
3037 ptr, /* where we currently are */
3038 (PCRE2_SIZE)(ptr - start_subject), /* start offset */
3039 local_offsets, /* offset vector */
3040 RWS_OVEC_OSIZE/OVEC_UNIT, /* size of same */
3041 local_workspace, /* workspace vector */
3042 RWS_RSIZE, /* size of same */
3043 rlevel, /* function recursion level */
3044 RWS); /* recursion workspace */
3045
3046 rws->free += RWS_RSIZE + RWS_OVEC_OSIZE;
3047
3048 if (rc >= 0)
3049 {
3050 PCRE2_SPTR end_subpattern = code;
3051 PCRE2_SIZE charcount = local_offsets[1] - local_offsets[0];
3052 int next_state_offset, repeat_state_offset;
3053
3054 do { end_subpattern += GET(end_subpattern, 1); }
3055 while (*end_subpattern == OP_ALT);
3056 next_state_offset =
3057 (int)(end_subpattern - start_code + LINK_SIZE + 1);
3058
3059 /* If the end of this subpattern is KETRMAX or KETRMIN, we must
3060 arrange for the repeat state also to be added to the relevant list.
3061 Calculate the offset, or set -1 for no repeat. */
3062
3063 repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
3064 *end_subpattern == OP_KETRMIN)?
3065 (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1;
3066
3067 /* If we have matched an empty string, add the next state at the
3068 current character pointer. This is important so that the duplicate
3069 checking kicks in, which is what breaks infinite loops that match an
3070 empty string. */
3071
3072 if (charcount == 0)
3073 {
3074 ADD_ACTIVE(next_state_offset, 0);
3075 }
3076
3077 /* Optimization: if there are no more active states, and there
3078 are no new states yet set up, then skip over the subject string
3079 right here, to save looping. Otherwise, set up the new state to swing
3080 into action when the end of the matched substring is reached. */
3081
3082 else if (i + 1 >= active_count && new_count == 0)
3083 {
3084 ptr += charcount;
3085 clen = 0;
3086 ADD_NEW(next_state_offset, 0);
3087
3088 /* If we are adding a repeat state at the new character position,
3089 we must fudge things so that it is the only current state.
3090 Otherwise, it might be a duplicate of one we processed before, and
3091 that would cause it to be skipped. */
3092
3093 if (repeat_state_offset >= 0)
3094 {
3095 next_active_state = active_states;
3096 active_count = 0;
3097 i = -1;
3098 ADD_ACTIVE(repeat_state_offset, 0);
3099 }
3100 }
3101 else
3102 {
3103 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
3104 if (utf)
3105 {
3106 PCRE2_SPTR p = start_subject + local_offsets[0];
3107 PCRE2_SPTR pp = start_subject + local_offsets[1];
3108 while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--;
3109 }
3110 #endif
3111 ADD_NEW_DATA(-next_state_offset, 0, (int)(charcount - 1));
3112 if (repeat_state_offset >= 0)
3113 { ADD_NEW_DATA(-repeat_state_offset, 0, (int)(charcount - 1)); }
3114 }
3115 }
3116 else if (rc != PCRE2_ERROR_NOMATCH) return rc;
3117 }
3118 break;
3119
3120
3121 /* ========================================================================== */
3122 /* Handle callouts */
3123
3124 case OP_CALLOUT:
3125 case OP_CALLOUT_STR:
3126 {
3127 PCRE2_SIZE callout_length;
3128 rrc = do_callout(code, offsets, current_subject, ptr, mb, 0,
3129 &callout_length);
3130 if (rrc < 0) return rrc; /* Abandon */
3131 if (rrc == 0)
3132 { ADD_ACTIVE(state_offset + (int)callout_length, 0); }
3133 }
3134 break;
3135
3136
3137 /* ========================================================================== */
3138 default: /* Unsupported opcode */
3139 return PCRE2_ERROR_DFA_UITEM;
3140 }
3141
3142 NEXT_ACTIVE_STATE: continue;
3143
3144 } /* End of loop scanning active states */
3145
3146 /* We have finished the processing at the current subject character. If no
3147 new states have been set for the next character, we have found all the
3148 matches that we are going to find. If we are at the top level and partial
3149 matching has been requested, check for appropriate conditions.
3150
The "forced_fail" variable counts the number of (*F) encountered for the
3152 character. If it is equal to the original active_count (saved in
3153 workspace[1]) it means that (*F) was found on every active state. In this
3154 case we don't want to give a partial match.
3155
3156 The "could_continue" variable is true if a state could have continued but
3157 for the fact that the end of the subject was reached. */
3158
3159 if (new_count <= 0)
3160 {
3161 if (rlevel == 1 && /* Top level, and */
3162 could_continue && /* Some could go on, and */
3163 forced_fail != workspace[1] && /* Not all forced fail & */
3164 ( /* either... */
3165 (mb->moptions & PCRE2_PARTIAL_HARD) != 0 /* Hard partial */
3166 || /* or... */
3167 ((mb->moptions & PCRE2_PARTIAL_SOFT) != 0 && /* Soft partial and */
3168 match_count < 0) /* no matches */
3169 ) && /* And... */
3170 (
3171 partial_newline || /* Either partial NL */
3172 ( /* or ... */
3173 ptr >= end_subject && /* End of subject and */
3174 ptr > mb->start_used_ptr) /* Inspected non-empty string */
3175 )
3176 )
3177 match_count = PCRE2_ERROR_PARTIAL;
3178 break; /* Exit from loop along the subject string */
3179 }
3180
3181 /* One or more states are active for the next character. */
3182
3183 ptr += clen; /* Advance to next subject character */
3184 } /* Loop to move along the subject string */
3185
/* Control gets here from "break" a few lines above. If we have a match and
PCRE2_ENDANCHORED is set (in either the match or the pattern options), the
match fails if the scan stopped before the end of the subject. */
3188
3189 if (match_count >= 0 &&
3190 ((mb->moptions | mb->poptions) & PCRE2_ENDANCHORED) != 0 &&
3191 ptr < end_subject)
3192 match_count = PCRE2_ERROR_NOMATCH;
3193
3194 return match_count;
3195 }
3196
3197
3198
3199 /*************************************************
3200 * Match a pattern using the DFA algorithm *
3201 *************************************************/
3202
3203 /* This function matches a compiled pattern to a subject string, using the
3204 alternate matching algorithm that finds all matches at once.
3205
3206 Arguments:
3207 code points to the compiled pattern
3208 subject subject string
3209 length length of subject string
3210 startoffset where to start matching in the subject
3211 options option bits
3212 match_data points to a match data structure
3213 gcontext points to a match context
3214 workspace pointer to workspace
3215 wscount size of workspace
3216
3217 Returns: > 0 => number of match offset pairs placed in offsets
3218 = 0 => offsets overflowed; longest matches are present
3219 -1 => failed to match
3220 < -1 => some kind of unexpected problem
3221 */
3222
3223 PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
pcre2_dfa_match(const pcre2_code * code,PCRE2_SPTR subject,PCRE2_SIZE length,PCRE2_SIZE start_offset,uint32_t options,pcre2_match_data * match_data,pcre2_match_context * mcontext,int * workspace,PCRE2_SIZE wscount)3224 pcre2_dfa_match(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length,
3225 PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data,
3226 pcre2_match_context *mcontext, int *workspace, PCRE2_SIZE wscount)
3227 {
3228 int rc;
3229 const pcre2_real_code *re = (const pcre2_real_code *)code;
3230
3231 PCRE2_SPTR start_match;
3232 PCRE2_SPTR end_subject;
3233 PCRE2_SPTR bumpalong_limit;
3234 PCRE2_SPTR req_cu_ptr;
3235
3236 BOOL utf, anchored, startline, firstline;
3237 BOOL has_first_cu = FALSE;
3238 BOOL has_req_cu = FALSE;
3239
3240 PCRE2_UCHAR first_cu = 0;
3241 PCRE2_UCHAR first_cu2 = 0;
3242 PCRE2_UCHAR req_cu = 0;
3243 PCRE2_UCHAR req_cu2 = 0;
3244
3245 const uint8_t *start_bits = NULL;
3246
3247 /* We need to have mb pointing to a match block, because the IS_NEWLINE macro
3248 is used below, and it expects NLBLOCK to be defined as a pointer. */
3249
3250 pcre2_callout_block cb;
3251 dfa_match_block actual_match_block;
3252 dfa_match_block *mb = &actual_match_block;
3253
3254 /* Set up a starting block of memory for use during recursive calls to
3255 internal_dfa_match(). By putting this on the stack, it minimizes resource use
3256 in the case when it is not needed. If this is too small, more memory is
3257 obtained from the heap. At the start of each block is an anchor structure.*/
3258
3259 int base_recursion_workspace[RWS_BASE_SIZE];
3260 RWS_anchor *rws = (RWS_anchor *)base_recursion_workspace;
3261 rws->next = NULL;
3262 rws->size = RWS_BASE_SIZE;
3263 rws->free = RWS_BASE_SIZE - RWS_ANCHOR_SIZE;
3264
3265 /* A length equal to PCRE2_ZERO_TERMINATED implies a zero-terminated
3266 subject string. */
3267
3268 if (length == PCRE2_ZERO_TERMINATED) length = PRIV(strlen)(subject);
3269
3270 /* Plausibility checks */
3271
3272 if ((options & ~PUBLIC_DFA_MATCH_OPTIONS) != 0) return PCRE2_ERROR_BADOPTION;
3273 if (re == NULL || subject == NULL || workspace == NULL || match_data == NULL)
3274 return PCRE2_ERROR_NULL;
3275 if (wscount < 20) return PCRE2_ERROR_DFA_WSSIZE;
3276 if (start_offset > length) return PCRE2_ERROR_BADOFFSET;
3277
3278 /* Partial matching and PCRE2_ENDANCHORED are currently not allowed at the same
3279 time. */
3280
3281 if ((options & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0 &&
3282 ((re->overall_options | options) & PCRE2_ENDANCHORED) != 0)
3283 return PCRE2_ERROR_BADOPTION;
3284
3285 /* Check that the first field in the block is the magic number. If it is not,
3286 return with PCRE2_ERROR_BADMAGIC. */
3287
3288 if (re->magic_number != MAGIC_NUMBER) return PCRE2_ERROR_BADMAGIC;
3289
3290 /* Check the code unit width. */
3291
3292 if ((re->flags & PCRE2_MODE_MASK) != PCRE2_CODE_UNIT_WIDTH/8)
3293 return PCRE2_ERROR_BADMODE;
3294
3295 /* PCRE2_NOTEMPTY and PCRE2_NOTEMPTY_ATSTART are match-time flags in the
3296 options variable for this function. Users of PCRE2 who are not calling the
3297 function directly would like to have a way of setting these flags, in the same
3298 way that they can set pcre2_compile() flags like PCRE2_NO_AUTOPOSSESS with
3299 constructions like (*NO_AUTOPOSSESS). To enable this, (*NOTEMPTY) and
3300 (*NOTEMPTY_ATSTART) set bits in the pattern's "flag" function which can now be
3301 transferred to the options for this function. The bits are guaranteed to be
3302 adjacent, but do not have the same values. This bit of Boolean trickery assumes
3303 that the match-time bits are not more significant than the flag bits. If by
3304 accident this is not the case, a compile-time division by zero error will
3305 occur. */
3306
3307 #define FF (PCRE2_NOTEMPTY_SET|PCRE2_NE_ATST_SET)
3308 #define OO (PCRE2_NOTEMPTY|PCRE2_NOTEMPTY_ATSTART)
3309 options |= (re->flags & FF) / ((FF & (~FF+1)) / (OO & (~OO+1)));
3310 #undef FF
3311 #undef OO
3312
3313 /* If restarting after a partial match, do some sanity checks on the contents
3314 of the workspace. */
3315
3316 if ((options & PCRE2_DFA_RESTART) != 0)
3317 {
3318 if ((workspace[0] & (-2)) != 0 || workspace[1] < 1 ||
3319 workspace[1] > (int)((wscount - 2)/INTS_PER_STATEBLOCK))
3320 return PCRE2_ERROR_DFA_BADRESTART;
3321 }
3322
3323 /* Set some local values */
3324
3325 utf = (re->overall_options & PCRE2_UTF) != 0;
3326 start_match = subject + start_offset;
3327 end_subject = subject + length;
3328 req_cu_ptr = start_match - 1;
3329 anchored = (options & (PCRE2_ANCHORED|PCRE2_DFA_RESTART)) != 0 ||
3330 (re->overall_options & PCRE2_ANCHORED) != 0;
3331
3332 /* The "must be at the start of a line" flags are used in a loop when finding
3333 where to start. */
3334
3335 startline = (re->flags & PCRE2_STARTLINE) != 0;
3336 firstline = (re->overall_options & PCRE2_FIRSTLINE) != 0;
3337 bumpalong_limit = end_subject;
3338
3339 /* Initialize and set up the fixed fields in the callout block, with a pointer
3340 in the match block. */
3341
3342 mb->cb = &cb;
3343 cb.version = 2;
3344 cb.subject = subject;
3345 cb.subject_length = (PCRE2_SIZE)(end_subject - subject);
3346 cb.callout_flags = 0;
3347 cb.capture_top = 1; /* No capture support */
3348 cb.capture_last = 0;
3349 cb.mark = NULL; /* No (*MARK) support */
3350
3351 /* Get data from the match context, if present, and fill in the remaining
3352 fields in the match block. It is an error to set an offset limit without
3353 setting the flag at compile time. */
3354
3355 if (mcontext == NULL)
3356 {
3357 mb->callout = NULL;
3358 mb->memctl = re->memctl;
3359 mb->match_limit = PRIV(default_match_context).match_limit;
3360 mb->match_limit_depth = PRIV(default_match_context).depth_limit;
3361 mb->heap_limit = PRIV(default_match_context).heap_limit;
3362 }
3363 else
3364 {
3365 if (mcontext->offset_limit != PCRE2_UNSET)
3366 {
3367 if ((re->overall_options & PCRE2_USE_OFFSET_LIMIT) == 0)
3368 return PCRE2_ERROR_BADOFFSETLIMIT;
3369 bumpalong_limit = subject + mcontext->offset_limit;
3370 }
3371 mb->callout = mcontext->callout;
3372 mb->callout_data = mcontext->callout_data;
3373 mb->memctl = mcontext->memctl;
3374 mb->match_limit = mcontext->match_limit;
3375 mb->match_limit_depth = mcontext->depth_limit;
3376 mb->heap_limit = mcontext->heap_limit;
3377 }
3378
3379 if (mb->match_limit > re->limit_match)
3380 mb->match_limit = re->limit_match;
3381
3382 if (mb->match_limit_depth > re->limit_depth)
3383 mb->match_limit_depth = re->limit_depth;
3384
3385 if (mb->heap_limit > re->limit_heap)
3386 mb->heap_limit = re->limit_heap;
3387
3388 mb->start_code = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)) +
3389 re->name_count * re->name_entry_size;
3390 mb->tables = re->tables;
3391 mb->start_subject = subject;
3392 mb->end_subject = end_subject;
3393 mb->start_offset = start_offset;
3394 mb->moptions = options;
3395 mb->poptions = re->overall_options;
3396 mb->match_call_count = 0;
3397 mb->heap_used = 0;
3398
3399 /* Process the \R and newline settings. */
3400
3401 mb->bsr_convention = re->bsr_convention;
3402 mb->nltype = NLTYPE_FIXED;
3403 switch(re->newline_convention)
3404 {
3405 case PCRE2_NEWLINE_CR:
3406 mb->nllen = 1;
3407 mb->nl[0] = CHAR_CR;
3408 break;
3409
3410 case PCRE2_NEWLINE_LF:
3411 mb->nllen = 1;
3412 mb->nl[0] = CHAR_NL;
3413 break;
3414
3415 case PCRE2_NEWLINE_NUL:
3416 mb->nllen = 1;
3417 mb->nl[0] = CHAR_NUL;
3418 break;
3419
3420 case PCRE2_NEWLINE_CRLF:
3421 mb->nllen = 2;
3422 mb->nl[0] = CHAR_CR;
3423 mb->nl[1] = CHAR_NL;
3424 break;
3425
3426 case PCRE2_NEWLINE_ANY:
3427 mb->nltype = NLTYPE_ANY;
3428 break;
3429
3430 case PCRE2_NEWLINE_ANYCRLF:
3431 mb->nltype = NLTYPE_ANYCRLF;
3432 break;
3433
3434 default: return PCRE2_ERROR_INTERNAL;
3435 }
3436
3437 /* Check a UTF string for validity if required. For 8-bit and 16-bit strings,
3438 we must also check that a starting offset does not point into the middle of a
3439 multiunit character. We check only the portion of the subject that is going to
3440 be inspected during matching - from the offset minus the maximum back reference
3441 to the given length. This saves time when a small part of a large subject is
3442 being matched by the use of a starting offset. Note that the maximum lookbehind
3443 is a number of characters, not code units. */
3444
3445 #ifdef SUPPORT_UNICODE
3446 if (utf && (options & PCRE2_NO_UTF_CHECK) == 0)
3447 {
3448 PCRE2_SPTR check_subject = start_match; /* start_match includes offset */
3449
3450 if (start_offset > 0)
3451 {
3452 #if PCRE2_CODE_UNIT_WIDTH != 32
3453 unsigned int i;
3454 if (start_match < end_subject && NOT_FIRSTCU(*start_match))
3455 return PCRE2_ERROR_BADUTFOFFSET;
3456 for (i = re->max_lookbehind; i > 0 && check_subject > subject; i--)
3457 {
3458 check_subject--;
3459 while (check_subject > subject &&
3460 #if PCRE2_CODE_UNIT_WIDTH == 8
3461 (*check_subject & 0xc0) == 0x80)
3462 #else /* 16-bit */
3463 (*check_subject & 0xfc00) == 0xdc00)
3464 #endif /* PCRE2_CODE_UNIT_WIDTH == 8 */
3465 check_subject--;
3466 }
3467 #else /* In the 32-bit library, one code unit equals one character. */
3468 check_subject -= re->max_lookbehind;
3469 if (check_subject < subject) check_subject = subject;
3470 #endif /* PCRE2_CODE_UNIT_WIDTH != 32 */
3471 }
3472
3473 /* Validate the relevant portion of the subject. After an error, adjust the
3474 offset to be an absolute offset in the whole string. */
3475
3476 match_data->rc = PRIV(valid_utf)(check_subject,
3477 length - (PCRE2_SIZE)(check_subject - subject), &(match_data->startchar));
3478 if (match_data->rc != 0)
3479 {
3480 match_data->startchar += (PCRE2_SIZE)(check_subject - subject);
3481 return match_data->rc;
3482 }
3483 }
3484 #endif /* SUPPORT_UNICODE */
3485
3486 /* Set up the first code unit to match, if available. If there's no first code
3487 unit there may be a bitmap of possible first characters. */
3488
3489 if ((re->flags & PCRE2_FIRSTSET) != 0)
3490 {
3491 has_first_cu = TRUE;
3492 first_cu = first_cu2 = (PCRE2_UCHAR)(re->first_codeunit);
3493 if ((re->flags & PCRE2_FIRSTCASELESS) != 0)
3494 {
3495 first_cu2 = TABLE_GET(first_cu, mb->tables + fcc_offset, first_cu);
3496 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
3497 if (utf && first_cu > 127)
3498 first_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(first_cu);
3499 #endif
3500 }
3501 }
3502 else
3503 if (!startline && (re->flags & PCRE2_FIRSTMAPSET) != 0)
3504 start_bits = re->start_bitmap;
3505
3506 /* There may be a "last known required code unit" set. */
3507
3508 if ((re->flags & PCRE2_LASTSET) != 0)
3509 {
3510 has_req_cu = TRUE;
3511 req_cu = req_cu2 = (PCRE2_UCHAR)(re->last_codeunit);
3512 if ((re->flags & PCRE2_LASTCASELESS) != 0)
3513 {
3514 req_cu2 = TABLE_GET(req_cu, mb->tables + fcc_offset, req_cu);
3515 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
3516 if (utf && req_cu > 127) req_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(req_cu);
3517 #endif
3518 }
3519 }
3520
3521 /* Fill in fields that are always returned in the match data. */
3522
3523 match_data->code = re;
3524 match_data->subject = subject;
3525 match_data->mark = NULL;
3526 match_data->matchedby = PCRE2_MATCHEDBY_DFA_INTERPRETER;
3527
3528 /* Call the main matching function, looping for a non-anchored regex after a
3529 failed match. If not restarting, perform certain optimizations at the start of
3530 a match. */
3531
3532 for (;;)
3533 {
3534 /* ----------------- Start of match optimizations ---------------- */
3535
3536 /* There are some optimizations that avoid running the match if a known
3537 starting point is not found, or if a known later code unit is not present.
3538 However, there is an option (settable at compile time) that disables
3539 these, for testing and for ensuring that all callouts do actually occur.
3540 The optimizations must also be avoided when restarting a DFA match. */
3541
3542 if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0 &&
3543 (options & PCRE2_DFA_RESTART) == 0)
3544 {
3545 /* If firstline is TRUE, the start of the match is constrained to the first
3546 line of a multiline string. That is, the match must be before or at the
3547 first newline following the start of matching. Temporarily adjust
3548 end_subject so that we stop the optimization scans for a first code unit
3549 immediately after the first character of a newline (the first code unit can
3550 legitimately be a newline). If the match fails at the newline, later code
3551 breaks this loop. */
3552
3553 if (firstline)
3554 {
3555 PCRE2_SPTR t = start_match;
3556 #ifdef SUPPORT_UNICODE
3557 if (utf)
3558 {
3559 while (t < end_subject && !IS_NEWLINE(t))
3560 {
3561 t++;
3562 ACROSSCHAR(t < end_subject, t, t++);
3563 }
3564 }
3565 else
3566 #endif
3567 while (t < end_subject && !IS_NEWLINE(t)) t++;
3568 end_subject = t;
3569 }
3570
3571 /* Anchored: check the first code unit if one is recorded. This may seem
3572 pointless but it can help in detecting a no match case without scanning for
3573 the required code unit. */
3574
3575 if (anchored)
3576 {
3577 if (has_first_cu || start_bits != NULL)
3578 {
3579 BOOL ok = start_match < end_subject;
3580 if (ok)
3581 {
3582 PCRE2_UCHAR c = UCHAR21TEST(start_match);
3583 ok = has_first_cu && (c == first_cu || c == first_cu2);
3584 if (!ok && start_bits != NULL)
3585 {
3586 #if PCRE2_CODE_UNIT_WIDTH != 8
3587 if (c > 255) c = 255;
3588 #endif
3589 ok = (start_bits[c/8] & (1 << (c&7))) != 0;
3590 }
3591 }
3592 if (!ok) break;
3593 }
3594 }
3595
3596 /* Not anchored. Advance to a unique first code unit if there is one. In
3597 8-bit mode, the use of memchr() gives a big speed up, even though we have
3598 to call it twice in caseless mode, in order to find the earliest occurrence
3599 of the character in either of its cases. */
3600
3601 else
3602 {
3603 if (has_first_cu)
3604 {
3605 if (first_cu != first_cu2) /* Caseless */
3606 {
3607 #if PCRE2_CODE_UNIT_WIDTH != 8
3608 PCRE2_UCHAR smc;
3609 while (start_match < end_subject &&
3610 (smc = UCHAR21TEST(start_match)) != first_cu &&
3611 smc != first_cu2)
3612 start_match++;
3613 #else /* 8-bit code units */
3614 PCRE2_SPTR pp1 =
3615 memchr(start_match, first_cu, end_subject-start_match);
3616 PCRE2_SPTR pp2 =
3617 memchr(start_match, first_cu2, end_subject-start_match);
3618 if (pp1 == NULL)
3619 start_match = (pp2 == NULL)? end_subject : pp2;
3620 else
3621 start_match = (pp2 == NULL || pp1 < pp2)? pp1 : pp2;
3622 #endif
3623 }
3624
3625 /* The caseful case */
3626
3627 else
3628 {
3629 #if PCRE2_CODE_UNIT_WIDTH != 8
3630 while (start_match < end_subject && UCHAR21TEST(start_match) !=
3631 first_cu)
3632 start_match++;
3633 #else
3634 start_match = memchr(start_match, first_cu, end_subject - start_match);
3635 if (start_match == NULL) start_match = end_subject;
3636 #endif
3637 }
3638
3639 /* If we can't find the required code unit, having reached the true end
3640 of the subject, break the bumpalong loop, to force a match failure,
3641 except when doing partial matching, when we let the next cycle run at
3642 the end of the subject. To see why, consider the pattern /(?<=abc)def/,
3643 which partially matches "abc", even though the string does not contain
3644 the starting character "d". If we have not reached the true end of the
3645 subject (PCRE2_FIRSTLINE caused end_subject to be temporarily modified)
3646 we also let the cycle run, because the matching string is legitimately
3647 allowed to start with the first code unit of a newline. */
3648
3649 if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0 &&
3650 start_match >= mb->end_subject)
3651 break;
3652 }
3653
3654 /* If there's no first code unit, advance to just after a linebreak for a
3655 multiline match if required. */
3656
3657 else if (startline)
3658 {
3659 if (start_match > mb->start_subject + start_offset)
3660 {
3661 #ifdef SUPPORT_UNICODE
3662 if (utf)
3663 {
3664 while (start_match < end_subject && !WAS_NEWLINE(start_match))
3665 {
3666 start_match++;
3667 ACROSSCHAR(start_match < end_subject, start_match, start_match++);
3668 }
3669 }
3670 else
3671 #endif
3672 while (start_match < end_subject && !WAS_NEWLINE(start_match))
3673 start_match++;
3674
3675 /* If we have just passed a CR and the newline option is ANY or
3676 ANYCRLF, and we are now at a LF, advance the match position by one
3677 more code unit. */
3678
3679 if (start_match[-1] == CHAR_CR &&
3680 (mb->nltype == NLTYPE_ANY || mb->nltype == NLTYPE_ANYCRLF) &&
3681 start_match < end_subject &&
3682 UCHAR21TEST(start_match) == CHAR_NL)
3683 start_match++;
3684 }
3685 }
3686
3687 /* If there's no first code unit or a requirement for a multiline line
3688 start, advance to a non-unique first code unit if any have been
3689 identified. The bitmap contains only 256 bits. When code units are 16 or
3690 32 bits wide, all code units greater than 254 set the 255 bit. */
3691
3692 else if (start_bits != NULL)
3693 {
3694 while (start_match < end_subject)
3695 {
3696 uint32_t c = UCHAR21TEST(start_match);
3697 #if PCRE2_CODE_UNIT_WIDTH != 8
3698 if (c > 255) c = 255;
3699 #endif
3700 if ((start_bits[c/8] & (1 << (c&7))) != 0) break;
3701 start_match++;
3702 }
3703
3704 /* See comment above in first_cu checking about the next line. */
3705
3706 if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0 &&
3707 start_match >= mb->end_subject)
3708 break;
3709 }
3710 } /* End of first code unit handling */
3711
3712 /* Restore fudged end_subject */
3713
3714 end_subject = mb->end_subject;
3715
3716 /* The following two optimizations are disabled for partial matching. */
3717
3718 if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0)
3719 {
3720 /* The minimum matching length is a lower bound; no actual string of that
3721 length may actually match the pattern. Although the value is, strictly,
3722 in characters, we treat it as code units to avoid spending too much time
3723 in this optimization. */
3724
3725 if (end_subject - start_match < re->minlength) goto NOMATCH_EXIT;
3726
3727 /* If req_cu is set, we know that that code unit must appear in the
3728 subject for the match to succeed. If the first code unit is set, req_cu
3729 must be later in the subject; otherwise the test starts at the match
3730 point. This optimization can save a huge amount of backtracking in
3731 patterns with nested unlimited repeats that aren't going to match.
3732 Writing separate code for cased/caseless versions makes it go faster, as
3733 does using an autoincrement and backing off on a match.
3734
3735 HOWEVER: when the subject string is very, very long, searching to its end
3736 can take a long time, and give bad performance on quite ordinary
3737 patterns. This showed up when somebody was matching something like
3738 /^\d+C/ on a 32-megabyte string... so we don't do this when the string is
3739 sufficiently long. */
3740
3741 if (has_req_cu && end_subject - start_match < REQ_CU_MAX)
3742 {
3743 PCRE2_SPTR p = start_match + (has_first_cu? 1:0);
3744
3745 /* We don't need to repeat the search if we haven't yet reached the
3746 place we found it at last time. */
3747
3748 if (p > req_cu_ptr)
3749 {
3750 if (req_cu != req_cu2)
3751 {
3752 while (p < end_subject)
3753 {
3754 uint32_t pp = UCHAR21INCTEST(p);
3755 if (pp == req_cu || pp == req_cu2) { p--; break; }
3756 }
3757 }
3758 else
3759 {
3760 while (p < end_subject)
3761 {
3762 if (UCHAR21INCTEST(p) == req_cu) { p--; break; }
3763 }
3764 }
3765
3766 /* If we can't find the required code unit, break the matching loop,
3767 forcing a match failure. */
3768
3769 if (p >= end_subject) break;
3770
3771 /* If we have found the required code unit, save the point where we
3772 found it, so that we don't search again next time round the loop if
3773 the start hasn't passed this code unit yet. */
3774
3775 req_cu_ptr = p;
3776 }
3777 }
3778 }
3779 }
3780
3781 /* ------------ End of start of match optimizations ------------ */
3782
3783 /* Give no match if we have passed the bumpalong limit. */
3784
3785 if (start_match > bumpalong_limit) break;
3786
3787 /* OK, now we can do the business */
3788
3789 mb->start_used_ptr = start_match;
3790 mb->last_used_ptr = start_match;
3791 mb->recursive = NULL;
3792
3793 rc = internal_dfa_match(
3794 mb, /* fixed match data */
3795 mb->start_code, /* this subexpression's code */
3796 start_match, /* where we currently are */
3797 start_offset, /* start offset in subject */
3798 match_data->ovector, /* offset vector */
3799 (uint32_t)match_data->oveccount * 2, /* actual size of same */
3800 workspace, /* workspace vector */
3801 (int)wscount, /* size of same */
3802 0, /* function recurse level */
3803 base_recursion_workspace); /* initial workspace for recursion */
3804
3805 /* Anything other than "no match" means we are done, always; otherwise, carry
3806 on only if not anchored. */
3807
3808 if (rc != PCRE2_ERROR_NOMATCH || anchored)
3809 {
3810 if (rc == PCRE2_ERROR_PARTIAL && match_data->oveccount > 0)
3811 {
3812 match_data->ovector[0] = (PCRE2_SIZE)(start_match - subject);
3813 match_data->ovector[1] = (PCRE2_SIZE)(end_subject - subject);
3814 }
3815 match_data->leftchar = (PCRE2_SIZE)(mb->start_used_ptr - subject);
3816 match_data->rightchar = (PCRE2_SIZE)( mb->last_used_ptr - subject);
3817 match_data->startchar = (PCRE2_SIZE)(start_match - subject);
3818 match_data->rc = rc;
3819 goto EXIT;
3820 }
3821
3822 /* Advance to the next subject character unless we are at the end of a line
3823 and firstline is set. */
3824
3825 if (firstline && IS_NEWLINE(start_match)) break;
3826 start_match++;
3827 #ifdef SUPPORT_UNICODE
3828 if (utf)
3829 {
3830 ACROSSCHAR(start_match < end_subject, start_match, start_match++);
3831 }
3832 #endif
3833 if (start_match > end_subject) break;
3834
3835 /* If we have just passed a CR and we are now at a LF, and the pattern does
3836 not contain any explicit matches for \r or \n, and the newline option is CRLF
3837 or ANY or ANYCRLF, advance the match position by one more character. */
3838
3839 if (UCHAR21TEST(start_match - 1) == CHAR_CR &&
3840 start_match < end_subject &&
3841 UCHAR21TEST(start_match) == CHAR_NL &&
3842 (re->flags & PCRE2_HASCRORLF) == 0 &&
3843 (mb->nltype == NLTYPE_ANY ||
3844 mb->nltype == NLTYPE_ANYCRLF ||
3845 mb->nllen == 2))
3846 start_match++;
3847
3848 } /* "Bumpalong" loop */
3849
3850 NOMATCH_EXIT:
3851 rc = PCRE2_ERROR_NOMATCH;
3852
3853 EXIT:
3854 while (rws->next != NULL)
3855 {
3856 RWS_anchor *next = rws->next;
3857 rws->next = next->next;
3858 mb->memctl.free(next, mb->memctl.memory_data);
3859 }
3860
3861 return rc;
3862 }
3863
3864 /* End of pcre2_dfa_match.c */
3865