1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Original API code Copyright (c) 1997-2012 University of Cambridge
10 New API code Copyright (c) 2016-2019 University of Cambridge
11
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
15
16 * Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
18
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
22
23 * Neither the name of the University of Cambridge nor the names of its
24 contributors may be used to endorse or promote products derived from
25 this software without specific prior written permission.
26
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
39 */
40
41
42 /* This module contains the external function pcre2_dfa_match(), which is an
43 alternative matching function that uses a sort of DFA algorithm (not a true
44 FSM). This is NOT Perl-compatible, but it has advantages in certain
45 applications. */
46
47
48 /* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
49 the performance of his patterns greatly. I could not use it as it stood, as it
50 was not thread safe, and made assumptions about pattern sizes. Also, it caused
51 test 7 to loop, and test 9 to crash with a segfault.
52
53 The issue is the check for duplicate states, which is done by a simple linear
54 search up the state list. (Grep for "duplicate" below to find the code.) For
55 many patterns, there will never be many states active at one time, so a simple
56 linear search is fine. In patterns that have many active states, it might be a
57 bottleneck. The suggested code used an indexing scheme to remember which states
58 had previously been used for each character, and avoided the linear search when
59 it knew there was no chance of a duplicate. This was implemented when adding
60 states to the state lists.
61
62 I wrote some thread-safe, not-limited code to try something similar at the time
63 of checking for duplicates (instead of when adding states), using index vectors
64 on the stack. It did give a 13% improvement with one specially constructed
65 pattern for certain subject strings, but on other strings and on many of the
66 simpler patterns in the test suite it did worse. The major problem, I think,
67 was the extra time to initialize the index. This had to be done for each call
68 of internal_dfa_match(). (The supplied patch used a static vector, initialized
69 only once - I suspect this was the cause of the problems with the tests.)
70
71 Overall, I concluded that the gains in some cases did not outweigh the losses
72 in others, so I abandoned this code. */
73
74
75 #ifdef HAVE_CONFIG_H
76 #include "config.h"
77 #endif
78
79 #define NLBLOCK mb /* Block containing newline information */
80 #define PSSTART start_subject /* Field containing processed string start */
81 #define PSEND end_subject /* Field containing processed string end */
82
83 #include "pcre2_internal.h"
84
/* Mask of the match-time option bits that are public for the DFA matcher.
NOTE(review): presumably used to reject any other option bits passed to
pcre2_dfa_match() - confirm at the use site. */

#define PUBLIC_DFA_MATCH_OPTIONS ( \
  PCRE2_ANCHORED             | \
  PCRE2_ENDANCHORED          | \
  PCRE2_NOTBOL               | \
  PCRE2_NOTEOL               | \
  PCRE2_NOTEMPTY             | \
  PCRE2_NOTEMPTY_ATSTART     | \
  PCRE2_NO_UTF_CHECK         | \
  PCRE2_PARTIAL_HARD         | \
  PCRE2_PARTIAL_SOFT         | \
  PCRE2_DFA_SHORTEST         | \
  PCRE2_DFA_RESTART          | \
  PCRE2_COPY_MATCHED_SUBJECT)
90
91
92 /*************************************************
93 * Code parameters and static tables *
94 *************************************************/
95
/* Offsets that are added to the OP_TYPESTAR family of opcodes to turn them
into other, private opcodes under special conditions. The resulting values are
never stored, so they need not be less than 256; the first band starts at
OP_EXTRA_BASE, well clear of the normal opcodes, and successive bands are
OP_EXTRA_GAP apart, which should be enough room for one family of opcodes. */

#define OP_EXTRA_BASE     300
#define OP_EXTRA_GAP      20

#define OP_PROP_EXTRA     (OP_EXTRA_BASE + 0*OP_EXTRA_GAP)   /* 300 */
#define OP_EXTUNI_EXTRA   (OP_EXTRA_BASE + 1*OP_EXTRA_GAP)   /* 320 */
#define OP_ANYNL_EXTRA    (OP_EXTRA_BASE + 2*OP_EXTRA_GAP)   /* 340 */
#define OP_HSPACE_EXTRA   (OP_EXTRA_BASE + 3*OP_EXTRA_GAP)   /* 360 */
#define OP_VSPACE_EXTRA   (OP_EXTRA_BASE + 4*OP_EXTRA_GAP)   /* 380 */
106
107
108 /* This table identifies those opcodes that are followed immediately by a
109 character that is to be tested in some way. This makes it possible to
110 centralize the loading of these characters. In the case of Type * etc, the
111 "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
112 small value. Non-zero values in the table are the offsets from the opcode where
113 the character is to be found. ***NOTE*** If the start of this table is
114 modified, the three tables that follow must also be modified. */
115
116 static const uint8_t coptable[] = {
117 0, /* End */
118 0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */
119 0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */
120 0, 0, 0, /* Any, AllAny, Anybyte */
121 0, 0, /* \P, \p */
122 0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */
123 0, /* \X */
124 0, 0, 0, 0, 0, 0, /* \Z, \z, $, $M, ^, ^M */
125 1, /* Char */
126 1, /* Chari */
127 1, /* not */
128 1, /* noti */
129 /* Positive single-char repeats */
130 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
131 1+IMM2_SIZE, 1+IMM2_SIZE, /* upto, minupto */
132 1+IMM2_SIZE, /* exact */
133 1, 1, 1, 1+IMM2_SIZE, /* *+, ++, ?+, upto+ */
134 1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */
135 1+IMM2_SIZE, 1+IMM2_SIZE, /* upto I, minupto I */
136 1+IMM2_SIZE, /* exact I */
137 1, 1, 1, 1+IMM2_SIZE, /* *+I, ++I, ?+I, upto+I */
138 /* Negative single-char repeats - only for chars < 256 */
139 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
140 1+IMM2_SIZE, 1+IMM2_SIZE, /* NOT upto, minupto */
141 1+IMM2_SIZE, /* NOT exact */
142 1, 1, 1, 1+IMM2_SIZE, /* NOT *+, ++, ?+, upto+ */
143 1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */
144 1+IMM2_SIZE, 1+IMM2_SIZE, /* NOT upto I, minupto I */
145 1+IMM2_SIZE, /* NOT exact I */
146 1, 1, 1, 1+IMM2_SIZE, /* NOT *+I, ++I, ?+I, upto+I */
147 /* Positive type repeats */
148 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
149 1+IMM2_SIZE, 1+IMM2_SIZE, /* Type upto, minupto */
150 1+IMM2_SIZE, /* Type exact */
151 1, 1, 1, 1+IMM2_SIZE, /* Type *+, ++, ?+, upto+ */
152 /* Character class & ref repeats */
153 0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
154 0, 0, /* CRRANGE, CRMINRANGE */
155 0, 0, 0, 0, /* Possessive *+, ++, ?+, CRPOSRANGE */
156 0, /* CLASS */
157 0, /* NCLASS */
158 0, /* XCLASS - variable length */
159 0, /* REF */
160 0, /* REFI */
161 0, /* DNREF */
162 0, /* DNREFI */
163 0, /* RECURSE */
164 0, /* CALLOUT */
165 0, /* CALLOUT_STR */
166 0, /* Alt */
167 0, /* Ket */
168 0, /* KetRmax */
169 0, /* KetRmin */
170 0, /* KetRpos */
171 0, /* Reverse */
172 0, /* Assert */
173 0, /* Assert not */
174 0, /* Assert behind */
175 0, /* Assert behind not */
176 0, /* ONCE */
177 0, /* SCRIPT_RUN */
178 0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */
179 0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
180 0, 0, /* CREF, DNCREF */
181 0, 0, /* RREF, DNRREF */
182 0, 0, /* FALSE, TRUE */
183 0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */
184 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */
185 0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */
186 0, 0, /* COMMIT, COMMIT_ARG */
187 0, 0, 0, /* FAIL, ACCEPT, ASSERT_ACCEPT */
188 0, 0, 0 /* CLOSE, SKIPZERO, DEFINE */
189 };
190
/* Table indexed by opcode. A non-zero entry marks an opcode that inspects a
character. It is consulted when the end of the subject has been reached, to
remember that a character could have been inspected at that point (this feeds
the partial-match logic). The table must have exactly OP_TABLE_LENGTH entries;
a compile-time check in internal_dfa_match() depends on this.

***NOTE*** If the start of this table is modified, the two tables that follow
must also be modified. */

static const uint8_t poptable[] = {
  0,                             /* End */
  0, 0, 0,                       /* \A, \G, \K */
  1, 1,                          /* \B, \b */
  1, 1, 1, 1, 1, 1,              /* \D, \d, \S, \s, \W, \w */
  1, 1, 1,                       /* Any, AllAny, Anybyte */
  1, 1,                          /* \P, \p */
  1, 1, 1, 1, 1,                 /* \R, \H, \h, \V, \v */
  1,                             /* \X */
  0, 0, 0, 0, 0, 0,              /* \Z, \z, $, $M, ^, ^M */
  1, 1,                          /* Char, Chari */
  1, 1,                          /* not, noti */

  /* Positive single-char repeats */
  1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ?? */
  1, 1, 1,                       /* upto, minupto, exact */
  1, 1, 1, 1,                    /* *+, ++, ?+, upto+ */
  1, 1, 1, 1, 1, 1,              /* *I, *?I, +I, +?I, ?I, ??I */
  1, 1, 1,                       /* upto I, minupto I, exact I */
  1, 1, 1, 1,                    /* *+I, ++I, ?+I, upto+I */

  /* Negative single-char repeats - only for chars < 256 */
  1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ?? */
  1, 1, 1,                       /* NOT upto, minupto, exact */
  1, 1, 1, 1,                    /* NOT *+, ++, ?+, upto+ */
  1, 1, 1, 1, 1, 1,              /* NOT *I, *?I, +I, +?I, ?I, ??I */
  1, 1, 1,                       /* NOT upto I, minupto I, exact I */
  1, 1, 1, 1,                    /* NOT *+I, ++I, ?+I, upto+I */

  /* Positive type repeats */
  1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ?? */
  1, 1, 1,                       /* Type upto, minupto, exact */
  1, 1, 1, 1,                    /* Type *+, ++, ?+, upto+ */

  /* Character class & ref repeats */
  1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ?? */
  1, 1,                          /* CRRANGE, CRMINRANGE */
  1, 1, 1, 1,                    /* Possessive *+, ++, ?+, CRPOSRANGE */
  1, 1, 1,                       /* CLASS, NCLASS, XCLASS - variable length */

  /* Nothing from here on is marked as inspecting a character */
  0, 0, 0, 0,                    /* REF, REFI, DNREF, DNREFI */
  0, 0, 0,                       /* RECURSE, CALLOUT, CALLOUT_STR */
  0,                             /* Alt */
  0, 0, 0, 0,                    /* Ket, KetRmax, KetRmin, KetRpos */
  0,                             /* Reverse */
  0, 0,                          /* Assert, Assert not */
  0, 0,                          /* Assert behind, Assert behind not */
  0, 0,                          /* ONCE, SCRIPT_RUN */
  0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND */
  0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
  0, 0,                          /* CREF, DNCREF */
  0, 0,                          /* RREF, DNRREF */
  0, 0,                          /* FALSE, TRUE */
  0, 0, 0,                       /* BRAZERO, BRAMINZERO, BRAPOSZERO */
  0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG */
  0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG */
  0, 0,                          /* COMMIT, COMMIT_ARG */
  0, 0, 0,                       /* FAIL, ACCEPT, ASSERT_ACCEPT */
  0, 0, 0                        /* CLOSE, SKIPZERO, DEFINE */
};
265
266 /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
267 and \w */
268
269 static const uint8_t toptable1[] = {
270 0, 0, 0, 0, 0, 0,
271 ctype_digit, ctype_digit,
272 ctype_space, ctype_space,
273 ctype_word, ctype_word,
274 0, 0 /* OP_ANY, OP_ALLANY */
275 };
276
277 static const uint8_t toptable2[] = {
278 0, 0, 0, 0, 0, 0,
279 ctype_digit, 0,
280 ctype_space, 0,
281 ctype_word, 0,
282 1, 1 /* OP_ANY, OP_ALLANY */
283 };
284
285
/* Structure for holding data about a particular state, which is in effect the
current data for an active path through the match tree. It must consist
entirely of ints because the working vector we are passed, and which we put
these structures in, is a vector of ints.

A negative offset is a special case: it means "hold off going to this
(negated) state until the number of characters in the data field have been
skipped". */

typedef struct stateblock {
  int offset;                     /* Offset to opcode (-ve has meaning) */
  int count;                      /* Count for repeats */
  int data;                       /* Some use extra data */
} stateblock;

/* Number of ints occupied by one stateblock. The whole replacement list is
parenthesized so that the macro behaves as a single primary expression
wherever it is used (for example as the operand of sizeof). */

#define INTS_PER_STATEBLOCK ((int)(sizeof(stateblock)/sizeof(int)))
298
299
300 /* Before version 10.32 the recursive calls of internal_dfa_match() were passed
301 local working space and output vectors that were created on the stack. This has
302 caused issues for some patterns, especially in small-stack environments such as
303 Windows. A new scheme is now in use which sets up a vector on the stack, but if
304 this is too small, heap memory is used, up to the heap_limit. The main
305 parameters are all numbers of ints because the workspace is a vector of ints.
306
307 The size of the starting stack vector, DFA_START_RWS_SIZE, is in bytes, and is
308 defined in pcre2_internal.h so as to be available to pcre2test when it is
309 finding the minimum heap requirement for a match. */
310
/* Number of ints needed to hold one ovector element (a PCRE2_SIZE). */
#define OVEC_UNIT  (sizeof(PCRE2_SIZE)/sizeof(int))

/* Stack vector: DFA_START_RWS_SIZE is in bytes, this is the same in ints. */
#define RWS_BASE_SIZE   (DFA_START_RWS_SIZE/sizeof(int))

/* Work size for recursion, in ints. */
#define RWS_RSIZE       1000

/* Ovector for recursion, in ints. */
#define RWS_OVEC_RSIZE  (1000*OVEC_UNIT)

/* Ovector in other cases, in ints. */
#define RWS_OVEC_OSIZE  (2*OVEC_UNIT)
317
/* Header that sits at the start of each workspace block. Blocks are chained
through the next field. Both counts are in units of ints: size is the total
size of the block, and free is what is still available in it (a freshly
selected block has size minus the anchor itself). */

struct RWS_anchor {
  struct RWS_anchor *next;        /* Next block in the chain, or NULL */
  uint32_t size;                  /* Number of ints */
  uint32_t free;                  /* Number of ints */
};

typedef struct RWS_anchor RWS_anchor;

/* Number of ints taken up by the anchor itself. */

#define RWS_ANCHOR_SIZE (sizeof(RWS_anchor)/sizeof(int))
327
328
329
330 /*************************************************
331 * Process a callout *
332 *************************************************/
333
334 /* This function is called to perform a callout.
335
336 Arguments:
337 code current code pointer
338 offsets points to current capture offsets
339 current_subject start of current subject match
340 ptr current position in subject
341 mb the match block
342 extracode extra code offset when called from condition
343 lengthptr where to return the callout length
344
345 Returns: the return from the callout
346 */
347
348 static int
do_callout(PCRE2_SPTR code,PCRE2_SIZE * offsets,PCRE2_SPTR current_subject,PCRE2_SPTR ptr,dfa_match_block * mb,PCRE2_SIZE extracode,PCRE2_SIZE * lengthptr)349 do_callout(PCRE2_SPTR code, PCRE2_SIZE *offsets, PCRE2_SPTR current_subject,
350 PCRE2_SPTR ptr, dfa_match_block *mb, PCRE2_SIZE extracode,
351 PCRE2_SIZE *lengthptr)
352 {
353 pcre2_callout_block *cb = mb->cb;
354
355 *lengthptr = (code[extracode] == OP_CALLOUT)?
356 (PCRE2_SIZE)PRIV(OP_lengths)[OP_CALLOUT] :
357 (PCRE2_SIZE)GET(code, 1 + 2*LINK_SIZE + extracode);
358
359 if (mb->callout == NULL) return 0; /* No callout provided */
360
361 /* Fixed fields in the callout block are set once and for all at the start of
362 matching. */
363
364 cb->offset_vector = offsets;
365 cb->start_match = (PCRE2_SIZE)(current_subject - mb->start_subject);
366 cb->current_position = (PCRE2_SIZE)(ptr - mb->start_subject);
367 cb->pattern_position = GET(code, 1 + extracode);
368 cb->next_item_length = GET(code, 1 + LINK_SIZE + extracode);
369
370 if (code[extracode] == OP_CALLOUT)
371 {
372 cb->callout_number = code[1 + 2*LINK_SIZE + extracode];
373 cb->callout_string_offset = 0;
374 cb->callout_string = NULL;
375 cb->callout_string_length = 0;
376 }
377 else
378 {
379 cb->callout_number = 0;
380 cb->callout_string_offset = GET(code, 1 + 3*LINK_SIZE + extracode);
381 cb->callout_string = code + (1 + 4*LINK_SIZE + extracode) + 1;
382 cb->callout_string_length = *lengthptr - (1 + 4*LINK_SIZE) - 2;
383 }
384
385 return (mb->callout)(cb, mb->callout_data);
386 }
387
388
389
390 /*************************************************
391 * Expand local workspace memory *
392 *************************************************/
393
394 /* This function is called when internal_dfa_match() is about to be called
395 recursively and there is insufficient working space left in the current
396 workspace block. If there's an existing next block, use it; otherwise get a new
397 block unless the heap limit is reached.
398
399 Arguments:
400 rwsptr pointer to block pointer (updated)
401 ovecsize space needed for an ovector
402 mb the match block
403
404 Returns: 0 rwsptr has been updated
405 !0 an error code
406 */
407
408 static int
more_workspace(RWS_anchor ** rwsptr,unsigned int ovecsize,dfa_match_block * mb)409 more_workspace(RWS_anchor **rwsptr, unsigned int ovecsize, dfa_match_block *mb)
410 {
411 RWS_anchor *rws = *rwsptr;
412 RWS_anchor *new;
413
414 if (rws->next != NULL)
415 {
416 new = rws->next;
417 }
418
419 /* Sizes in the RWS_anchor blocks are in units of sizeof(int), but
420 mb->heap_limit and mb->heap_used are in kibibytes. Play carefully, to avoid
421 overflow. */
422
423 else
424 {
425 uint32_t newsize = (rws->size >= UINT32_MAX/2)? UINT32_MAX/2 : rws->size * 2;
426 uint32_t newsizeK = newsize/(1024/sizeof(int));
427
428 if (newsizeK + mb->heap_used > mb->heap_limit)
429 newsizeK = (uint32_t)(mb->heap_limit - mb->heap_used);
430 newsize = newsizeK*(1024/sizeof(int));
431
432 if (newsize < RWS_RSIZE + ovecsize + RWS_ANCHOR_SIZE)
433 return PCRE2_ERROR_HEAPLIMIT;
434 new = mb->memctl.malloc(newsize*sizeof(int), mb->memctl.memory_data);
435 if (new == NULL) return PCRE2_ERROR_NOMEMORY;
436 mb->heap_used += newsizeK;
437 new->next = NULL;
438 new->size = newsize;
439 rws->next = new;
440 }
441
442 new->free = new->size - RWS_ANCHOR_SIZE;
443 *rwsptr = new;
444 return 0;
445 }
446
447
448
449 /*************************************************
450 * Match a Regular Expression - DFA engine *
451 *************************************************/
452
453 /* This internal function applies a compiled pattern to a subject string,
454 starting at a given point, using a DFA engine. This function is called from the
455 external one, possibly multiple times if the pattern is not anchored. The
456 function calls itself recursively for some kinds of subpattern.
457
458 Arguments:
459 mb the match_data block with fixed information
460 this_start_code the opening bracket of this subexpression's code
461 current_subject where we currently are in the subject string
462 start_offset start offset in the subject string
463 offsets vector to contain the matching string offsets
464 offsetcount size of same
465 workspace vector of workspace
466 wscount size of same
467 rlevel function call recursion level
468
469 Returns: > 0 => number of match offset pairs placed in offsets
470 = 0 => offsets overflowed; longest matches are present
471 -1 => failed to match
472 < -1 => some kind of unexpected problem
473
474 The following macros are used for adding states to the two state vectors (one
475 for the current character, one for the following character). */
476
/* Each macro appends one state to a state list of the enclosing function. The
ACTIVE variants use active_count/next_active_state and the NEW variants use
new_count/next_new_state. The count is incremented even when the list is
already full, in which case the enclosing function returns
PCRE2_ERROR_DFA_WSSIZE. The plain variants leave the data field of the state
untouched; the _DATA variants set it as well. */

#define ADD_ACTIVE(x,y) \
  do \
    { \
    if (active_count++ >= wscount) return PCRE2_ERROR_DFA_WSSIZE; \
    next_active_state->offset = (x); \
    next_active_state->count  = (y); \
    next_active_state++; \
    } \
  while (0)

#define ADD_ACTIVE_DATA(x,y,z) \
  do \
    { \
    if (active_count++ >= wscount) return PCRE2_ERROR_DFA_WSSIZE; \
    next_active_state->offset = (x); \
    next_active_state->count  = (y); \
    next_active_state->data   = (z); \
    next_active_state++; \
    } \
  while (0)

#define ADD_NEW(x,y) \
  do \
    { \
    if (new_count++ >= wscount) return PCRE2_ERROR_DFA_WSSIZE; \
    next_new_state->offset = (x); \
    next_new_state->count  = (y); \
    next_new_state++; \
    } \
  while (0)

#define ADD_NEW_DATA(x,y,z) \
  do \
    { \
    if (new_count++ >= wscount) return PCRE2_ERROR_DFA_WSSIZE; \
    next_new_state->offset = (x); \
    next_new_state->count  = (y); \
    next_new_state->data   = (z); \
    next_new_state++; \
    } \
  while (0)
514
515 /* And now, here is the code */
516
517 static int
internal_dfa_match(dfa_match_block * mb,PCRE2_SPTR this_start_code,PCRE2_SPTR current_subject,PCRE2_SIZE start_offset,PCRE2_SIZE * offsets,uint32_t offsetcount,int * workspace,int wscount,uint32_t rlevel,int * RWS)518 internal_dfa_match(
519 dfa_match_block *mb,
520 PCRE2_SPTR this_start_code,
521 PCRE2_SPTR current_subject,
522 PCRE2_SIZE start_offset,
523 PCRE2_SIZE *offsets,
524 uint32_t offsetcount,
525 int *workspace,
526 int wscount,
527 uint32_t rlevel,
528 int *RWS)
529 {
530 stateblock *active_states, *new_states, *temp_states;
531 stateblock *next_active_state, *next_new_state;
532 const uint8_t *ctypes, *lcc, *fcc;
533 PCRE2_SPTR ptr;
534 PCRE2_SPTR end_code;
535 dfa_recursion_info new_recursive;
536 int active_count, new_count, match_count;
537
538 /* Some fields in the mb block are frequently referenced, so we load them into
539 independent variables in the hope that this will perform better. */
540
541 PCRE2_SPTR start_subject = mb->start_subject;
542 PCRE2_SPTR end_subject = mb->end_subject;
543 PCRE2_SPTR start_code = mb->start_code;
544
545 #ifdef SUPPORT_UNICODE
546 BOOL utf = (mb->poptions & PCRE2_UTF) != 0;
547 #else
548 BOOL utf = FALSE;
549 #endif
550
551 BOOL reset_could_continue = FALSE;
552
553 if (mb->match_call_count++ >= mb->match_limit) return PCRE2_ERROR_MATCHLIMIT;
554 if (rlevel++ > mb->match_limit_depth) return PCRE2_ERROR_DEPTHLIMIT;
555 offsetcount &= (uint32_t)(-2); /* Round down */
556
557 wscount -= 2;
558 wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
559 (2 * INTS_PER_STATEBLOCK);
560
561 ctypes = mb->tables + ctypes_offset;
562 lcc = mb->tables + lcc_offset;
563 fcc = mb->tables + fcc_offset;
564
565 match_count = PCRE2_ERROR_NOMATCH; /* A negative number */
566
567 active_states = (stateblock *)(workspace + 2);
568 next_new_state = new_states = active_states + wscount;
569 new_count = 0;
570
571 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
572 the alternative states onto the list, and find out where the end is. This
573 makes it possible to use this function recursively, when we want to stop at a
574 matching internal ket rather than at the end.
575
576 If we are dealing with a backward assertion we have to find out the maximum
577 amount to move back, and set up each alternative appropriately. */
578
579 if (*this_start_code == OP_ASSERTBACK || *this_start_code == OP_ASSERTBACK_NOT)
580 {
581 size_t max_back = 0;
582 size_t gone_back;
583
584 end_code = this_start_code;
585 do
586 {
587 size_t back = (size_t)GET(end_code, 2+LINK_SIZE);
588 if (back > max_back) max_back = back;
589 end_code += GET(end_code, 1);
590 }
591 while (*end_code == OP_ALT);
592
593 /* If we can't go back the amount required for the longest lookbehind
594 pattern, go back as far as we can; some alternatives may still be viable. */
595
596 #ifdef SUPPORT_UNICODE
597 /* In character mode we have to step back character by character */
598
599 if (utf)
600 {
601 for (gone_back = 0; gone_back < max_back; gone_back++)
602 {
603 if (current_subject <= start_subject) break;
604 current_subject--;
605 ACROSSCHAR(current_subject > start_subject, current_subject,
606 current_subject--);
607 }
608 }
609 else
610 #endif
611
612 /* In byte-mode we can do this quickly. */
613
614 {
615 size_t current_offset = (size_t)(current_subject - start_subject);
616 gone_back = (current_offset < max_back)? current_offset : max_back;
617 current_subject -= gone_back;
618 }
619
620 /* Save the earliest consulted character */
621
622 if (current_subject < mb->start_used_ptr)
623 mb->start_used_ptr = current_subject;
624
625 /* Now we can process the individual branches. There will be an OP_REVERSE at
626 the start of each branch, except when the length of the branch is zero. */
627
628 end_code = this_start_code;
629 do
630 {
631 uint32_t revlen = (end_code[1+LINK_SIZE] == OP_REVERSE)? 1 + LINK_SIZE : 0;
632 size_t back = (revlen == 0)? 0 : (size_t)GET(end_code, 2+LINK_SIZE);
633 if (back <= gone_back)
634 {
635 int bstate = (int)(end_code - start_code + 1 + LINK_SIZE + revlen);
636 ADD_NEW_DATA(-bstate, 0, (int)(gone_back - back));
637 }
638 end_code += GET(end_code, 1);
639 }
640 while (*end_code == OP_ALT);
641 }
642
643 /* This is the code for a "normal" subpattern (not a backward assertion). The
644 start of a whole pattern is always one of these. If we are at the top level,
645 we may be asked to restart matching from the same point that we reached for a
646 previous partial match. We still have to scan through the top-level branches to
647 find the end state. */
648
649 else
650 {
651 end_code = this_start_code;
652
653 /* Restarting */
654
655 if (rlevel == 1 && (mb->moptions & PCRE2_DFA_RESTART) != 0)
656 {
657 do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
658 new_count = workspace[1];
659 if (!workspace[0])
660 memcpy(new_states, active_states, (size_t)new_count * sizeof(stateblock));
661 }
662
663 /* Not restarting */
664
665 else
666 {
667 int length = 1 + LINK_SIZE +
668 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
669 *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
670 ? IMM2_SIZE:0);
671 do
672 {
673 ADD_NEW((int)(end_code - start_code + length), 0);
674 end_code += GET(end_code, 1);
675 length = 1 + LINK_SIZE;
676 }
677 while (*end_code == OP_ALT);
678 }
679 }
680
681 workspace[0] = 0; /* Bit indicating which vector is current */
682
683 /* Loop for scanning the subject */
684
685 ptr = current_subject;
686 for (;;)
687 {
688 int i, j;
689 int clen, dlen;
690 uint32_t c, d;
691 int forced_fail = 0;
692 BOOL partial_newline = FALSE;
693 BOOL could_continue = reset_could_continue;
694 reset_could_continue = FALSE;
695
696 if (ptr > mb->last_used_ptr) mb->last_used_ptr = ptr;
697
698 /* Make the new state list into the active state list and empty the
699 new state list. */
700
701 temp_states = active_states;
702 active_states = new_states;
703 new_states = temp_states;
704 active_count = new_count;
705 new_count = 0;
706
707 workspace[0] ^= 1; /* Remember for the restarting feature */
708 workspace[1] = active_count;
709
710 /* Set the pointers for adding new states */
711
712 next_active_state = active_states + active_count;
713 next_new_state = new_states;
714
715 /* Load the current character from the subject outside the loop, as many
716 different states may want to look at it, and we assume that at least one
717 will. */
718
719 if (ptr < end_subject)
720 {
721 clen = 1; /* Number of data items in the character */
722 #ifdef SUPPORT_UNICODE
723 GETCHARLENTEST(c, ptr, clen);
724 #else
725 c = *ptr;
726 #endif /* SUPPORT_UNICODE */
727 }
728 else
729 {
730 clen = 0; /* This indicates the end of the subject */
731 c = NOTACHAR; /* This value should never actually be used */
732 }
733
734 /* Scan up the active states and act on each one. The result of an action
735 may be to add more states to the currently active list (e.g. on hitting a
736 parenthesis) or it may be to put states on the new list, for considering
737 when we move the character pointer on. */
738
739 for (i = 0; i < active_count; i++)
740 {
741 stateblock *current_state = active_states + i;
742 BOOL caseless = FALSE;
743 PCRE2_SPTR code;
744 uint32_t codevalue;
745 int state_offset = current_state->offset;
746 int rrc;
747 int count;
748
749 /* A negative offset is a special case meaning "hold off going to this
750 (negated) state until the number of characters in the data field have
751 been skipped". If the could_continue flag was passed over from a previous
752 state, arrange for it to be passed on. */
753
754 if (state_offset < 0)
755 {
756 if (current_state->data > 0)
757 {
758 ADD_NEW_DATA(state_offset, current_state->count,
759 current_state->data - 1);
760 if (could_continue) reset_could_continue = TRUE;
761 continue;
762 }
763 else
764 {
765 current_state->offset = state_offset = -state_offset;
766 }
767 }
768
769 /* Check for a duplicate state with the same count, and skip if found.
770 See the note at the head of this module about the possibility of improving
771 performance here. */
772
773 for (j = 0; j < i; j++)
774 {
775 if (active_states[j].offset == state_offset &&
776 active_states[j].count == current_state->count)
777 goto NEXT_ACTIVE_STATE;
778 }
779
780 /* The state offset is the offset to the opcode */
781
782 code = start_code + state_offset;
783 codevalue = *code;
784
785 /* If this opcode inspects a character, but we are at the end of the
786 subject, remember the fact for use when testing for a partial match. */
787
788 if (clen == 0 && poptable[codevalue] != 0)
789 could_continue = TRUE;
790
791 /* If this opcode is followed by an inline character, load it. It is
792 tempting to test for the presence of a subject character here, but that
793 is wrong, because sometimes zero repetitions of the subject are
794 permitted.
795
796 We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
797 argument that is not a data character - but is always one byte long because
798 the values are small. We have to take special action to deal with \P, \p,
799 \H, \h, \V, \v and \X in this case. To keep the other cases fast, convert
800 these ones to new opcodes. */
801
802 if (coptable[codevalue] > 0)
803 {
804 dlen = 1;
805 #ifdef SUPPORT_UNICODE
806 if (utf) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
807 #endif /* SUPPORT_UNICODE */
808 d = code[coptable[codevalue]];
809 if (codevalue >= OP_TYPESTAR)
810 {
811 switch(d)
812 {
813 case OP_ANYBYTE: return PCRE2_ERROR_DFA_UITEM;
814 case OP_NOTPROP:
815 case OP_PROP: codevalue += OP_PROP_EXTRA; break;
816 case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
817 case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
818 case OP_NOT_HSPACE:
819 case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
820 case OP_NOT_VSPACE:
821 case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
822 default: break;
823 }
824 }
825 }
826 else
827 {
828 dlen = 0; /* Not strictly necessary, but compilers moan */
829 d = NOTACHAR; /* if these variables are not set. */
830 }
831
832
833 /* Now process the individual opcodes */
834
835 switch (codevalue)
836 {
837 /* ========================================================================== */
838 /* These cases are never obeyed. This is a fudge that causes a compile-
839 time error if the vectors coptable or poptable, which are indexed by
840 opcode, are not the correct length. It seems to be the only way to do
841 such a check at compile time, as the sizeof() operator does not work
842 in the C preprocessor. */
843
844 case OP_TABLE_LENGTH:
845 case OP_TABLE_LENGTH +
846 ((sizeof(coptable) == OP_TABLE_LENGTH) &&
847 (sizeof(poptable) == OP_TABLE_LENGTH)):
848 return 0;
849
850 /* ========================================================================== */
851 /* Reached a closing bracket. If not at the end of the pattern, carry
852 on with the next opcode. For repeating opcodes, also add the repeat
853 state. Note that KETRPOS will always be encountered at the end of the
854 subpattern, because the possessive subpattern repeats are always handled
855 using recursive calls. Thus, it never adds any new states.
856
857 At the end of the (sub)pattern, unless we have an empty string and
858 PCRE2_NOTEMPTY is set, or PCRE2_NOTEMPTY_ATSTART is set and we are at the
859 start of the subject, save the match data, shifting up all previous
860 matches so we always have the longest first. */
861
862 case OP_KET:
863 case OP_KETRMIN:
864 case OP_KETRMAX:
865 case OP_KETRPOS:
866 if (code != end_code)
867 {
868 ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
869 if (codevalue != OP_KET)
870 {
871 ADD_ACTIVE(state_offset - (int)GET(code, 1), 0);
872 }
873 }
874 else
875 {
876 if (ptr > current_subject ||
877 ((mb->moptions & PCRE2_NOTEMPTY) == 0 &&
878 ((mb->moptions & PCRE2_NOTEMPTY_ATSTART) == 0 ||
879 current_subject > start_subject + mb->start_offset)))
880 {
881 if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
882 else if (match_count > 0 && ++match_count * 2 > (int)offsetcount)
883 match_count = 0;
884 count = ((match_count == 0)? (int)offsetcount : match_count * 2) - 2;
885 if (count > 0) (void)memmove(offsets + 2, offsets,
886 (size_t)count * sizeof(PCRE2_SIZE));
887 if (offsetcount >= 2)
888 {
889 offsets[0] = (PCRE2_SIZE)(current_subject - start_subject);
890 offsets[1] = (PCRE2_SIZE)(ptr - start_subject);
891 }
892 if ((mb->moptions & PCRE2_DFA_SHORTEST) != 0) return match_count;
893 }
894 }
895 break;
896
897 /* ========================================================================== */
898 /* These opcodes add to the current list of states without looking
899 at the current character. */
900
901 /*-----------------------------------------------------------------*/
902 case OP_ALT:
903 do { code += GET(code, 1); } while (*code == OP_ALT);
904 ADD_ACTIVE((int)(code - start_code), 0);
905 break;
906
907 /*-----------------------------------------------------------------*/
908 case OP_BRA:
909 case OP_SBRA:
910 do
911 {
912 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
913 code += GET(code, 1);
914 }
915 while (*code == OP_ALT);
916 break;
917
918 /*-----------------------------------------------------------------*/
919 case OP_CBRA:
920 case OP_SCBRA:
921 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE + IMM2_SIZE), 0);
922 code += GET(code, 1);
923 while (*code == OP_ALT)
924 {
925 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
926 code += GET(code, 1);
927 }
928 break;
929
930 /*-----------------------------------------------------------------*/
931 case OP_BRAZERO:
932 case OP_BRAMINZERO:
933 ADD_ACTIVE(state_offset + 1, 0);
934 code += 1 + GET(code, 2);
935 while (*code == OP_ALT) code += GET(code, 1);
936 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
937 break;
938
939 /*-----------------------------------------------------------------*/
940 case OP_SKIPZERO:
941 code += 1 + GET(code, 2);
942 while (*code == OP_ALT) code += GET(code, 1);
943 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
944 break;
945
946 /*-----------------------------------------------------------------*/
947 case OP_CIRC:
948 if (ptr == start_subject && (mb->moptions & PCRE2_NOTBOL) == 0)
949 { ADD_ACTIVE(state_offset + 1, 0); }
950 break;
951
952 /*-----------------------------------------------------------------*/
953 case OP_CIRCM:
954 if ((ptr == start_subject && (mb->moptions & PCRE2_NOTBOL) == 0) ||
955 ((ptr != end_subject || (mb->poptions & PCRE2_ALT_CIRCUMFLEX) != 0 )
956 && WAS_NEWLINE(ptr)))
957 { ADD_ACTIVE(state_offset + 1, 0); }
958 break;
959
960 /*-----------------------------------------------------------------*/
961 case OP_EOD:
962 if (ptr >= end_subject)
963 {
964 if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
965 could_continue = TRUE;
966 else { ADD_ACTIVE(state_offset + 1, 0); }
967 }
968 break;
969
970 /*-----------------------------------------------------------------*/
971 case OP_SOD:
972 if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
973 break;
974
975 /*-----------------------------------------------------------------*/
976 case OP_SOM:
977 if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
978 break;
979
980
981 /* ========================================================================== */
982 /* These opcodes inspect the next subject character, and sometimes
983 the previous one as well, but do not have an argument. The variable
984 clen contains the length of the current character and is zero if we are
985 at the end of the subject. */
986
987 /*-----------------------------------------------------------------*/
988 case OP_ANY:
989 if (clen > 0 && !IS_NEWLINE(ptr))
990 {
991 if (ptr + 1 >= mb->end_subject &&
992 (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
993 NLBLOCK->nltype == NLTYPE_FIXED &&
994 NLBLOCK->nllen == 2 &&
995 c == NLBLOCK->nl[0])
996 {
997 could_continue = partial_newline = TRUE;
998 }
999 else
1000 {
1001 ADD_NEW(state_offset + 1, 0);
1002 }
1003 }
1004 break;
1005
1006 /*-----------------------------------------------------------------*/
1007 case OP_ALLANY:
1008 if (clen > 0)
1009 { ADD_NEW(state_offset + 1, 0); }
1010 break;
1011
1012 /*-----------------------------------------------------------------*/
1013 case OP_EODN:
1014 if (clen == 0 && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1015 could_continue = TRUE;
1016 else if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - mb->nllen))
1017 { ADD_ACTIVE(state_offset + 1, 0); }
1018 break;
1019
1020 /*-----------------------------------------------------------------*/
1021 case OP_DOLL:
1022 if ((mb->moptions & PCRE2_NOTEOL) == 0)
1023 {
1024 if (clen == 0 && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1025 could_continue = TRUE;
1026 else if (clen == 0 ||
1027 ((mb->poptions & PCRE2_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
1028 (ptr == end_subject - mb->nllen)
1029 ))
1030 { ADD_ACTIVE(state_offset + 1, 0); }
1031 else if (ptr + 1 >= mb->end_subject &&
1032 (mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0 &&
1033 NLBLOCK->nltype == NLTYPE_FIXED &&
1034 NLBLOCK->nllen == 2 &&
1035 c == NLBLOCK->nl[0])
1036 {
1037 if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1038 {
1039 reset_could_continue = TRUE;
1040 ADD_NEW_DATA(-(state_offset + 1), 0, 1);
1041 }
1042 else could_continue = partial_newline = TRUE;
1043 }
1044 }
1045 break;
1046
1047 /*-----------------------------------------------------------------*/
1048 case OP_DOLLM:
1049 if ((mb->moptions & PCRE2_NOTEOL) == 0)
1050 {
1051 if (clen == 0 && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1052 could_continue = TRUE;
1053 else if (clen == 0 ||
1054 ((mb->poptions & PCRE2_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr)))
1055 { ADD_ACTIVE(state_offset + 1, 0); }
1056 else if (ptr + 1 >= mb->end_subject &&
1057 (mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0 &&
1058 NLBLOCK->nltype == NLTYPE_FIXED &&
1059 NLBLOCK->nllen == 2 &&
1060 c == NLBLOCK->nl[0])
1061 {
1062 if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1063 {
1064 reset_could_continue = TRUE;
1065 ADD_NEW_DATA(-(state_offset + 1), 0, 1);
1066 }
1067 else could_continue = partial_newline = TRUE;
1068 }
1069 }
1070 else if (IS_NEWLINE(ptr))
1071 { ADD_ACTIVE(state_offset + 1, 0); }
1072 break;
1073
1074 /*-----------------------------------------------------------------*/
1075
1076 case OP_DIGIT:
1077 case OP_WHITESPACE:
1078 case OP_WORDCHAR:
1079 if (clen > 0 && c < 256 &&
1080 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
1081 { ADD_NEW(state_offset + 1, 0); }
1082 break;
1083
1084 /*-----------------------------------------------------------------*/
1085 case OP_NOT_DIGIT:
1086 case OP_NOT_WHITESPACE:
1087 case OP_NOT_WORDCHAR:
1088 if (clen > 0 && (c >= 256 ||
1089 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
1090 { ADD_NEW(state_offset + 1, 0); }
1091 break;
1092
1093 /*-----------------------------------------------------------------*/
1094 case OP_WORD_BOUNDARY:
1095 case OP_NOT_WORD_BOUNDARY:
1096 {
1097 int left_word, right_word;
1098
1099 if (ptr > start_subject)
1100 {
1101 PCRE2_SPTR temp = ptr - 1;
1102 if (temp < mb->start_used_ptr) mb->start_used_ptr = temp;
1103 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1104 if (utf) { BACKCHAR(temp); }
1105 #endif
1106 GETCHARTEST(d, temp);
1107 #ifdef SUPPORT_UNICODE
1108 if ((mb->poptions & PCRE2_UCP) != 0)
1109 {
1110 if (d == '_') left_word = TRUE; else
1111 {
1112 uint32_t cat = UCD_CATEGORY(d);
1113 left_word = (cat == ucp_L || cat == ucp_N);
1114 }
1115 }
1116 else
1117 #endif
1118 left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
1119 }
1120 else left_word = FALSE;
1121
1122 if (clen > 0)
1123 {
1124 if (ptr >= mb->last_used_ptr)
1125 {
1126 PCRE2_SPTR temp = ptr + 1;
1127 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1128 if (utf) { FORWARDCHARTEST(temp, mb->end_subject); }
1129 #endif
1130 mb->last_used_ptr = temp;
1131 }
1132 #ifdef SUPPORT_UNICODE
1133 if ((mb->poptions & PCRE2_UCP) != 0)
1134 {
1135 if (c == '_') right_word = TRUE; else
1136 {
1137 uint32_t cat = UCD_CATEGORY(c);
1138 right_word = (cat == ucp_L || cat == ucp_N);
1139 }
1140 }
1141 else
1142 #endif
1143 right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
1144 }
1145 else right_word = FALSE;
1146
1147 if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
1148 { ADD_ACTIVE(state_offset + 1, 0); }
1149 }
1150 break;
1151
1152
1153 /*-----------------------------------------------------------------*/
1154 /* Check the next character by Unicode property. We will get here only
1155 if the support is in the binary; otherwise a compile-time error occurs.
1156 */
1157
1158 #ifdef SUPPORT_UNICODE
1159 case OP_PROP:
1160 case OP_NOTPROP:
1161 if (clen > 0)
1162 {
1163 BOOL OK;
1164 const uint32_t *cp;
1165 const ucd_record * prop = GET_UCD(c);
1166 switch(code[1])
1167 {
1168 case PT_ANY:
1169 OK = TRUE;
1170 break;
1171
1172 case PT_LAMP:
1173 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1174 prop->chartype == ucp_Lt;
1175 break;
1176
1177 case PT_GC:
1178 OK = PRIV(ucp_gentype)[prop->chartype] == code[2];
1179 break;
1180
1181 case PT_PC:
1182 OK = prop->chartype == code[2];
1183 break;
1184
1185 case PT_SC:
1186 OK = prop->script == code[2];
1187 break;
1188
1189 /* These are specials for combination cases. */
1190
1191 case PT_ALNUM:
1192 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1193 PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1194 break;
1195
1196 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1197 which means that Perl space and POSIX space are now identical. PCRE
1198 was changed at release 8.34. */
1199
1200 case PT_SPACE: /* Perl space */
1201 case PT_PXSPACE: /* POSIX space */
1202 switch(c)
1203 {
1204 HSPACE_CASES:
1205 VSPACE_CASES:
1206 OK = TRUE;
1207 break;
1208
1209 default:
1210 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1211 break;
1212 }
1213 break;
1214
1215 case PT_WORD:
1216 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1217 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1218 c == CHAR_UNDERSCORE;
1219 break;
1220
1221 case PT_CLIST:
1222 cp = PRIV(ucd_caseless_sets) + code[2];
1223 for (;;)
1224 {
1225 if (c < *cp) { OK = FALSE; break; }
1226 if (c == *cp++) { OK = TRUE; break; }
1227 }
1228 break;
1229
1230 case PT_UCNC:
1231 OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1232 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1233 c >= 0xe000;
1234 break;
1235
1236 /* Should never occur, but keep compilers from grumbling. */
1237
1238 default:
1239 OK = codevalue != OP_PROP;
1240 break;
1241 }
1242
1243 if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
1244 }
1245 break;
1246 #endif
1247
1248
1249
1250 /* ========================================================================== */
1251 /* These opcodes likewise inspect the subject character, but have an
1252 argument that is not a data character. It is one of these opcodes:
1253 OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE,
1254 OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
1255
1256 case OP_TYPEPLUS:
1257 case OP_TYPEMINPLUS:
1258 case OP_TYPEPOSPLUS:
1259 count = current_state->count; /* Already matched */
1260 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1261 if (clen > 0)
1262 {
1263 if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1264 (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1265 NLBLOCK->nltype == NLTYPE_FIXED &&
1266 NLBLOCK->nllen == 2 &&
1267 c == NLBLOCK->nl[0])
1268 {
1269 could_continue = partial_newline = TRUE;
1270 }
1271 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1272 (c < 256 &&
1273 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1274 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1275 {
1276 if (count > 0 && codevalue == OP_TYPEPOSPLUS)
1277 {
1278 active_count--; /* Remove non-match possibility */
1279 next_active_state--;
1280 }
1281 count++;
1282 ADD_NEW(state_offset, count);
1283 }
1284 }
1285 break;
1286
1287 /*-----------------------------------------------------------------*/
1288 case OP_TYPEQUERY:
1289 case OP_TYPEMINQUERY:
1290 case OP_TYPEPOSQUERY:
1291 ADD_ACTIVE(state_offset + 2, 0);
1292 if (clen > 0)
1293 {
1294 if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1295 (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1296 NLBLOCK->nltype == NLTYPE_FIXED &&
1297 NLBLOCK->nllen == 2 &&
1298 c == NLBLOCK->nl[0])
1299 {
1300 could_continue = partial_newline = TRUE;
1301 }
1302 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1303 (c < 256 &&
1304 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1305 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1306 {
1307 if (codevalue == OP_TYPEPOSQUERY)
1308 {
1309 active_count--; /* Remove non-match possibility */
1310 next_active_state--;
1311 }
1312 ADD_NEW(state_offset + 2, 0);
1313 }
1314 }
1315 break;
1316
1317 /*-----------------------------------------------------------------*/
1318 case OP_TYPESTAR:
1319 case OP_TYPEMINSTAR:
1320 case OP_TYPEPOSSTAR:
1321 ADD_ACTIVE(state_offset + 2, 0);
1322 if (clen > 0)
1323 {
1324 if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1325 (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1326 NLBLOCK->nltype == NLTYPE_FIXED &&
1327 NLBLOCK->nllen == 2 &&
1328 c == NLBLOCK->nl[0])
1329 {
1330 could_continue = partial_newline = TRUE;
1331 }
1332 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1333 (c < 256 &&
1334 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1335 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1336 {
1337 if (codevalue == OP_TYPEPOSSTAR)
1338 {
1339 active_count--; /* Remove non-match possibility */
1340 next_active_state--;
1341 }
1342 ADD_NEW(state_offset, 0);
1343 }
1344 }
1345 break;
1346
1347 /*-----------------------------------------------------------------*/
1348 case OP_TYPEEXACT:
1349 count = current_state->count; /* Number already matched */
1350 if (clen > 0)
1351 {
1352 if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1353 (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1354 NLBLOCK->nltype == NLTYPE_FIXED &&
1355 NLBLOCK->nllen == 2 &&
1356 c == NLBLOCK->nl[0])
1357 {
1358 could_continue = partial_newline = TRUE;
1359 }
1360 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1361 (c < 256 &&
1362 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1363 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1364 {
1365 if (++count >= (int)GET2(code, 1))
1366 { ADD_NEW(state_offset + 1 + IMM2_SIZE + 1, 0); }
1367 else
1368 { ADD_NEW(state_offset, count); }
1369 }
1370 }
1371 break;
1372
1373 /*-----------------------------------------------------------------*/
1374 case OP_TYPEUPTO:
1375 case OP_TYPEMINUPTO:
1376 case OP_TYPEPOSUPTO:
1377 ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0);
1378 count = current_state->count; /* Number already matched */
1379 if (clen > 0)
1380 {
1381 if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1382 (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1383 NLBLOCK->nltype == NLTYPE_FIXED &&
1384 NLBLOCK->nllen == 2 &&
1385 c == NLBLOCK->nl[0])
1386 {
1387 could_continue = partial_newline = TRUE;
1388 }
1389 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1390 (c < 256 &&
1391 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1392 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1393 {
1394 if (codevalue == OP_TYPEPOSUPTO)
1395 {
1396 active_count--; /* Remove non-match possibility */
1397 next_active_state--;
1398 }
1399 if (++count >= (int)GET2(code, 1))
1400 { ADD_NEW(state_offset + 2 + IMM2_SIZE, 0); }
1401 else
1402 { ADD_NEW(state_offset, count); }
1403 }
1404 }
1405 break;
1406
1407 /* ========================================================================== */
1408 /* These are virtual opcodes that are used when something like
1409 OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, OP_EXTUNI, or one of the
1410 OP_VSPACE/OP_HSPACE types as its argument. It keeps the code above fast
1411 for the other cases. The argument is in the d variable. */
1412
1413 #ifdef SUPPORT_UNICODE
1414 case OP_PROP_EXTRA + OP_TYPEPLUS:
1415 case OP_PROP_EXTRA + OP_TYPEMINPLUS:
1416 case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
1417 count = current_state->count; /* Already matched */
1418 if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
1419 if (clen > 0)
1420 {
1421 BOOL OK;
1422 const uint32_t *cp;
1423 const ucd_record * prop = GET_UCD(c);
1424 switch(code[2])
1425 {
1426 case PT_ANY:
1427 OK = TRUE;
1428 break;
1429
1430 case PT_LAMP:
1431 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1432 prop->chartype == ucp_Lt;
1433 break;
1434
1435 case PT_GC:
1436 OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1437 break;
1438
1439 case PT_PC:
1440 OK = prop->chartype == code[3];
1441 break;
1442
1443 case PT_SC:
1444 OK = prop->script == code[3];
1445 break;
1446
1447 /* These are specials for combination cases. */
1448
1449 case PT_ALNUM:
1450 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1451 PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1452 break;
1453
1454 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1455 which means that Perl space and POSIX space are now identical. PCRE
1456 was changed at release 8.34. */
1457
1458 case PT_SPACE: /* Perl space */
1459 case PT_PXSPACE: /* POSIX space */
1460 switch(c)
1461 {
1462 HSPACE_CASES:
1463 VSPACE_CASES:
1464 OK = TRUE;
1465 break;
1466
1467 default:
1468 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1469 break;
1470 }
1471 break;
1472
1473 case PT_WORD:
1474 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1475 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1476 c == CHAR_UNDERSCORE;
1477 break;
1478
1479 case PT_CLIST:
1480 cp = PRIV(ucd_caseless_sets) + code[3];
1481 for (;;)
1482 {
1483 if (c < *cp) { OK = FALSE; break; }
1484 if (c == *cp++) { OK = TRUE; break; }
1485 }
1486 break;
1487
1488 case PT_UCNC:
1489 OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1490 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1491 c >= 0xe000;
1492 break;
1493
1494 /* Should never occur, but keep compilers from grumbling. */
1495
1496 default:
1497 OK = codevalue != OP_PROP;
1498 break;
1499 }
1500
1501 if (OK == (d == OP_PROP))
1502 {
1503 if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1504 {
1505 active_count--; /* Remove non-match possibility */
1506 next_active_state--;
1507 }
1508 count++;
1509 ADD_NEW(state_offset, count);
1510 }
1511 }
1512 break;
1513
1514 /*-----------------------------------------------------------------*/
1515 case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1516 case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1517 case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1518 count = current_state->count; /* Already matched */
1519 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1520 if (clen > 0)
1521 {
1522 int ncount = 0;
1523 if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1524 {
1525 active_count--; /* Remove non-match possibility */
1526 next_active_state--;
1527 }
1528 (void)PRIV(extuni)(c, ptr + clen, mb->start_subject, end_subject, utf,
1529 &ncount);
1530 count++;
1531 ADD_NEW_DATA(-state_offset, count, ncount);
1532 }
1533 break;
1534 #endif
1535
1536 /*-----------------------------------------------------------------*/
1537 case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1538 case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1539 case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1540 count = current_state->count; /* Already matched */
1541 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1542 if (clen > 0)
1543 {
1544 int ncount = 0;
1545 switch (c)
1546 {
1547 case CHAR_VT:
1548 case CHAR_FF:
1549 case CHAR_NEL:
1550 #ifndef EBCDIC
1551 case 0x2028:
1552 case 0x2029:
1553 #endif /* Not EBCDIC */
1554 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
1555 goto ANYNL01;
1556
1557 case CHAR_CR:
1558 if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
1559 /* Fall through */
1560
1561 ANYNL01:
1562 case CHAR_LF:
1563 if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1564 {
1565 active_count--; /* Remove non-match possibility */
1566 next_active_state--;
1567 }
1568 count++;
1569 ADD_NEW_DATA(-state_offset, count, ncount);
1570 break;
1571
1572 default:
1573 break;
1574 }
1575 }
1576 break;
1577
1578 /*-----------------------------------------------------------------*/
1579 case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1580 case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1581 case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1582 count = current_state->count; /* Already matched */
1583 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1584 if (clen > 0)
1585 {
1586 BOOL OK;
1587 switch (c)
1588 {
1589 VSPACE_CASES:
1590 OK = TRUE;
1591 break;
1592
1593 default:
1594 OK = FALSE;
1595 break;
1596 }
1597
1598 if (OK == (d == OP_VSPACE))
1599 {
1600 if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1601 {
1602 active_count--; /* Remove non-match possibility */
1603 next_active_state--;
1604 }
1605 count++;
1606 ADD_NEW_DATA(-state_offset, count, 0);
1607 }
1608 }
1609 break;
1610
1611 /*-----------------------------------------------------------------*/
1612 case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1613 case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1614 case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1615 count = current_state->count; /* Already matched */
1616 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1617 if (clen > 0)
1618 {
1619 BOOL OK;
1620 switch (c)
1621 {
1622 HSPACE_CASES:
1623 OK = TRUE;
1624 break;
1625
1626 default:
1627 OK = FALSE;
1628 break;
1629 }
1630
1631 if (OK == (d == OP_HSPACE))
1632 {
1633 if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1634 {
1635 active_count--; /* Remove non-match possibility */
1636 next_active_state--;
1637 }
1638 count++;
1639 ADD_NEW_DATA(-state_offset, count, 0);
1640 }
1641 }
1642 break;
1643
1644 /*-----------------------------------------------------------------*/
1645 #ifdef SUPPORT_UNICODE
1646 case OP_PROP_EXTRA + OP_TYPEQUERY:
1647 case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1648 case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1649 count = 4;
1650 goto QS1;
1651
1652 case OP_PROP_EXTRA + OP_TYPESTAR:
1653 case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1654 case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1655 count = 0;
1656
1657 QS1:
1658
1659 ADD_ACTIVE(state_offset + 4, 0);
1660 if (clen > 0)
1661 {
1662 BOOL OK;
1663 const uint32_t *cp;
1664 const ucd_record * prop = GET_UCD(c);
1665 switch(code[2])
1666 {
1667 case PT_ANY:
1668 OK = TRUE;
1669 break;
1670
1671 case PT_LAMP:
1672 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1673 prop->chartype == ucp_Lt;
1674 break;
1675
1676 case PT_GC:
1677 OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1678 break;
1679
1680 case PT_PC:
1681 OK = prop->chartype == code[3];
1682 break;
1683
1684 case PT_SC:
1685 OK = prop->script == code[3];
1686 break;
1687
1688 /* These are specials for combination cases. */
1689
1690 case PT_ALNUM:
1691 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1692 PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1693 break;
1694
1695 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1696 which means that Perl space and POSIX space are now identical. PCRE
1697 was changed at release 8.34. */
1698
1699 case PT_SPACE: /* Perl space */
1700 case PT_PXSPACE: /* POSIX space */
1701 switch(c)
1702 {
1703 HSPACE_CASES:
1704 VSPACE_CASES:
1705 OK = TRUE;
1706 break;
1707
1708 default:
1709 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1710 break;
1711 }
1712 break;
1713
1714 case PT_WORD:
1715 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1716 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1717 c == CHAR_UNDERSCORE;
1718 break;
1719
1720 case PT_CLIST:
1721 cp = PRIV(ucd_caseless_sets) + code[3];
1722 for (;;)
1723 {
1724 if (c < *cp) { OK = FALSE; break; }
1725 if (c == *cp++) { OK = TRUE; break; }
1726 }
1727 break;
1728
1729 case PT_UCNC:
1730 OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1731 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1732 c >= 0xe000;
1733 break;
1734
1735 /* Should never occur, but keep compilers from grumbling. */
1736
1737 default:
1738 OK = codevalue != OP_PROP;
1739 break;
1740 }
1741
1742 if (OK == (d == OP_PROP))
1743 {
1744 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1745 codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1746 {
1747 active_count--; /* Remove non-match possibility */
1748 next_active_state--;
1749 }
1750 ADD_NEW(state_offset + count, 0);
1751 }
1752 }
1753 break;
1754
1755 /*-----------------------------------------------------------------*/
1756 case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1757 case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1758 case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1759 count = 2;
1760 goto QS2;
1761
1762 case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1763 case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1764 case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1765 count = 0;
1766
1767 QS2:
1768
1769 ADD_ACTIVE(state_offset + 2, 0);
1770 if (clen > 0)
1771 {
1772 int ncount = 0;
1773 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1774 codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1775 {
1776 active_count--; /* Remove non-match possibility */
1777 next_active_state--;
1778 }
1779 (void)PRIV(extuni)(c, ptr + clen, mb->start_subject, end_subject, utf,
1780 &ncount);
1781 ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1782 }
1783 break;
1784 #endif
1785
1786 /*-----------------------------------------------------------------*/
1787 case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1788 case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1789 case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1790 count = 2;
1791 goto QS3;
1792
1793 case OP_ANYNL_EXTRA + OP_TYPESTAR:
1794 case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1795 case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1796 count = 0;
1797
1798 QS3:
1799 ADD_ACTIVE(state_offset + 2, 0);
1800 if (clen > 0)
1801 {
1802 int ncount = 0;
1803 switch (c)
1804 {
1805 case CHAR_VT:
1806 case CHAR_FF:
1807 case CHAR_NEL:
1808 #ifndef EBCDIC
1809 case 0x2028:
1810 case 0x2029:
1811 #endif /* Not EBCDIC */
1812 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
1813 goto ANYNL02;
1814
1815 case CHAR_CR:
1816 if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
1817 /* Fall through */
1818
1819 ANYNL02:
1820 case CHAR_LF:
1821 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1822 codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1823 {
1824 active_count--; /* Remove non-match possibility */
1825 next_active_state--;
1826 }
1827 ADD_NEW_DATA(-(state_offset + (int)count), 0, ncount);
1828 break;
1829
1830 default:
1831 break;
1832 }
1833 }
1834 break;
1835
1836 /*-----------------------------------------------------------------*/
1837 case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1838 case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1839 case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1840 count = 2;
1841 goto QS4;
1842
1843 case OP_VSPACE_EXTRA + OP_TYPESTAR:
1844 case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1845 case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1846 count = 0;
1847
1848 QS4:
1849 ADD_ACTIVE(state_offset + 2, 0);
1850 if (clen > 0)
1851 {
1852 BOOL OK;
1853 switch (c)
1854 {
1855 VSPACE_CASES:
1856 OK = TRUE;
1857 break;
1858
1859 default:
1860 OK = FALSE;
1861 break;
1862 }
1863 if (OK == (d == OP_VSPACE))
1864 {
1865 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1866 codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1867 {
1868 active_count--; /* Remove non-match possibility */
1869 next_active_state--;
1870 }
1871 ADD_NEW_DATA(-(state_offset + (int)count), 0, 0);
1872 }
1873 }
1874 break;
1875
1876 /*-----------------------------------------------------------------*/
1877 case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1878 case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1879 case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1880 count = 2;
1881 goto QS5;
1882
1883 case OP_HSPACE_EXTRA + OP_TYPESTAR:
1884 case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1885 case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1886 count = 0;
1887
1888 QS5:
1889 ADD_ACTIVE(state_offset + 2, 0);
1890 if (clen > 0)
1891 {
1892 BOOL OK;
1893 switch (c)
1894 {
1895 HSPACE_CASES:
1896 OK = TRUE;
1897 break;
1898
1899 default:
1900 OK = FALSE;
1901 break;
1902 }
1903
1904 if (OK == (d == OP_HSPACE))
1905 {
1906 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1907 codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1908 {
1909 active_count--; /* Remove non-match possibility */
1910 next_active_state--;
1911 }
1912 ADD_NEW_DATA(-(state_offset + (int)count), 0, 0);
1913 }
1914 }
1915 break;
1916
1917 /*-----------------------------------------------------------------*/
1918 #ifdef SUPPORT_UNICODE
1919 case OP_PROP_EXTRA + OP_TYPEEXACT:
1920 case OP_PROP_EXTRA + OP_TYPEUPTO:
1921 case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1922 case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1923 if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1924 { ADD_ACTIVE(state_offset + 1 + IMM2_SIZE + 3, 0); }
1925 count = current_state->count; /* Number already matched */
1926 if (clen > 0)
1927 {
1928 BOOL OK;
1929 const uint32_t *cp;
1930 const ucd_record * prop = GET_UCD(c);
1931 switch(code[1 + IMM2_SIZE + 1])
1932 {
1933 case PT_ANY:
1934 OK = TRUE;
1935 break;
1936
1937 case PT_LAMP:
1938 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1939 prop->chartype == ucp_Lt;
1940 break;
1941
1942 case PT_GC:
1943 OK = PRIV(ucp_gentype)[prop->chartype] == code[1 + IMM2_SIZE + 2];
1944 break;
1945
1946 case PT_PC:
1947 OK = prop->chartype == code[1 + IMM2_SIZE + 2];
1948 break;
1949
1950 case PT_SC:
1951 OK = prop->script == code[1 + IMM2_SIZE + 2];
1952 break;
1953
1954 /* These are specials for combination cases. */
1955
1956 case PT_ALNUM:
1957 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1958 PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1959 break;
1960
1961 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1962 which means that Perl space and POSIX space are now identical. PCRE
1963 was changed at release 8.34. */
1964
1965 case PT_SPACE: /* Perl space */
1966 case PT_PXSPACE: /* POSIX space */
1967 switch(c)
1968 {
1969 HSPACE_CASES:
1970 VSPACE_CASES:
1971 OK = TRUE;
1972 break;
1973
1974 default:
1975 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1976 break;
1977 }
1978 break;
1979
1980 case PT_WORD:
1981 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1982 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1983 c == CHAR_UNDERSCORE;
1984 break;
1985
1986 case PT_CLIST:
1987 cp = PRIV(ucd_caseless_sets) + code[1 + IMM2_SIZE + 2];
1988 for (;;)
1989 {
1990 if (c < *cp) { OK = FALSE; break; }
1991 if (c == *cp++) { OK = TRUE; break; }
1992 }
1993 break;
1994
1995 case PT_UCNC:
1996 OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1997 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1998 c >= 0xe000;
1999 break;
2000
2001 /* Should never occur, but keep compilers from grumbling. */
2002
2003 default:
2004 OK = codevalue != OP_PROP;
2005 break;
2006 }
2007
2008 if (OK == (d == OP_PROP))
2009 {
2010 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
2011 {
2012 active_count--; /* Remove non-match possibility */
2013 next_active_state--;
2014 }
2015 if (++count >= (int)GET2(code, 1))
2016 { ADD_NEW(state_offset + 1 + IMM2_SIZE + 3, 0); }
2017 else
2018 { ADD_NEW(state_offset, count); }
2019 }
2020 }
2021 break;
2022
2023 /*-----------------------------------------------------------------*/
2024 case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
2025 case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
2026 case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
2027 case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
2028 if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
2029 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2030 count = current_state->count; /* Number already matched */
2031 if (clen > 0)
2032 {
2033 PCRE2_SPTR nptr;
2034 int ncount = 0;
2035 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
2036 {
2037 active_count--; /* Remove non-match possibility */
2038 next_active_state--;
2039 }
2040 nptr = PRIV(extuni)(c, ptr + clen, mb->start_subject, end_subject, utf,
2041 &ncount);
2042 if (nptr >= end_subject && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
2043 reset_could_continue = TRUE;
2044 if (++count >= (int)GET2(code, 1))
2045 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
2046 else
2047 { ADD_NEW_DATA(-state_offset, count, ncount); }
2048 }
2049 break;
2050 #endif
2051
2052 /*-----------------------------------------------------------------*/
2053 case OP_ANYNL_EXTRA + OP_TYPEEXACT:
2054 case OP_ANYNL_EXTRA + OP_TYPEUPTO:
2055 case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
2056 case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
2057 if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
2058 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2059 count = current_state->count; /* Number already matched */
2060 if (clen > 0)
2061 {
2062 int ncount = 0;
2063 switch (c)
2064 {
2065 case CHAR_VT:
2066 case CHAR_FF:
2067 case CHAR_NEL:
2068 #ifndef EBCDIC
2069 case 0x2028:
2070 case 0x2029:
2071 #endif /* Not EBCDIC */
2072 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
2073 goto ANYNL03;
2074
2075 case CHAR_CR:
2076 if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
2077 /* Fall through */
2078
2079 ANYNL03:
2080 case CHAR_LF:
2081 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
2082 {
2083 active_count--; /* Remove non-match possibility */
2084 next_active_state--;
2085 }
2086 if (++count >= (int)GET2(code, 1))
2087 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
2088 else
2089 { ADD_NEW_DATA(-state_offset, count, ncount); }
2090 break;
2091
2092 default:
2093 break;
2094 }
2095 }
2096 break;
2097
2098 /*-----------------------------------------------------------------*/
2099 case OP_VSPACE_EXTRA + OP_TYPEEXACT:
2100 case OP_VSPACE_EXTRA + OP_TYPEUPTO:
2101 case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
2102 case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
2103 if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
2104 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2105 count = current_state->count; /* Number already matched */
2106 if (clen > 0)
2107 {
2108 BOOL OK;
2109 switch (c)
2110 {
2111 VSPACE_CASES:
2112 OK = TRUE;
2113 break;
2114
2115 default:
2116 OK = FALSE;
2117 }
2118
2119 if (OK == (d == OP_VSPACE))
2120 {
2121 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
2122 {
2123 active_count--; /* Remove non-match possibility */
2124 next_active_state--;
2125 }
2126 if (++count >= (int)GET2(code, 1))
2127 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2128 else
2129 { ADD_NEW_DATA(-state_offset, count, 0); }
2130 }
2131 }
2132 break;
2133
2134 /*-----------------------------------------------------------------*/
2135 case OP_HSPACE_EXTRA + OP_TYPEEXACT:
2136 case OP_HSPACE_EXTRA + OP_TYPEUPTO:
2137 case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
2138 case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
2139 if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
2140 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2141 count = current_state->count; /* Number already matched */
2142 if (clen > 0)
2143 {
2144 BOOL OK;
2145 switch (c)
2146 {
2147 HSPACE_CASES:
2148 OK = TRUE;
2149 break;
2150
2151 default:
2152 OK = FALSE;
2153 break;
2154 }
2155
2156 if (OK == (d == OP_HSPACE))
2157 {
2158 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
2159 {
2160 active_count--; /* Remove non-match possibility */
2161 next_active_state--;
2162 }
2163 if (++count >= (int)GET2(code, 1))
2164 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2165 else
2166 { ADD_NEW_DATA(-state_offset, count, 0); }
2167 }
2168 }
2169 break;
2170
2171 /* ========================================================================== */
2172 /* These opcodes are followed by a character that is usually compared
2173 to the current subject character; it is loaded into d. We still get
2174 here even if there is no subject character, because in some cases zero
2175 repetitions are permitted. */
2176
2177 /*-----------------------------------------------------------------*/
2178 case OP_CHAR:
2179 if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
2180 break;
2181
2182 /*-----------------------------------------------------------------*/
2183 case OP_CHARI:
2184 if (clen == 0) break;
2185
2186 #ifdef SUPPORT_UNICODE
2187 if (utf)
2188 {
2189 if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
2190 {
2191 unsigned int othercase;
2192 if (c < 128)
2193 othercase = fcc[c];
2194 else
2195 othercase = UCD_OTHERCASE(c);
2196 if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
2197 }
2198 }
2199 else
2200 #endif /* SUPPORT_UNICODE */
2201 /* Not UTF mode */
2202 {
2203 if (TABLE_GET(c, lcc, c) == TABLE_GET(d, lcc, d))
2204 { ADD_NEW(state_offset + 2, 0); }
2205 }
2206 break;
2207
2208
2209 #ifdef SUPPORT_UNICODE
2210 /*-----------------------------------------------------------------*/
2211 /* This is a tricky one because it can match more than one character.
2212 Find out how many characters to skip, and then set up a negative state
2213 to wait for them to pass before continuing. */
2214
2215 case OP_EXTUNI:
2216 if (clen > 0)
2217 {
2218 int ncount = 0;
2219 PCRE2_SPTR nptr = PRIV(extuni)(c, ptr + clen, mb->start_subject,
2220 end_subject, utf, &ncount);
2221 if (nptr >= end_subject && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
2222 reset_could_continue = TRUE;
2223 ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
2224 }
2225 break;
2226 #endif
2227
2228 /*-----------------------------------------------------------------*/
2229 /* This is tricky, like EXTUNI, because it too can match more than one
2230 character (when CR is followed by LF). In this case, set up a negative
2231 state to wait for one character to pass before continuing. */
2232
2233 case OP_ANYNL:
2234 if (clen > 0) switch(c)
2235 {
2236 case CHAR_VT:
2237 case CHAR_FF:
2238 case CHAR_NEL:
2239 #ifndef EBCDIC
2240 case 0x2028:
2241 case 0x2029:
2242 #endif /* Not EBCDIC */
2243 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
2244 /* Fall through */
2245
2246 case CHAR_LF:
2247 ADD_NEW(state_offset + 1, 0);
2248 break;
2249
2250 case CHAR_CR:
2251 if (ptr + 1 >= end_subject)
2252 {
2253 ADD_NEW(state_offset + 1, 0);
2254 if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
2255 reset_could_continue = TRUE;
2256 }
2257 else if (UCHAR21TEST(ptr + 1) == CHAR_LF)
2258 {
2259 ADD_NEW_DATA(-(state_offset + 1), 0, 1);
2260 }
2261 else
2262 {
2263 ADD_NEW(state_offset + 1, 0);
2264 }
2265 break;
2266 }
2267 break;
2268
2269 /*-----------------------------------------------------------------*/
2270 case OP_NOT_VSPACE:
2271 if (clen > 0) switch(c)
2272 {
2273 VSPACE_CASES:
2274 break;
2275
2276 default:
2277 ADD_NEW(state_offset + 1, 0);
2278 break;
2279 }
2280 break;
2281
2282 /*-----------------------------------------------------------------*/
2283 case OP_VSPACE:
2284 if (clen > 0) switch(c)
2285 {
2286 VSPACE_CASES:
2287 ADD_NEW(state_offset + 1, 0);
2288 break;
2289
2290 default:
2291 break;
2292 }
2293 break;
2294
2295 /*-----------------------------------------------------------------*/
2296 case OP_NOT_HSPACE:
2297 if (clen > 0) switch(c)
2298 {
2299 HSPACE_CASES:
2300 break;
2301
2302 default:
2303 ADD_NEW(state_offset + 1, 0);
2304 break;
2305 }
2306 break;
2307
2308 /*-----------------------------------------------------------------*/
2309 case OP_HSPACE:
2310 if (clen > 0) switch(c)
2311 {
2312 HSPACE_CASES:
2313 ADD_NEW(state_offset + 1, 0);
2314 break;
2315
2316 default:
2317 break;
2318 }
2319 break;
2320
2321 /*-----------------------------------------------------------------*/
2322 /* Match a negated single character casefully. */
2323
2324 case OP_NOT:
2325 if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); }
2326 break;
2327
2328 /*-----------------------------------------------------------------*/
2329 /* Match a negated single character caselessly. */
2330
2331 case OP_NOTI:
2332 if (clen > 0)
2333 {
2334 uint32_t otherd;
2335 #ifdef SUPPORT_UNICODE
2336 if (utf && d >= 128)
2337 otherd = UCD_OTHERCASE(d);
2338 else
2339 #endif /* SUPPORT_UNICODE */
2340 otherd = TABLE_GET(d, fcc, d);
2341 if (c != d && c != otherd)
2342 { ADD_NEW(state_offset + dlen + 1, 0); }
2343 }
2344 break;
2345
2346 /*-----------------------------------------------------------------*/
2347 case OP_PLUSI:
2348 case OP_MINPLUSI:
2349 case OP_POSPLUSI:
2350 case OP_NOTPLUSI:
2351 case OP_NOTMINPLUSI:
2352 case OP_NOTPOSPLUSI:
2353 caseless = TRUE;
2354 codevalue -= OP_STARI - OP_STAR;
2355
2356 /* Fall through */
2357 case OP_PLUS:
2358 case OP_MINPLUS:
2359 case OP_POSPLUS:
2360 case OP_NOTPLUS:
2361 case OP_NOTMINPLUS:
2362 case OP_NOTPOSPLUS:
2363 count = current_state->count; /* Already matched */
2364 if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
2365 if (clen > 0)
2366 {
2367 uint32_t otherd = NOTACHAR;
2368 if (caseless)
2369 {
2370 #ifdef SUPPORT_UNICODE
2371 if (utf && d >= 128)
2372 otherd = UCD_OTHERCASE(d);
2373 else
2374 #endif /* SUPPORT_UNICODE */
2375 otherd = TABLE_GET(d, fcc, d);
2376 }
2377 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2378 {
2379 if (count > 0 &&
2380 (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
2381 {
2382 active_count--; /* Remove non-match possibility */
2383 next_active_state--;
2384 }
2385 count++;
2386 ADD_NEW(state_offset, count);
2387 }
2388 }
2389 break;
2390
2391 /*-----------------------------------------------------------------*/
2392 case OP_QUERYI:
2393 case OP_MINQUERYI:
2394 case OP_POSQUERYI:
2395 case OP_NOTQUERYI:
2396 case OP_NOTMINQUERYI:
2397 case OP_NOTPOSQUERYI:
2398 caseless = TRUE;
2399 codevalue -= OP_STARI - OP_STAR;
2400 /* Fall through */
2401 case OP_QUERY:
2402 case OP_MINQUERY:
2403 case OP_POSQUERY:
2404 case OP_NOTQUERY:
2405 case OP_NOTMINQUERY:
2406 case OP_NOTPOSQUERY:
2407 ADD_ACTIVE(state_offset + dlen + 1, 0);
2408 if (clen > 0)
2409 {
2410 uint32_t otherd = NOTACHAR;
2411 if (caseless)
2412 {
2413 #ifdef SUPPORT_UNICODE
2414 if (utf && d >= 128)
2415 otherd = UCD_OTHERCASE(d);
2416 else
2417 #endif /* SUPPORT_UNICODE */
2418 otherd = TABLE_GET(d, fcc, d);
2419 }
2420 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2421 {
2422 if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
2423 {
2424 active_count--; /* Remove non-match possibility */
2425 next_active_state--;
2426 }
2427 ADD_NEW(state_offset + dlen + 1, 0);
2428 }
2429 }
2430 break;
2431
2432 /*-----------------------------------------------------------------*/
2433 case OP_STARI:
2434 case OP_MINSTARI:
2435 case OP_POSSTARI:
2436 case OP_NOTSTARI:
2437 case OP_NOTMINSTARI:
2438 case OP_NOTPOSSTARI:
2439 caseless = TRUE;
2440 codevalue -= OP_STARI - OP_STAR;
2441 /* Fall through */
2442 case OP_STAR:
2443 case OP_MINSTAR:
2444 case OP_POSSTAR:
2445 case OP_NOTSTAR:
2446 case OP_NOTMINSTAR:
2447 case OP_NOTPOSSTAR:
2448 ADD_ACTIVE(state_offset + dlen + 1, 0);
2449 if (clen > 0)
2450 {
2451 uint32_t otherd = NOTACHAR;
2452 if (caseless)
2453 {
2454 #ifdef SUPPORT_UNICODE
2455 if (utf && d >= 128)
2456 otherd = UCD_OTHERCASE(d);
2457 else
2458 #endif /* SUPPORT_UNICODE */
2459 otherd = TABLE_GET(d, fcc, d);
2460 }
2461 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2462 {
2463 if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
2464 {
2465 active_count--; /* Remove non-match possibility */
2466 next_active_state--;
2467 }
2468 ADD_NEW(state_offset, 0);
2469 }
2470 }
2471 break;
2472
2473 /*-----------------------------------------------------------------*/
2474 case OP_EXACTI:
2475 case OP_NOTEXACTI:
2476 caseless = TRUE;
2477 codevalue -= OP_STARI - OP_STAR;
2478 /* Fall through */
2479 case OP_EXACT:
2480 case OP_NOTEXACT:
2481 count = current_state->count; /* Number already matched */
2482 if (clen > 0)
2483 {
2484 uint32_t otherd = NOTACHAR;
2485 if (caseless)
2486 {
2487 #ifdef SUPPORT_UNICODE
2488 if (utf && d >= 128)
2489 otherd = UCD_OTHERCASE(d);
2490 else
2491 #endif /* SUPPORT_UNICODE */
2492 otherd = TABLE_GET(d, fcc, d);
2493 }
2494 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2495 {
2496 if (++count >= (int)GET2(code, 1))
2497 { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2498 else
2499 { ADD_NEW(state_offset, count); }
2500 }
2501 }
2502 break;
2503
2504 /*-----------------------------------------------------------------*/
2505 case OP_UPTOI:
2506 case OP_MINUPTOI:
2507 case OP_POSUPTOI:
2508 case OP_NOTUPTOI:
2509 case OP_NOTMINUPTOI:
2510 case OP_NOTPOSUPTOI:
2511 caseless = TRUE;
2512 codevalue -= OP_STARI - OP_STAR;
2513 /* Fall through */
2514 case OP_UPTO:
2515 case OP_MINUPTO:
2516 case OP_POSUPTO:
2517 case OP_NOTUPTO:
2518 case OP_NOTMINUPTO:
2519 case OP_NOTPOSUPTO:
2520 ADD_ACTIVE(state_offset + dlen + 1 + IMM2_SIZE, 0);
2521 count = current_state->count; /* Number already matched */
2522 if (clen > 0)
2523 {
2524 uint32_t otherd = NOTACHAR;
2525 if (caseless)
2526 {
2527 #ifdef SUPPORT_UNICODE
2528 if (utf && d >= 128)
2529 otherd = UCD_OTHERCASE(d);
2530 else
2531 #endif /* SUPPORT_UNICODE */
2532 otherd = TABLE_GET(d, fcc, d);
2533 }
2534 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2535 {
2536 if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2537 {
2538 active_count--; /* Remove non-match possibility */
2539 next_active_state--;
2540 }
2541 if (++count >= (int)GET2(code, 1))
2542 { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2543 else
2544 { ADD_NEW(state_offset, count); }
2545 }
2546 }
2547 break;
2548
2549
2550 /* ========================================================================== */
2551 /* These are the class-handling opcodes */
2552
2553 case OP_CLASS:
2554 case OP_NCLASS:
2555 case OP_XCLASS:
2556 {
2557 BOOL isinclass = FALSE;
2558 int next_state_offset;
2559 PCRE2_SPTR ecode;
2560
2561 /* For a simple class, there is always just a 32-byte table, and we
2562 can set isinclass from it. */
2563
2564 if (codevalue != OP_XCLASS)
2565 {
2566 ecode = code + 1 + (32 / sizeof(PCRE2_UCHAR));
2567 if (clen > 0)
2568 {
2569 isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2570 ((((uint8_t *)(code + 1))[c/8] & (1u << (c&7))) != 0);
2571 }
2572 }
2573
2574 /* An extended class may have a table or a list of single characters,
2575 ranges, or both, and it may be positive or negative. There's a
2576 function that sorts all this out. */
2577
2578 else
2579 {
2580 ecode = code + GET(code, 1);
2581 if (clen > 0) isinclass = PRIV(xclass)(c, code + 1 + LINK_SIZE, utf);
2582 }
2583
2584 /* At this point, isinclass is set for all kinds of class, and ecode
2585 points to the byte after the end of the class. If there is a
2586 quantifier, this is where it will be. */
2587
2588 next_state_offset = (int)(ecode - start_code);
2589
2590 switch (*ecode)
2591 {
2592 case OP_CRSTAR:
2593 case OP_CRMINSTAR:
2594 case OP_CRPOSSTAR:
2595 ADD_ACTIVE(next_state_offset + 1, 0);
2596 if (isinclass)
2597 {
2598 if (*ecode == OP_CRPOSSTAR)
2599 {
2600 active_count--; /* Remove non-match possibility */
2601 next_active_state--;
2602 }
2603 ADD_NEW(state_offset, 0);
2604 }
2605 break;
2606
2607 case OP_CRPLUS:
2608 case OP_CRMINPLUS:
2609 case OP_CRPOSPLUS:
2610 count = current_state->count; /* Already matched */
2611 if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2612 if (isinclass)
2613 {
2614 if (count > 0 && *ecode == OP_CRPOSPLUS)
2615 {
2616 active_count--; /* Remove non-match possibility */
2617 next_active_state--;
2618 }
2619 count++;
2620 ADD_NEW(state_offset, count);
2621 }
2622 break;
2623
2624 case OP_CRQUERY:
2625 case OP_CRMINQUERY:
2626 case OP_CRPOSQUERY:
2627 ADD_ACTIVE(next_state_offset + 1, 0);
2628 if (isinclass)
2629 {
2630 if (*ecode == OP_CRPOSQUERY)
2631 {
2632 active_count--; /* Remove non-match possibility */
2633 next_active_state--;
2634 }
2635 ADD_NEW(next_state_offset + 1, 0);
2636 }
2637 break;
2638
2639 case OP_CRRANGE:
2640 case OP_CRMINRANGE:
2641 case OP_CRPOSRANGE:
2642 count = current_state->count; /* Already matched */
2643 if (count >= (int)GET2(ecode, 1))
2644 { ADD_ACTIVE(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2645 if (isinclass)
2646 {
2647 int max = (int)GET2(ecode, 1 + IMM2_SIZE);
2648
2649 if (*ecode == OP_CRPOSRANGE && count >= (int)GET2(ecode, 1))
2650 {
2651 active_count--; /* Remove non-match possibility */
2652 next_active_state--;
2653 }
2654
2655 if (++count >= max && max != 0) /* Max 0 => no limit */
2656 { ADD_NEW(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2657 else
2658 { ADD_NEW(state_offset, count); }
2659 }
2660 break;
2661
2662 default:
2663 if (isinclass) { ADD_NEW(next_state_offset, 0); }
2664 break;
2665 }
2666 }
2667 break;
2668
2669 /* ========================================================================== */
2670 /* These are the opcodes for fancy brackets of various kinds. We have
2671 to use recursion in order to handle them. The "always failing" assertion
2672 (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2673 though the other "backtracking verbs" are not supported. */
2674
2675 case OP_FAIL:
2676 forced_fail++; /* Count FAILs for multiple states */
2677 break;
2678
2679 case OP_ASSERT:
2680 case OP_ASSERT_NOT:
2681 case OP_ASSERTBACK:
2682 case OP_ASSERTBACK_NOT:
2683 {
2684 int rc;
2685 int *local_workspace;
2686 PCRE2_SIZE *local_offsets;
2687 PCRE2_SPTR endasscode = code + GET(code, 1);
2688 RWS_anchor *rws = (RWS_anchor *)RWS;
2689
2690 if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE)
2691 {
2692 rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb);
2693 if (rc != 0) return rc;
2694 RWS = (int *)rws;
2695 }
2696
2697 local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
2698 local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE;
2699 rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE;
2700
2701 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2702
2703 rc = internal_dfa_match(
2704 mb, /* static match data */
2705 code, /* this subexpression's code */
2706 ptr, /* where we currently are */
2707 (PCRE2_SIZE)(ptr - start_subject), /* start offset */
2708 local_offsets, /* offset vector */
2709 RWS_OVEC_OSIZE/OVEC_UNIT, /* size of same */
2710 local_workspace, /* workspace vector */
2711 RWS_RSIZE, /* size of same */
2712 rlevel, /* function recursion level */
2713 RWS); /* recursion workspace */
2714
2715 rws->free += RWS_RSIZE + RWS_OVEC_OSIZE;
2716
2717 if (rc < 0 && rc != PCRE2_ERROR_NOMATCH) return rc;
2718 if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2719 { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2720 }
2721 break;
2722
2723 /*-----------------------------------------------------------------*/
2724 case OP_COND:
2725 case OP_SCOND:
2726 {
2727 int codelink = (int)GET(code, 1);
2728 PCRE2_UCHAR condcode;
2729
2730 /* Because of the way auto-callout works during compile, a callout item
2731 is inserted between OP_COND and an assertion condition. This does not
2732 happen for the other conditions. */
2733
2734 if (code[LINK_SIZE + 1] == OP_CALLOUT
2735 || code[LINK_SIZE + 1] == OP_CALLOUT_STR)
2736 {
2737 PCRE2_SIZE callout_length;
2738 rrc = do_callout(code, offsets, current_subject, ptr, mb,
2739 1 + LINK_SIZE, &callout_length);
2740 if (rrc < 0) return rrc; /* Abandon */
2741 if (rrc > 0) break; /* Fail this thread */
2742 code += callout_length; /* Skip callout data */
2743 }
2744
2745 condcode = code[LINK_SIZE+1];
2746
2747 /* Back reference conditions and duplicate named recursion conditions
2748 are not supported */
2749
2750 if (condcode == OP_CREF || condcode == OP_DNCREF ||
2751 condcode == OP_DNRREF)
2752 return PCRE2_ERROR_DFA_UCOND;
2753
2754 /* The DEFINE condition is always false, and the assertion (?!) is
2755 converted to OP_FAIL. */
2756
2757 if (condcode == OP_FALSE || condcode == OP_FAIL)
2758 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2759
2760 /* There is also an always-true condition */
2761
2762 else if (condcode == OP_TRUE)
2763 { ADD_ACTIVE(state_offset + LINK_SIZE + 2, 0); }
2764
2765 /* The only supported version of OP_RREF is for the value RREF_ANY,
2766 which means "test if in any recursion". We can't test for specifically
2767 recursed groups. */
2768
2769 else if (condcode == OP_RREF)
2770 {
2771 unsigned int value = GET2(code, LINK_SIZE + 2);
2772 if (value != RREF_ANY) return PCRE2_ERROR_DFA_UCOND;
2773 if (mb->recursive != NULL)
2774 { ADD_ACTIVE(state_offset + LINK_SIZE + 2 + IMM2_SIZE, 0); }
2775 else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2776 }
2777
2778 /* Otherwise, the condition is an assertion */
2779
2780 else
2781 {
2782 int rc;
2783 int *local_workspace;
2784 PCRE2_SIZE *local_offsets;
2785 PCRE2_SPTR asscode = code + LINK_SIZE + 1;
2786 PCRE2_SPTR endasscode = asscode + GET(asscode, 1);
2787 RWS_anchor *rws = (RWS_anchor *)RWS;
2788
2789 if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE)
2790 {
2791 rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb);
2792 if (rc != 0) return rc;
2793 RWS = (int *)rws;
2794 }
2795
2796 local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
2797 local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE;
2798 rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE;
2799
2800 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2801
2802 rc = internal_dfa_match(
2803 mb, /* fixed match data */
2804 asscode, /* this subexpression's code */
2805 ptr, /* where we currently are */
2806 (PCRE2_SIZE)(ptr - start_subject), /* start offset */
2807 local_offsets, /* offset vector */
2808 RWS_OVEC_OSIZE/OVEC_UNIT, /* size of same */
2809 local_workspace, /* workspace vector */
2810 RWS_RSIZE, /* size of same */
2811 rlevel, /* function recursion level */
2812 RWS); /* recursion workspace */
2813
2814 rws->free += RWS_RSIZE + RWS_OVEC_OSIZE;
2815
2816 if (rc < 0 && rc != PCRE2_ERROR_NOMATCH) return rc;
2817 if ((rc >= 0) ==
2818 (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2819 { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2820 else
2821 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2822 }
2823 }
2824 break;
2825
2826 /*-----------------------------------------------------------------*/
2827 case OP_RECURSE:
2828 {
2829 int rc;
2830 int *local_workspace;
2831 PCRE2_SIZE *local_offsets;
2832 RWS_anchor *rws = (RWS_anchor *)RWS;
2833 dfa_recursion_info *ri;
2834 PCRE2_SPTR callpat = start_code + GET(code, 1);
2835 uint32_t recno = (callpat == mb->start_code)? 0 :
2836 GET2(callpat, 1 + LINK_SIZE);
2837
2838 if (rws->free < RWS_RSIZE + RWS_OVEC_RSIZE)
2839 {
2840 rc = more_workspace(&rws, RWS_OVEC_RSIZE, mb);
2841 if (rc != 0) return rc;
2842 RWS = (int *)rws;
2843 }
2844
2845 local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
2846 local_workspace = ((int *)local_offsets) + RWS_OVEC_RSIZE;
2847 rws->free -= RWS_RSIZE + RWS_OVEC_RSIZE;
2848
2849 /* Check for repeating a recursion without advancing the subject
2850 pointer. This should catch convoluted mutual recursions. (Some simple
2851 cases are caught at compile time.) */
2852
2853 for (ri = mb->recursive; ri != NULL; ri = ri->prevrec)
2854 if (recno == ri->group_num && ptr == ri->subject_position)
2855 return PCRE2_ERROR_RECURSELOOP;
2856
2857 /* Remember this recursion and where we started it so as to
2858 catch infinite loops. */
2859
2860 new_recursive.group_num = recno;
2861 new_recursive.subject_position = ptr;
2862 new_recursive.prevrec = mb->recursive;
2863 mb->recursive = &new_recursive;
2864
2865 rc = internal_dfa_match(
2866 mb, /* fixed match data */
2867 callpat, /* this subexpression's code */
2868 ptr, /* where we currently are */
2869 (PCRE2_SIZE)(ptr - start_subject), /* start offset */
2870 local_offsets, /* offset vector */
2871 RWS_OVEC_RSIZE/OVEC_UNIT, /* size of same */
2872 local_workspace, /* workspace vector */
2873 RWS_RSIZE, /* size of same */
2874 rlevel, /* function recursion level */
2875 RWS); /* recursion workspace */
2876
2877 rws->free += RWS_RSIZE + RWS_OVEC_RSIZE;
2878 mb->recursive = new_recursive.prevrec; /* Done this recursion */
2879
2880 /* Ran out of internal offsets */
2881
2882 if (rc == 0) return PCRE2_ERROR_DFA_RECURSE;
2883
2884 /* For each successful matched substring, set up the next state with a
2885 count of characters to skip before trying it. Note that the count is in
2886 characters, not bytes. */
2887
2888 if (rc > 0)
2889 {
2890 for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2891 {
2892 PCRE2_SIZE charcount = local_offsets[rc+1] - local_offsets[rc];
2893 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
2894 if (utf)
2895 {
2896 PCRE2_SPTR p = start_subject + local_offsets[rc];
2897 PCRE2_SPTR pp = start_subject + local_offsets[rc+1];
2898 while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--;
2899 }
2900 #endif
2901 if (charcount > 0)
2902 {
2903 ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0,
2904 (int)(charcount - 1));
2905 }
2906 else
2907 {
2908 ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2909 }
2910 }
2911 }
2912 else if (rc != PCRE2_ERROR_NOMATCH) return rc;
2913 }
2914 break;
2915
2916 /*-----------------------------------------------------------------*/
2917 case OP_BRAPOS:
2918 case OP_SBRAPOS:
2919 case OP_CBRAPOS:
2920 case OP_SCBRAPOS:
2921 case OP_BRAPOSZERO:
2922 {
2923 int rc;
2924 int *local_workspace;
2925 PCRE2_SIZE *local_offsets;
2926 PCRE2_SIZE charcount, matched_count;
2927 PCRE2_SPTR local_ptr = ptr;
2928 RWS_anchor *rws = (RWS_anchor *)RWS;
2929 BOOL allow_zero;
2930
2931 if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE)
2932 {
2933 rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb);
2934 if (rc != 0) return rc;
2935 RWS = (int *)rws;
2936 }
2937
2938 local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
2939 local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE;
2940 rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE;
2941
2942 if (codevalue == OP_BRAPOSZERO)
2943 {
2944 allow_zero = TRUE;
2945 codevalue = *(++code); /* Codevalue will be one of above BRAs */
2946 }
2947 else allow_zero = FALSE;
2948
2949 /* Loop to match the subpattern as many times as possible as if it were
2950 a complete pattern. */
2951
2952 for (matched_count = 0;; matched_count++)
2953 {
2954 rc = internal_dfa_match(
2955 mb, /* fixed match data */
2956 code, /* this subexpression's code */
2957 local_ptr, /* where we currently are */
2958 (PCRE2_SIZE)(ptr - start_subject), /* start offset */
2959 local_offsets, /* offset vector */
2960 RWS_OVEC_OSIZE/OVEC_UNIT, /* size of same */
2961 local_workspace, /* workspace vector */
2962 RWS_RSIZE, /* size of same */
2963 rlevel, /* function recursion level */
2964 RWS); /* recursion workspace */
2965
2966 /* Failed to match */
2967
2968 if (rc < 0)
2969 {
2970 if (rc != PCRE2_ERROR_NOMATCH) return rc;
2971 break;
2972 }
2973
2974 /* Matched: break the loop if zero characters matched. */
2975
2976 charcount = local_offsets[1] - local_offsets[0];
2977 if (charcount == 0) break;
2978 local_ptr += charcount; /* Advance temporary position ptr */
2979 }
2980
2981 rws->free += RWS_RSIZE + RWS_OVEC_OSIZE;
2982
2983 /* At this point we have matched the subpattern matched_count
2984 times, and local_ptr is pointing to the character after the end of the
2985 last match. */
2986
2987 if (matched_count > 0 || allow_zero)
2988 {
2989 PCRE2_SPTR end_subpattern = code;
2990 int next_state_offset;
2991
2992 do { end_subpattern += GET(end_subpattern, 1); }
2993 while (*end_subpattern == OP_ALT);
2994 next_state_offset =
2995 (int)(end_subpattern - start_code + LINK_SIZE + 1);
2996
2997 /* Optimization: if there are no more active states, and there
2998 are no new states yet set up, then skip over the subject string
2999 right here, to save looping. Otherwise, set up the new state to swing
3000 into action when the end of the matched substring is reached. */
3001
3002 if (i + 1 >= active_count && new_count == 0)
3003 {
3004 ptr = local_ptr;
3005 clen = 0;
3006 ADD_NEW(next_state_offset, 0);
3007 }
3008 else
3009 {
3010 PCRE2_SPTR p = ptr;
3011 PCRE2_SPTR pp = local_ptr;
3012 charcount = (PCRE2_SIZE)(pp - p);
3013 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
3014 if (utf) while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--;
3015 #endif
3016 ADD_NEW_DATA(-next_state_offset, 0, (int)(charcount - 1));
3017 }
3018 }
3019 }
3020 break;
3021
3022 /*-----------------------------------------------------------------*/
3023 case OP_ONCE:
3024 {
3025 int rc;
3026 int *local_workspace;
3027 PCRE2_SIZE *local_offsets;
3028 RWS_anchor *rws = (RWS_anchor *)RWS;
3029
3030 if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE)
3031 {
3032 rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb);
3033 if (rc != 0) return rc;
3034 RWS = (int *)rws;
3035 }
3036
3037 local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
3038 local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE;
3039 rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE;
3040
3041 rc = internal_dfa_match(
3042 mb, /* fixed match data */
3043 code, /* this subexpression's code */
3044 ptr, /* where we currently are */
3045 (PCRE2_SIZE)(ptr - start_subject), /* start offset */
3046 local_offsets, /* offset vector */
3047 RWS_OVEC_OSIZE/OVEC_UNIT, /* size of same */
3048 local_workspace, /* workspace vector */
3049 RWS_RSIZE, /* size of same */
3050 rlevel, /* function recursion level */
3051 RWS); /* recursion workspace */
3052
3053 rws->free += RWS_RSIZE + RWS_OVEC_OSIZE;
3054
3055 if (rc >= 0)
3056 {
3057 PCRE2_SPTR end_subpattern = code;
3058 PCRE2_SIZE charcount = local_offsets[1] - local_offsets[0];
3059 int next_state_offset, repeat_state_offset;
3060
3061 do { end_subpattern += GET(end_subpattern, 1); }
3062 while (*end_subpattern == OP_ALT);
3063 next_state_offset =
3064 (int)(end_subpattern - start_code + LINK_SIZE + 1);
3065
3066 /* If the end of this subpattern is KETRMAX or KETRMIN, we must
3067 arrange for the repeat state also to be added to the relevant list.
3068 Calculate the offset, or set -1 for no repeat. */
3069
3070 repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
3071 *end_subpattern == OP_KETRMIN)?
3072 (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1;
3073
3074 /* If we have matched an empty string, add the next state at the
3075 current character pointer. This is important so that the duplicate
3076 checking kicks in, which is what breaks infinite loops that match an
3077 empty string. */
3078
3079 if (charcount == 0)
3080 {
3081 ADD_ACTIVE(next_state_offset, 0);
3082 }
3083
3084 /* Optimization: if there are no more active states, and there
3085 are no new states yet set up, then skip over the subject string
3086 right here, to save looping. Otherwise, set up the new state to swing
3087 into action when the end of the matched substring is reached. */
3088
3089 else if (i + 1 >= active_count && new_count == 0)
3090 {
3091 ptr += charcount;
3092 clen = 0;
3093 ADD_NEW(next_state_offset, 0);
3094
3095 /* If we are adding a repeat state at the new character position,
3096 we must fudge things so that it is the only current state.
3097 Otherwise, it might be a duplicate of one we processed before, and
3098 that would cause it to be skipped. */
3099
3100 if (repeat_state_offset >= 0)
3101 {
3102 next_active_state = active_states;
3103 active_count = 0;
3104 i = -1;
3105 ADD_ACTIVE(repeat_state_offset, 0);
3106 }
3107 }
3108 else
3109 {
3110 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
3111 if (utf)
3112 {
3113 PCRE2_SPTR p = start_subject + local_offsets[0];
3114 PCRE2_SPTR pp = start_subject + local_offsets[1];
3115 while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--;
3116 }
3117 #endif
3118 ADD_NEW_DATA(-next_state_offset, 0, (int)(charcount - 1));
3119 if (repeat_state_offset >= 0)
3120 { ADD_NEW_DATA(-repeat_state_offset, 0, (int)(charcount - 1)); }
3121 }
3122 }
3123 else if (rc != PCRE2_ERROR_NOMATCH) return rc;
3124 }
3125 break;
3126
3127
3128 /* ========================================================================== */
3129 /* Handle callouts */
3130
3131 case OP_CALLOUT:
3132 case OP_CALLOUT_STR:
3133 {
3134 PCRE2_SIZE callout_length;
3135 rrc = do_callout(code, offsets, current_subject, ptr, mb, 0,
3136 &callout_length);
3137 if (rrc < 0) return rrc; /* Abandon */
3138 if (rrc == 0)
3139 { ADD_ACTIVE(state_offset + (int)callout_length, 0); }
3140 }
3141 break;
3142
3143
3144 /* ========================================================================== */
3145 default: /* Unsupported opcode */
3146 return PCRE2_ERROR_DFA_UITEM;
3147 }
3148
3149 NEXT_ACTIVE_STATE: continue;
3150
3151 } /* End of loop scanning active states */
3152
3153 /* We have finished the processing at the current subject character. If no
3154 new states have been set for the next character, we have found all the
3155 matches that we are going to find. If we are at the top level and partial
3156 matching has been requested, check for appropriate conditions.
3157
  The "forced_fail" variable counts the number of (*F) encountered for the
3159 character. If it is equal to the original active_count (saved in
3160 workspace[1]) it means that (*F) was found on every active state. In this
3161 case we don't want to give a partial match.
3162
3163 The "could_continue" variable is true if a state could have continued but
3164 for the fact that the end of the subject was reached. */
3165
3166 if (new_count <= 0)
3167 {
3168 if (rlevel == 1 && /* Top level, and */
3169 could_continue && /* Some could go on, and */
3170 forced_fail != workspace[1] && /* Not all forced fail & */
3171 ( /* either... */
3172 (mb->moptions & PCRE2_PARTIAL_HARD) != 0 /* Hard partial */
3173 || /* or... */
3174 ((mb->moptions & PCRE2_PARTIAL_SOFT) != 0 && /* Soft partial and */
3175 match_count < 0) /* no matches */
3176 ) && /* And... */
3177 (
3178 partial_newline || /* Either partial NL */
3179 ( /* or ... */
3180 ptr >= end_subject && /* End of subject and */
3181 ptr > mb->start_used_ptr) /* Inspected non-empty string */
3182 )
3183 )
3184 match_count = PCRE2_ERROR_PARTIAL;
3185 break; /* Exit from loop along the subject string */
3186 }
3187
3188 /* One or more states are active for the next character. */
3189
3190 ptr += clen; /* Advance to next subject character */
3191 } /* Loop to move along the subject string */
3192
3193 /* Control gets here from "break" a few lines above. If we have a match and
3194 PCRE2_ENDANCHORED is set, the match fails. */
3195
3196 if (match_count >= 0 &&
3197 ((mb->moptions | mb->poptions) & PCRE2_ENDANCHORED) != 0 &&
3198 ptr < end_subject)
3199 match_count = PCRE2_ERROR_NOMATCH;
3200
3201 return match_count;
3202 }
3203
3204
3205
3206 /*************************************************
3207 * Match a pattern using the DFA algorithm *
3208 *************************************************/
3209
3210 /* This function matches a compiled pattern to a subject string, using the
3211 alternate matching algorithm that finds all matches at once.
3212
3213 Arguments:
3214 code points to the compiled pattern
3215 subject subject string
3216 length length of subject string
3217 startoffset where to start matching in the subject
3218 options option bits
3219 match_data points to a match data structure
3220 gcontext points to a match context
3221 workspace pointer to workspace
3222 wscount size of workspace
3223
3224 Returns: > 0 => number of match offset pairs placed in offsets
3225 = 0 => offsets overflowed; longest matches are present
3226 -1 => failed to match
3227 < -1 => some kind of unexpected problem
3228 */
3229
3230 PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
pcre2_dfa_match(const pcre2_code * code,PCRE2_SPTR subject,PCRE2_SIZE length,PCRE2_SIZE start_offset,uint32_t options,pcre2_match_data * match_data,pcre2_match_context * mcontext,int * workspace,PCRE2_SIZE wscount)3231 pcre2_dfa_match(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length,
3232 PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data,
3233 pcre2_match_context *mcontext, int *workspace, PCRE2_SIZE wscount)
3234 {
3235 int rc;
3236 int was_zero_terminated = 0;
3237
3238 const pcre2_real_code *re = (const pcre2_real_code *)code;
3239
3240 PCRE2_SPTR start_match;
3241 PCRE2_SPTR end_subject;
3242 PCRE2_SPTR bumpalong_limit;
3243 PCRE2_SPTR req_cu_ptr;
3244
3245 BOOL utf, anchored, startline, firstline;
3246 BOOL has_first_cu = FALSE;
3247 BOOL has_req_cu = FALSE;
3248
3249 PCRE2_UCHAR first_cu = 0;
3250 PCRE2_UCHAR first_cu2 = 0;
3251 PCRE2_UCHAR req_cu = 0;
3252 PCRE2_UCHAR req_cu2 = 0;
3253
3254 const uint8_t *start_bits = NULL;
3255
3256 /* We need to have mb pointing to a match block, because the IS_NEWLINE macro
3257 is used below, and it expects NLBLOCK to be defined as a pointer. */
3258
3259 pcre2_callout_block cb;
3260 dfa_match_block actual_match_block;
3261 dfa_match_block *mb = &actual_match_block;
3262
3263 /* Set up a starting block of memory for use during recursive calls to
3264 internal_dfa_match(). By putting this on the stack, it minimizes resource use
3265 in the case when it is not needed. If this is too small, more memory is
3266 obtained from the heap. At the start of each block is an anchor structure.*/
3267
3268 int base_recursion_workspace[RWS_BASE_SIZE];
3269 RWS_anchor *rws = (RWS_anchor *)base_recursion_workspace;
3270 rws->next = NULL;
3271 rws->size = RWS_BASE_SIZE;
3272 rws->free = RWS_BASE_SIZE - RWS_ANCHOR_SIZE;
3273
3274 /* A length equal to PCRE2_ZERO_TERMINATED implies a zero-terminated
3275 subject string. */
3276
3277 if (length == PCRE2_ZERO_TERMINATED)
3278 {
3279 length = PRIV(strlen)(subject);
3280 was_zero_terminated = 1;
3281 }
3282
3283 /* Plausibility checks */
3284
3285 if ((options & ~PUBLIC_DFA_MATCH_OPTIONS) != 0) return PCRE2_ERROR_BADOPTION;
3286 if (re == NULL || subject == NULL || workspace == NULL || match_data == NULL)
3287 return PCRE2_ERROR_NULL;
3288 if (wscount < 20) return PCRE2_ERROR_DFA_WSSIZE;
3289 if (start_offset > length) return PCRE2_ERROR_BADOFFSET;
3290
3291 /* Partial matching and PCRE2_ENDANCHORED are currently not allowed at the same
3292 time. */
3293
3294 if ((options & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0 &&
3295 ((re->overall_options | options) & PCRE2_ENDANCHORED) != 0)
3296 return PCRE2_ERROR_BADOPTION;
3297
3298 /* Check that the first field in the block is the magic number. If it is not,
3299 return with PCRE2_ERROR_BADMAGIC. */
3300
3301 if (re->magic_number != MAGIC_NUMBER) return PCRE2_ERROR_BADMAGIC;
3302
3303 /* Check the code unit width. */
3304
3305 if ((re->flags & PCRE2_MODE_MASK) != PCRE2_CODE_UNIT_WIDTH/8)
3306 return PCRE2_ERROR_BADMODE;
3307
3308 /* PCRE2_NOTEMPTY and PCRE2_NOTEMPTY_ATSTART are match-time flags in the
3309 options variable for this function. Users of PCRE2 who are not calling the
3310 function directly would like to have a way of setting these flags, in the same
3311 way that they can set pcre2_compile() flags like PCRE2_NO_AUTOPOSSESS with
3312 constructions like (*NO_AUTOPOSSESS). To enable this, (*NOTEMPTY) and
3313 (*NOTEMPTY_ATSTART) set bits in the pattern's "flag" function which can now be
3314 transferred to the options for this function. The bits are guaranteed to be
3315 adjacent, but do not have the same values. This bit of Boolean trickery assumes
3316 that the match-time bits are not more significant than the flag bits. If by
3317 accident this is not the case, a compile-time division by zero error will
3318 occur. */
3319
3320 #define FF (PCRE2_NOTEMPTY_SET|PCRE2_NE_ATST_SET)
3321 #define OO (PCRE2_NOTEMPTY|PCRE2_NOTEMPTY_ATSTART)
3322 options |= (re->flags & FF) / ((FF & (~FF+1)) / (OO & (~OO+1)));
3323 #undef FF
3324 #undef OO
3325
3326 /* If restarting after a partial match, do some sanity checks on the contents
3327 of the workspace. */
3328
3329 if ((options & PCRE2_DFA_RESTART) != 0)
3330 {
3331 if ((workspace[0] & (-2)) != 0 || workspace[1] < 1 ||
3332 workspace[1] > (int)((wscount - 2)/INTS_PER_STATEBLOCK))
3333 return PCRE2_ERROR_DFA_BADRESTART;
3334 }
3335
3336 /* Set some local values */
3337
3338 utf = (re->overall_options & PCRE2_UTF) != 0;
3339 start_match = subject + start_offset;
3340 end_subject = subject + length;
3341 req_cu_ptr = start_match - 1;
3342 anchored = (options & (PCRE2_ANCHORED|PCRE2_DFA_RESTART)) != 0 ||
3343 (re->overall_options & PCRE2_ANCHORED) != 0;
3344
3345 /* The "must be at the start of a line" flags are used in a loop when finding
3346 where to start. */
3347
3348 startline = (re->flags & PCRE2_STARTLINE) != 0;
3349 firstline = (re->overall_options & PCRE2_FIRSTLINE) != 0;
3350 bumpalong_limit = end_subject;
3351
3352 /* Initialize and set up the fixed fields in the callout block, with a pointer
3353 in the match block. */
3354
3355 mb->cb = &cb;
3356 cb.version = 2;
3357 cb.subject = subject;
3358 cb.subject_length = (PCRE2_SIZE)(end_subject - subject);
3359 cb.callout_flags = 0;
3360 cb.capture_top = 1; /* No capture support */
3361 cb.capture_last = 0;
3362 cb.mark = NULL; /* No (*MARK) support */
3363
3364 /* Get data from the match context, if present, and fill in the remaining
3365 fields in the match block. It is an error to set an offset limit without
3366 setting the flag at compile time. */
3367
3368 if (mcontext == NULL)
3369 {
3370 mb->callout = NULL;
3371 mb->memctl = re->memctl;
3372 mb->match_limit = PRIV(default_match_context).match_limit;
3373 mb->match_limit_depth = PRIV(default_match_context).depth_limit;
3374 mb->heap_limit = PRIV(default_match_context).heap_limit;
3375 }
3376 else
3377 {
3378 if (mcontext->offset_limit != PCRE2_UNSET)
3379 {
3380 if ((re->overall_options & PCRE2_USE_OFFSET_LIMIT) == 0)
3381 return PCRE2_ERROR_BADOFFSETLIMIT;
3382 bumpalong_limit = subject + mcontext->offset_limit;
3383 }
3384 mb->callout = mcontext->callout;
3385 mb->callout_data = mcontext->callout_data;
3386 mb->memctl = mcontext->memctl;
3387 mb->match_limit = mcontext->match_limit;
3388 mb->match_limit_depth = mcontext->depth_limit;
3389 mb->heap_limit = mcontext->heap_limit;
3390 }
3391
3392 if (mb->match_limit > re->limit_match)
3393 mb->match_limit = re->limit_match;
3394
3395 if (mb->match_limit_depth > re->limit_depth)
3396 mb->match_limit_depth = re->limit_depth;
3397
3398 if (mb->heap_limit > re->limit_heap)
3399 mb->heap_limit = re->limit_heap;
3400
3401 mb->start_code = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)) +
3402 re->name_count * re->name_entry_size;
3403 mb->tables = re->tables;
3404 mb->start_subject = subject;
3405 mb->end_subject = end_subject;
3406 mb->start_offset = start_offset;
3407 mb->moptions = options;
3408 mb->poptions = re->overall_options;
3409 mb->match_call_count = 0;
3410 mb->heap_used = 0;
3411
3412 /* Process the \R and newline settings. */
3413
3414 mb->bsr_convention = re->bsr_convention;
3415 mb->nltype = NLTYPE_FIXED;
3416 switch(re->newline_convention)
3417 {
3418 case PCRE2_NEWLINE_CR:
3419 mb->nllen = 1;
3420 mb->nl[0] = CHAR_CR;
3421 break;
3422
3423 case PCRE2_NEWLINE_LF:
3424 mb->nllen = 1;
3425 mb->nl[0] = CHAR_NL;
3426 break;
3427
3428 case PCRE2_NEWLINE_NUL:
3429 mb->nllen = 1;
3430 mb->nl[0] = CHAR_NUL;
3431 break;
3432
3433 case PCRE2_NEWLINE_CRLF:
3434 mb->nllen = 2;
3435 mb->nl[0] = CHAR_CR;
3436 mb->nl[1] = CHAR_NL;
3437 break;
3438
3439 case PCRE2_NEWLINE_ANY:
3440 mb->nltype = NLTYPE_ANY;
3441 break;
3442
3443 case PCRE2_NEWLINE_ANYCRLF:
3444 mb->nltype = NLTYPE_ANYCRLF;
3445 break;
3446
3447 default: return PCRE2_ERROR_INTERNAL;
3448 }
3449
3450 /* Check a UTF string for validity if required. For 8-bit and 16-bit strings,
3451 we must also check that a starting offset does not point into the middle of a
3452 multiunit character. We check only the portion of the subject that is going to
3453 be inspected during matching - from the offset minus the maximum back reference
3454 to the given length. This saves time when a small part of a large subject is
3455 being matched by the use of a starting offset. Note that the maximum lookbehind
3456 is a number of characters, not code units. */
3457
3458 #ifdef SUPPORT_UNICODE
3459 if (utf && (options & PCRE2_NO_UTF_CHECK) == 0)
3460 {
3461 PCRE2_SPTR check_subject = start_match; /* start_match includes offset */
3462
3463 if (start_offset > 0)
3464 {
3465 #if PCRE2_CODE_UNIT_WIDTH != 32
3466 unsigned int i;
3467 if (start_match < end_subject && NOT_FIRSTCU(*start_match))
3468 return PCRE2_ERROR_BADUTFOFFSET;
3469 for (i = re->max_lookbehind; i > 0 && check_subject > subject; i--)
3470 {
3471 check_subject--;
3472 while (check_subject > subject &&
3473 #if PCRE2_CODE_UNIT_WIDTH == 8
3474 (*check_subject & 0xc0) == 0x80)
3475 #else /* 16-bit */
3476 (*check_subject & 0xfc00) == 0xdc00)
3477 #endif /* PCRE2_CODE_UNIT_WIDTH == 8 */
3478 check_subject--;
3479 }
3480 #else /* In the 32-bit library, one code unit equals one character. */
3481 check_subject -= re->max_lookbehind;
3482 if (check_subject < subject) check_subject = subject;
3483 #endif /* PCRE2_CODE_UNIT_WIDTH != 32 */
3484 }
3485
3486 /* Validate the relevant portion of the subject. After an error, adjust the
3487 offset to be an absolute offset in the whole string. */
3488
3489 match_data->rc = PRIV(valid_utf)(check_subject,
3490 length - (PCRE2_SIZE)(check_subject - subject), &(match_data->startchar));
3491 if (match_data->rc != 0)
3492 {
3493 match_data->startchar += (PCRE2_SIZE)(check_subject - subject);
3494 return match_data->rc;
3495 }
3496 }
3497 #endif /* SUPPORT_UNICODE */
3498
3499 /* Set up the first code unit to match, if available. If there's no first code
3500 unit there may be a bitmap of possible first characters. */
3501
3502 if ((re->flags & PCRE2_FIRSTSET) != 0)
3503 {
3504 has_first_cu = TRUE;
3505 first_cu = first_cu2 = (PCRE2_UCHAR)(re->first_codeunit);
3506 if ((re->flags & PCRE2_FIRSTCASELESS) != 0)
3507 {
3508 first_cu2 = TABLE_GET(first_cu, mb->tables + fcc_offset, first_cu);
3509 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
3510 if (utf && first_cu > 127)
3511 first_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(first_cu);
3512 #endif
3513 }
3514 }
3515 else
3516 if (!startline && (re->flags & PCRE2_FIRSTMAPSET) != 0)
3517 start_bits = re->start_bitmap;
3518
3519 /* There may be a "last known required code unit" set. */
3520
3521 if ((re->flags & PCRE2_LASTSET) != 0)
3522 {
3523 has_req_cu = TRUE;
3524 req_cu = req_cu2 = (PCRE2_UCHAR)(re->last_codeunit);
3525 if ((re->flags & PCRE2_LASTCASELESS) != 0)
3526 {
3527 req_cu2 = TABLE_GET(req_cu, mb->tables + fcc_offset, req_cu);
3528 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
3529 if (utf && req_cu > 127) req_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(req_cu);
3530 #endif
3531 }
3532 }
3533
3534 /* If the match data block was previously used with PCRE2_COPY_MATCHED_SUBJECT,
3535 free the memory that was obtained. */
3536
3537 if ((match_data->flags & PCRE2_MD_COPIED_SUBJECT) != 0)
3538 {
3539 match_data->memctl.free((void *)match_data->subject,
3540 match_data->memctl.memory_data);
3541 match_data->flags &= ~PCRE2_MD_COPIED_SUBJECT;
3542 }
3543
3544 /* Fill in fields that are always returned in the match data. */
3545
3546 match_data->code = re;
3547 match_data->subject = NULL; /* Default for no match */
3548 match_data->mark = NULL;
3549 match_data->matchedby = PCRE2_MATCHEDBY_DFA_INTERPRETER;
3550
3551 /* Call the main matching function, looping for a non-anchored regex after a
3552 failed match. If not restarting, perform certain optimizations at the start of
3553 a match. */
3554
3555 for (;;)
3556 {
3557 /* ----------------- Start of match optimizations ---------------- */
3558
3559 /* There are some optimizations that avoid running the match if a known
3560 starting point is not found, or if a known later code unit is not present.
3561 However, there is an option (settable at compile time) that disables
3562 these, for testing and for ensuring that all callouts do actually occur.
3563 The optimizations must also be avoided when restarting a DFA match. */
3564
3565 if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0 &&
3566 (options & PCRE2_DFA_RESTART) == 0)
3567 {
3568 /* If firstline is TRUE, the start of the match is constrained to the first
3569 line of a multiline string. That is, the match must be before or at the
3570 first newline following the start of matching. Temporarily adjust
3571 end_subject so that we stop the optimization scans for a first code unit
3572 immediately after the first character of a newline (the first code unit can
3573 legitimately be a newline). If the match fails at the newline, later code
3574 breaks this loop. */
3575
3576 if (firstline)
3577 {
3578 PCRE2_SPTR t = start_match;
3579 #ifdef SUPPORT_UNICODE
3580 if (utf)
3581 {
3582 while (t < end_subject && !IS_NEWLINE(t))
3583 {
3584 t++;
3585 ACROSSCHAR(t < end_subject, t, t++);
3586 }
3587 }
3588 else
3589 #endif
3590 while (t < end_subject && !IS_NEWLINE(t)) t++;
3591 end_subject = t;
3592 }
3593
3594 /* Anchored: check the first code unit if one is recorded. This may seem
3595 pointless but it can help in detecting a no match case without scanning for
3596 the required code unit. */
3597
3598 if (anchored)
3599 {
3600 if (has_first_cu || start_bits != NULL)
3601 {
3602 BOOL ok = start_match < end_subject;
3603 if (ok)
3604 {
3605 PCRE2_UCHAR c = UCHAR21TEST(start_match);
3606 ok = has_first_cu && (c == first_cu || c == first_cu2);
3607 if (!ok && start_bits != NULL)
3608 {
3609 #if PCRE2_CODE_UNIT_WIDTH != 8
3610 if (c > 255) c = 255;
3611 #endif
3612 ok = (start_bits[c/8] & (1u << (c&7))) != 0;
3613 }
3614 }
3615 if (!ok) break;
3616 }
3617 }
3618
3619 /* Not anchored. Advance to a unique first code unit if there is one. In
3620 8-bit mode, the use of memchr() gives a big speed up, even though we have
3621 to call it twice in caseless mode, in order to find the earliest occurrence
3622 of the character in either of its cases. */
3623
3624 else
3625 {
3626 if (has_first_cu)
3627 {
3628 if (first_cu != first_cu2) /* Caseless */
3629 {
3630 #if PCRE2_CODE_UNIT_WIDTH != 8
3631 PCRE2_UCHAR smc;
3632 while (start_match < end_subject &&
3633 (smc = UCHAR21TEST(start_match)) != first_cu &&
3634 smc != first_cu2)
3635 start_match++;
3636 #else /* 8-bit code units */
3637 PCRE2_SPTR pp1 =
3638 memchr(start_match, first_cu, end_subject-start_match);
3639 PCRE2_SPTR pp2 =
3640 memchr(start_match, first_cu2, end_subject-start_match);
3641 if (pp1 == NULL)
3642 start_match = (pp2 == NULL)? end_subject : pp2;
3643 else
3644 start_match = (pp2 == NULL || pp1 < pp2)? pp1 : pp2;
3645 #endif
3646 }
3647
3648 /* The caseful case */
3649
3650 else
3651 {
3652 #if PCRE2_CODE_UNIT_WIDTH != 8
3653 while (start_match < end_subject && UCHAR21TEST(start_match) !=
3654 first_cu)
3655 start_match++;
3656 #else
3657 start_match = memchr(start_match, first_cu, end_subject - start_match);
3658 if (start_match == NULL) start_match = end_subject;
3659 #endif
3660 }
3661
3662 /* If we can't find the required code unit, having reached the true end
3663 of the subject, break the bumpalong loop, to force a match failure,
3664 except when doing partial matching, when we let the next cycle run at
3665 the end of the subject. To see why, consider the pattern /(?<=abc)def/,
3666 which partially matches "abc", even though the string does not contain
3667 the starting character "d". If we have not reached the true end of the
3668 subject (PCRE2_FIRSTLINE caused end_subject to be temporarily modified)
3669 we also let the cycle run, because the matching string is legitimately
3670 allowed to start with the first code unit of a newline. */
3671
3672 if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0 &&
3673 start_match >= mb->end_subject)
3674 break;
3675 }
3676
3677 /* If there's no first code unit, advance to just after a linebreak for a
3678 multiline match if required. */
3679
3680 else if (startline)
3681 {
3682 if (start_match > mb->start_subject + start_offset)
3683 {
3684 #ifdef SUPPORT_UNICODE
3685 if (utf)
3686 {
3687 while (start_match < end_subject && !WAS_NEWLINE(start_match))
3688 {
3689 start_match++;
3690 ACROSSCHAR(start_match < end_subject, start_match, start_match++);
3691 }
3692 }
3693 else
3694 #endif
3695 while (start_match < end_subject && !WAS_NEWLINE(start_match))
3696 start_match++;
3697
3698 /* If we have just passed a CR and the newline option is ANY or
3699 ANYCRLF, and we are now at a LF, advance the match position by one
3700 more code unit. */
3701
3702 if (start_match[-1] == CHAR_CR &&
3703 (mb->nltype == NLTYPE_ANY || mb->nltype == NLTYPE_ANYCRLF) &&
3704 start_match < end_subject &&
3705 UCHAR21TEST(start_match) == CHAR_NL)
3706 start_match++;
3707 }
3708 }
3709
3710 /* If there's no first code unit or a requirement for a multiline line
3711 start, advance to a non-unique first code unit if any have been
3712 identified. The bitmap contains only 256 bits. When code units are 16 or
3713 32 bits wide, all code units greater than 254 set the 255 bit. */
3714
3715 else if (start_bits != NULL)
3716 {
3717 while (start_match < end_subject)
3718 {
3719 uint32_t c = UCHAR21TEST(start_match);
3720 #if PCRE2_CODE_UNIT_WIDTH != 8
3721 if (c > 255) c = 255;
3722 #endif
3723 if ((start_bits[c/8] & (1u << (c&7))) != 0) break;
3724 start_match++;
3725 }
3726
3727 /* See comment above in first_cu checking about the next line. */
3728
3729 if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0 &&
3730 start_match >= mb->end_subject)
3731 break;
3732 }
3733 } /* End of first code unit handling */
3734
3735 /* Restore fudged end_subject */
3736
3737 end_subject = mb->end_subject;
3738
3739 /* The following two optimizations are disabled for partial matching. */
3740
3741 if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0)
3742 {
3743 /* The minimum matching length is a lower bound; no actual string of that
3744 length may actually match the pattern. Although the value is, strictly,
3745 in characters, we treat it as code units to avoid spending too much time
3746 in this optimization. */
3747
3748 if (end_subject - start_match < re->minlength) goto NOMATCH_EXIT;
3749
3750 /* If req_cu is set, we know that that code unit must appear in the
3751 subject for the match to succeed. If the first code unit is set, req_cu
3752 must be later in the subject; otherwise the test starts at the match
3753 point. This optimization can save a huge amount of backtracking in
3754 patterns with nested unlimited repeats that aren't going to match.
3755 Writing separate code for cased/caseless versions makes it go faster, as
3756 does using an autoincrement and backing off on a match.
3757
3758 HOWEVER: when the subject string is very, very long, searching to its end
3759 can take a long time, and give bad performance on quite ordinary
3760 patterns. This showed up when somebody was matching something like
3761 /^\d+C/ on a 32-megabyte string... so we don't do this when the string is
3762 sufficiently long. */
3763
3764 if (has_req_cu && end_subject - start_match < REQ_CU_MAX)
3765 {
3766 PCRE2_SPTR p = start_match + (has_first_cu? 1:0);
3767
3768 /* We don't need to repeat the search if we haven't yet reached the
3769 place we found it at last time. */
3770
3771 if (p > req_cu_ptr)
3772 {
3773 if (req_cu != req_cu2)
3774 {
3775 while (p < end_subject)
3776 {
3777 uint32_t pp = UCHAR21INCTEST(p);
3778 if (pp == req_cu || pp == req_cu2) { p--; break; }
3779 }
3780 }
3781 else
3782 {
3783 while (p < end_subject)
3784 {
3785 if (UCHAR21INCTEST(p) == req_cu) { p--; break; }
3786 }
3787 }
3788
3789 /* If we can't find the required code unit, break the matching loop,
3790 forcing a match failure. */
3791
3792 if (p >= end_subject) break;
3793
3794 /* If we have found the required code unit, save the point where we
3795 found it, so that we don't search again next time round the loop if
3796 the start hasn't passed this code unit yet. */
3797
3798 req_cu_ptr = p;
3799 }
3800 }
3801 }
3802 }
3803
3804 /* ------------ End of start of match optimizations ------------ */
3805
3806 /* Give no match if we have passed the bumpalong limit. */
3807
3808 if (start_match > bumpalong_limit) break;
3809
3810 /* OK, now we can do the business */
3811
3812 mb->start_used_ptr = start_match;
3813 mb->last_used_ptr = start_match;
3814 mb->recursive = NULL;
3815
3816 rc = internal_dfa_match(
3817 mb, /* fixed match data */
3818 mb->start_code, /* this subexpression's code */
3819 start_match, /* where we currently are */
3820 start_offset, /* start offset in subject */
3821 match_data->ovector, /* offset vector */
3822 (uint32_t)match_data->oveccount * 2, /* actual size of same */
3823 workspace, /* workspace vector */
3824 (int)wscount, /* size of same */
3825 0, /* function recurse level */
3826 base_recursion_workspace); /* initial workspace for recursion */
3827
3828 /* Anything other than "no match" means we are done, always; otherwise, carry
3829 on only if not anchored. */
3830
3831 if (rc != PCRE2_ERROR_NOMATCH || anchored)
3832 {
3833 if (rc == PCRE2_ERROR_PARTIAL && match_data->oveccount > 0)
3834 {
3835 match_data->ovector[0] = (PCRE2_SIZE)(start_match - subject);
3836 match_data->ovector[1] = (PCRE2_SIZE)(end_subject - subject);
3837 }
3838 match_data->leftchar = (PCRE2_SIZE)(mb->start_used_ptr - subject);
3839 match_data->rightchar = (PCRE2_SIZE)( mb->last_used_ptr - subject);
3840 match_data->startchar = (PCRE2_SIZE)(start_match - subject);
3841 match_data->rc = rc;
3842
3843 if (rc >= 0 &&(options & PCRE2_COPY_MATCHED_SUBJECT) != 0)
3844 {
3845 length = CU2BYTES(length + was_zero_terminated);
3846 match_data->subject = match_data->memctl.malloc(length,
3847 match_data->memctl.memory_data);
3848 if (match_data->subject == NULL) return PCRE2_ERROR_NOMEMORY;
3849 memcpy((void *)match_data->subject, subject, length);
3850 match_data->flags |= PCRE2_MD_COPIED_SUBJECT;
3851 }
3852 else
3853 {
3854 if (rc >= 0 || rc == PCRE2_ERROR_PARTIAL) match_data->subject = subject;
3855 }
3856 goto EXIT;
3857 }
3858
3859 /* Advance to the next subject character unless we are at the end of a line
3860 and firstline is set. */
3861
3862 if (firstline && IS_NEWLINE(start_match)) break;
3863 start_match++;
3864 #ifdef SUPPORT_UNICODE
3865 if (utf)
3866 {
3867 ACROSSCHAR(start_match < end_subject, start_match, start_match++);
3868 }
3869 #endif
3870 if (start_match > end_subject) break;
3871
3872 /* If we have just passed a CR and we are now at a LF, and the pattern does
3873 not contain any explicit matches for \r or \n, and the newline option is CRLF
3874 or ANY or ANYCRLF, advance the match position by one more character. */
3875
3876 if (UCHAR21TEST(start_match - 1) == CHAR_CR &&
3877 start_match < end_subject &&
3878 UCHAR21TEST(start_match) == CHAR_NL &&
3879 (re->flags & PCRE2_HASCRORLF) == 0 &&
3880 (mb->nltype == NLTYPE_ANY ||
3881 mb->nltype == NLTYPE_ANYCRLF ||
3882 mb->nllen == 2))
3883 start_match++;
3884
3885 } /* "Bumpalong" loop */
3886
3887 NOMATCH_EXIT:
3888 rc = PCRE2_ERROR_NOMATCH;
3889
3890 EXIT:
3891 while (rws->next != NULL)
3892 {
3893 RWS_anchor *next = rws->next;
3894 rws->next = next->next;
3895 mb->memctl.free(next, mb->memctl.memory_data);
3896 }
3897
3898 return rc;
3899 }
3900
3901 /* End of pcre2_dfa_match.c */
3902