• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*************************************************
2 *      Perl-Compatible Regular Expressions       *
3 *************************************************/
4 
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7 
8                        Written by Philip Hazel
9      Original API code Copyright (c) 1997-2012 University of Cambridge
10          New API code Copyright (c) 2016 University of Cambridge
11 
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
15 
16     * Redistributions of source code must retain the above copyright notice,
17       this list of conditions and the following disclaimer.
18 
19     * Redistributions in binary form must reproduce the above copyright
20       notice, this list of conditions and the following disclaimer in the
21       documentation and/or other materials provided with the distribution.
22 
23     * Neither the name of the University of Cambridge nor the names of its
24       contributors may be used to endorse or promote products derived from
25       this software without specific prior written permission.
26 
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
39 */
40 
41 
42 /* This module contains the external function pcre2_dfa_match(), which is an
43 alternative matching function that uses a sort of DFA algorithm (not a true
44 FSM). This is NOT Perl-compatible, but it has advantages in certain
45 applications. */
46 
47 
48 /* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
49 the performance of his patterns greatly. I could not use it as it stood, as it
50 was not thread safe, and made assumptions about pattern sizes. Also, it caused
51 test 7 to loop, and test 9 to crash with a segfault.
52 
53 The issue is the check for duplicate states, which is done by a simple linear
54 search up the state list. (Grep for "duplicate" below to find the code.) For
55 many patterns, there will never be many states active at one time, so a simple
56 linear search is fine. In patterns that have many active states, it might be a
57 bottleneck. The suggested code used an indexing scheme to remember which states
58 had previously been used for each character, and avoided the linear search when
59 it knew there was no chance of a duplicate. This was implemented when adding
60 states to the state lists.
61 
62 I wrote some thread-safe, not-limited code to try something similar at the time
63 of checking for duplicates (instead of when adding states), using index vectors
64 on the stack. It did give a 13% improvement with one specially constructed
65 pattern for certain subject strings, but on other strings and on many of the
66 simpler patterns in the test suite it did worse. The major problem, I think,
67 was the extra time to initialize the index. This had to be done for each call
68 of internal_dfa_match(). (The supplied patch used a static vector, initialized
69 only once - I suspect this was the cause of the problems with the tests.)
70 
71 Overall, I concluded that the gains in some cases did not outweigh the losses
72 in others, so I abandoned this code. */
73 
74 
75 #ifdef HAVE_CONFIG_H
76 #include "config.h"
77 #endif
78 
/* Short aliases that must be in place before pcre2_internal.h is included just
below. NOTE(review): presumably that header's newline and subject-boundary
macros expand NLBLOCK/PSSTART/PSEND against the local match block "mb" and its
start_subject/end_subject fields - confirm against pcre2_internal.h. */
79 #define NLBLOCK mb             /* Block containing newline information */
80 #define PSSTART start_subject  /* Field containing processed string start */
81 #define PSEND   end_subject    /* Field containing processed string end */
82 
83 #include "pcre2_internal.h"
84 
/* Bit mask built by OR-ing together the PCRE2_xxx option bits listed here.
NOTE(review): the name suggests this is the set of option bits a caller may
pass to pcre2_dfa_match(); the code that tests the mask is outside this part of
the file - confirm there before adding or removing a bit. */
85 #define PUBLIC_DFA_MATCH_OPTIONS \
86   (PCRE2_ANCHORED|PCRE2_NOTBOL|PCRE2_NOTEOL|PCRE2_NOTEMPTY| \
87    PCRE2_NOTEMPTY_ATSTART|PCRE2_NO_UTF_CHECK|PCRE2_PARTIAL_HARD| \
88    PCRE2_PARTIAL_SOFT|PCRE2_DFA_SHORTEST|PCRE2_DFA_RESTART)
89 
90 
91 /*************************************************
92 *      Code parameters and static tables         *
93 *************************************************/
94 
95 /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
96 into others, under special conditions. A gap of 20 between the blocks should be
97 enough. The resulting opcodes don't have to be less than 256 because they are
98 never stored, so we push them well clear of the normal opcodes. */
99 
/* Each offset is added to codevalue in internal_dfa_match() when the opcode is
OP_TYPESTAR or later and its inline argument is one of the opcodes named on the
same line below. */
100 #define OP_PROP_EXTRA       300   /* Argument is OP_PROP or OP_NOTPROP */
101 #define OP_EXTUNI_EXTRA     320   /* Argument is OP_EXTUNI */
102 #define OP_ANYNL_EXTRA      340   /* Argument is OP_ANYNL */
103 #define OP_HSPACE_EXTRA     360   /* Argument is OP_HSPACE or OP_NOT_HSPACE */
104 #define OP_VSPACE_EXTRA     380   /* Argument is OP_VSPACE or OP_NOT_VSPACE */
105 
106 
107 /* This table identifies those opcodes that are followed immediately by a
108 character that is to be tested in some way. This makes it possible to
109 centralize the loading of these characters. In the case of Type * etc, the
110 "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
111 small value. Non-zero values in the table are the offsets from the opcode where
112 the character is to be found. ***NOTE*** If the start of this table is
113 modified, the three tables that follow must also be modified. */
114 
/* Indexed by opcode. A zero entry means that no inline character follows the
opcode; a non-zero entry is the index, relative to the opcode, at which
internal_dfa_match() loads that character (see its "coptable[codevalue] > 0"
test). Entries are 1 when the character comes straight after the opcode and
1+IMM2_SIZE for the upto/minupto/exact forms - presumably to step over an
inline repeat count (confirm against the pattern compiler). The element type is
uint8_t, so sizeof(coptable) is the number of entries; the OP_TABLE_LENGTH case
labels in internal_dfa_match() compare the two to catch a table of the wrong
length at compile time. */
115 static const uint8_t coptable[] = {
116   0,                             /* End                                    */
117   0, 0, 0, 0, 0,                 /* \A, \G, \K, \B, \b                     */
118   0, 0, 0, 0, 0, 0,              /* \D, \d, \S, \s, \W, \w                 */
119   0, 0, 0,                       /* Any, AllAny, Anybyte                   */
120   0, 0,                          /* \P, \p                                 */
121   0, 0, 0, 0, 0,                 /* \R, \H, \h, \V, \v                     */
122   0,                             /* \X                                     */
123   0, 0, 0, 0, 0, 0,              /* \Z, \z, $, $M, ^, ^M                   */
124   1,                             /* Char                                   */
125   1,                             /* Chari                                  */
126   1,                             /* not                                    */
127   1,                             /* noti                                   */
128   /* Positive single-char repeats                                          */
129   1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
130   1+IMM2_SIZE, 1+IMM2_SIZE,      /* upto, minupto                          */
131   1+IMM2_SIZE,                   /* exact                                  */
132   1, 1, 1, 1+IMM2_SIZE,          /* *+, ++, ?+, upto+                      */
133   1, 1, 1, 1, 1, 1,              /* *I, *?I, +I, +?I, ?I, ??I              */
134   1+IMM2_SIZE, 1+IMM2_SIZE,      /* upto I, minupto I                      */
135   1+IMM2_SIZE,                   /* exact I                                */
136   1, 1, 1, 1+IMM2_SIZE,          /* *+I, ++I, ?+I, upto+I                  */
137   /* Negative single-char repeats - only for chars < 256                   */
138   1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
139   1+IMM2_SIZE, 1+IMM2_SIZE,      /* NOT upto, minupto                      */
140   1+IMM2_SIZE,                   /* NOT exact                              */
141   1, 1, 1, 1+IMM2_SIZE,          /* NOT *+, ++, ?+, upto+                  */
142   1, 1, 1, 1, 1, 1,              /* NOT *I, *?I, +I, +?I, ?I, ??I          */
143   1+IMM2_SIZE, 1+IMM2_SIZE,      /* NOT upto I, minupto I                  */
144   1+IMM2_SIZE,                   /* NOT exact I                            */
145   1, 1, 1, 1+IMM2_SIZE,          /* NOT *+I, ++I, ?+I, upto+I              */
146   /* Positive type repeats                                                 */
147   1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
148   1+IMM2_SIZE, 1+IMM2_SIZE,      /* Type upto, minupto                     */
149   1+IMM2_SIZE,                   /* Type exact                             */
150   1, 1, 1, 1+IMM2_SIZE,          /* Type *+, ++, ?+, upto+                 */
151   /* Character class & ref repeats                                         */
152   0, 0, 0, 0, 0, 0,              /* *, *?, +, +?, ?, ??                    */
153   0, 0,                          /* CRRANGE, CRMINRANGE                    */
154   0, 0, 0, 0,                    /* Possessive *+, ++, ?+, CRPOSRANGE      */
155   0,                             /* CLASS                                  */
156   0,                             /* NCLASS                                 */
157   0,                             /* XCLASS - variable length               */
158   0,                             /* REF                                    */
159   0,                             /* REFI                                   */
160   0,                             /* DNREF                                  */
161   0,                             /* DNREFI                                 */
162   0,                             /* RECURSE                                */
163   0,                             /* CALLOUT                                */
164   0,                             /* CALLOUT_STR                            */
165   0,                             /* Alt                                    */
166   0,                             /* Ket                                    */
167   0,                             /* KetRmax                                */
168   0,                             /* KetRmin                                */
169   0,                             /* KetRpos                                */
170   0,                             /* Reverse                                */
171   0,                             /* Assert                                 */
172   0,                             /* Assert not                             */
173   0,                             /* Assert behind                          */
174   0,                             /* Assert behind not                      */
175   0, 0,                          /* ONCE, ONCE_NC                          */
176   0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND       */
177   0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND  */
178   0, 0,                          /* CREF, DNCREF                           */
179   0, 0,                          /* RREF, DNRREF                           */
180   0, 0,                          /* FALSE, TRUE                            */
181   0, 0, 0,                       /* BRAZERO, BRAMINZERO, BRAPOSZERO        */
182   0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG                 */
183   0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG         */
184   0, 0, 0, 0,                    /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT    */
185   0, 0, 0                        /* CLOSE, SKIPZERO, DEFINE                */
186 };
187 
188 /* This table identifies those opcodes that inspect a character. It is used to
189 remember the fact that a character could have been inspected when the end of
190 the subject is reached. ***NOTE*** If the start of this table is modified, the
191 two tables that follow must also be modified. */
192 
/* Indexed by opcode; a non-zero entry marks an opcode that inspects a subject
character. internal_dfa_match() consults the table only when clen is 0 (the end
of the subject has been reached), setting could_continue for such an opcode.
The element type is uint8_t, so sizeof(poptable) is the number of entries; the
OP_TABLE_LENGTH case labels in internal_dfa_match() rely on that to check the
table length at compile time. */
193 static const uint8_t poptable[] = {
194   0,                             /* End                                    */
195   0, 0, 0, 1, 1,                 /* \A, \G, \K, \B, \b                     */
196   1, 1, 1, 1, 1, 1,              /* \D, \d, \S, \s, \W, \w                 */
197   1, 1, 1,                       /* Any, AllAny, Anybyte                   */
198   1, 1,                          /* \P, \p                                 */
199   1, 1, 1, 1, 1,                 /* \R, \H, \h, \V, \v                     */
200   1,                             /* \X                                     */
201   0, 0, 0, 0, 0, 0,              /* \Z, \z, $, $M, ^, ^M                   */
202   1,                             /* Char                                   */
203   1,                             /* Chari                                  */
204   1,                             /* not                                    */
205   1,                             /* noti                                   */
206   /* Positive single-char repeats                                          */
207   1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
208   1, 1, 1,                       /* upto, minupto, exact                   */
209   1, 1, 1, 1,                    /* *+, ++, ?+, upto+                      */
210   1, 1, 1, 1, 1, 1,              /* *I, *?I, +I, +?I, ?I, ??I              */
211   1, 1, 1,                       /* upto I, minupto I, exact I             */
212   1, 1, 1, 1,                    /* *+I, ++I, ?+I, upto+I                  */
213   /* Negative single-char repeats - only for chars < 256                   */
214   1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
215   1, 1, 1,                       /* NOT upto, minupto, exact               */
216   1, 1, 1, 1,                    /* NOT *+, ++, ?+, upto+                  */
217   1, 1, 1, 1, 1, 1,              /* NOT *I, *?I, +I, +?I, ?I, ??I          */
218   1, 1, 1,                       /* NOT upto I, minupto I, exact I         */
219   1, 1, 1, 1,                    /* NOT *+I, ++I, ?+I, upto+I              */
220   /* Positive type repeats                                                 */
221   1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
222   1, 1, 1,                       /* Type upto, minupto, exact              */
223   1, 1, 1, 1,                    /* Type *+, ++, ?+, upto+                 */
224   /* Character class & ref repeats                                         */
225   1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
226   1, 1,                          /* CRRANGE, CRMINRANGE                    */
227   1, 1, 1, 1,                    /* Possessive *+, ++, ?+, CRPOSRANGE      */
228   1,                             /* CLASS                                  */
229   1,                             /* NCLASS                                 */
230   1,                             /* XCLASS - variable length               */
231   0,                             /* REF                                    */
232   0,                             /* REFI                                   */
233   0,                             /* DNREF                                  */
234   0,                             /* DNREFI                                 */
235   0,                             /* RECURSE                                */
236   0,                             /* CALLOUT                                */
237   0,                             /* CALLOUT_STR                            */
238   0,                             /* Alt                                    */
239   0,                             /* Ket                                    */
240   0,                             /* KetRmax                                */
241   0,                             /* KetRmin                                */
242   0,                             /* KetRpos                                */
243   0,                             /* Reverse                                */
244   0,                             /* Assert                                 */
245   0,                             /* Assert not                             */
246   0,                             /* Assert behind                          */
247   0,                             /* Assert behind not                      */
248   0, 0,                          /* ONCE, ONCE_NC                          */
249   0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND       */
250   0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND  */
251   0, 0,                          /* CREF, DNCREF                           */
252   0, 0,                          /* RREF, DNRREF                           */
253   0, 0,                          /* FALSE, TRUE                            */
254   0, 0, 0,                       /* BRAZERO, BRAMINZERO, BRAPOSZERO        */
255   0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG                 */
256   0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG         */
257   0, 0, 0, 0,                    /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT    */
258   0, 0, 0                        /* CLOSE, SKIPZERO, DEFINE                */
259 };
260 
261 /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
262 and \w */
263 
/* Character-type bit for each opcode from End up to OP_ALLANY. The row labels
follow the opcode order documented in coptable. Both members of each \D/\d,
\S/\s and \W/\w pair carry the same ctype bit. NOTE(review): presumably used as
a mask on a ctypes[] entry, with toptable2 supplying the inversion for the
negated types; the expression that combines them is outside this part of the
file - confirm there. */
264 static const uint8_t toptable1[] = {
265   0, 0, 0, 0, 0, 0,               /* End, \A, \G, \K, \B, \b */
266   ctype_digit, ctype_digit,       /* \D, \d */
267   ctype_space, ctype_space,       /* \S, \s */
268   ctype_word,  ctype_word,        /* \W, \w */
269   0, 0                            /* OP_ANY, OP_ALLANY */
270 };
271 
/* Companion to toptable1, covering the same opcodes in the same order (the row
labels follow the opcode order documented in coptable). The negated types \D,
\S and \W carry their ctype bit, the positive types \d, \s and \w carry 0, and
OP_ANY/OP_ALLANY carry 1. NOTE(review): presumably XORed with the masked
ctypes[] value so that a non-zero result means "matches"; the expression that
uses the table is outside this part of the file - confirm there. */
272 static const uint8_t toptable2[] = {
273   0, 0, 0, 0, 0, 0,               /* End, \A, \G, \K, \B, \b */
274   ctype_digit, 0,                 /* \D, \d */
275   ctype_space, 0,                 /* \S, \s */
276   ctype_word,  0,                 /* \W, \w */
277   1, 1                            /* OP_ANY, OP_ALLANY */
278 };
279 
280 
281 /* Structure for holding data about a particular state, which is in effect the
282 current data for an active path through the match tree. It must consist
283 entirely of ints because the working vector we are passed, and which we put
284 these structures in, is a vector of ints. */
285 
/* One entry in the active or new state list of internal_dfa_match().

  offset   Index of the state's opcode relative to mb->start_code. A negated
           index marks a held-off state: while "data" is greater than zero the
           state is copied to the new list with data reduced by one, and only
           then is the offset made positive and the opcode processed.
  count    Repeat count. Two states are treated as duplicates when both their
           offset and their count are equal.
  data     Extra value used by some states; for a held-off state it is the
           number of characters still to be skipped. It is written only by the
           ADD_ACTIVE_DATA and ADD_NEW_DATA macros. */
286 typedef struct stateblock {
287   int offset;                     /* Offset to opcode (-ve has meaning) */
288   int count;                      /* Count for repeats */
289   int data;                       /* Some use extra data */
290 } stateblock;
291 
/* Number of ints occupied by one stateblock. internal_dfa_match() uses it to
convert the workspace size, given in ints, into the number of states that fit
in each of its two state lists. */
292 #define INTS_PER_STATEBLOCK  (int)(sizeof(stateblock)/sizeof(int))
293 
294 
295 
296 /*************************************************
297 *     Match a Regular Expression - DFA engine    *
298 *************************************************/
299 
300 /* This internal function applies a compiled pattern to a subject string,
301 starting at a given point, using a DFA engine. This function is called from the
302 external one, possibly multiple times if the pattern is not anchored. The
303 function calls itself recursively for some kinds of subpattern.
304 
305 Arguments:
306   mb                the match_data block with fixed information
307   this_start_code   the opening bracket of this subexpression's code
308   current_subject   where we currently are in the subject string
309   start_offset      start offset in the subject string
310   offsets           vector to contain the matching string offsets
311   offsetcount       size of same
312   workspace         vector of workspace
313   wscount           size of same
314   rlevel            function call recursion level
315 
316 Returns:            > 0 => number of match offset pairs placed in offsets
317                     = 0 => offsets overflowed; longest matches are present
318                      -1 => failed to match
319                    < -1 => some kind of unexpected problem
320 
321 The following macros are used for adding states to the two state vectors (one
322 for the current character, one for the following character). */
323 
/* Append a state with opcode offset x and repeat count y to the active list.
The macro uses the enclosing function's locals active_count, wscount and
next_active_state. If the list already holds wscount entries it makes the
enclosing function return PCRE2_ERROR_DFA_WSSIZE; active_count is incremented
in either case. The data field of the new entry is not written. The expansion
is an if/else with no terminating semicolon, so write it as a complete
statement: ADD_ACTIVE(x, y); */
324 #define ADD_ACTIVE(x,y) \
325   if (active_count++ < wscount) \
326     { \
327     next_active_state->offset = (x); \
328     next_active_state->count  = (y); \
329     next_active_state++; \
330     } \
331   else return PCRE2_ERROR_DFA_WSSIZE
332 
/* Append a state with opcode offset x, repeat count y and data value z to the
active list. The macro uses the enclosing function's locals active_count,
wscount and next_active_state. If the list already holds wscount entries it
makes the enclosing function return PCRE2_ERROR_DFA_WSSIZE; active_count is
incremented in either case. The expansion is an if/else with no terminating
semicolon, so write it as a complete statement: ADD_ACTIVE_DATA(x, y, z); */
333 #define ADD_ACTIVE_DATA(x,y,z) \
334   if (active_count++ < wscount) \
335     { \
336     next_active_state->offset = (x); \
337     next_active_state->count  = (y); \
338     next_active_state->data   = (z); \
339     next_active_state++; \
340     } \
341   else return PCRE2_ERROR_DFA_WSSIZE
342 
/* Append a state with opcode offset x and repeat count y to the new list (the
states for the following character). The macro uses the enclosing function's
locals new_count, wscount and next_new_state. If the list already holds wscount
entries it makes the enclosing function return PCRE2_ERROR_DFA_WSSIZE;
new_count is incremented in either case. The data field of the new entry is not
written. The expansion is an if/else with no terminating semicolon, so write it
as a complete statement: ADD_NEW(x, y); */
343 #define ADD_NEW(x,y) \
344   if (new_count++ < wscount) \
345     { \
346     next_new_state->offset = (x); \
347     next_new_state->count  = (y); \
348     next_new_state++; \
349     } \
350   else return PCRE2_ERROR_DFA_WSSIZE
351 
/* Append a state with opcode offset x, repeat count y and data value z to the
new list (the states for the following character). The macro uses the enclosing
function's locals new_count, wscount and next_new_state. If the list already
holds wscount entries it makes the enclosing function return
PCRE2_ERROR_DFA_WSSIZE; new_count is incremented in either case. The expansion
is an if/else with no terminating semicolon, so write it as a complete
statement: ADD_NEW_DATA(x, y, z); */
352 #define ADD_NEW_DATA(x,y,z) \
353   if (new_count++ < wscount) \
354     { \
355     next_new_state->offset = (x); \
356     next_new_state->count  = (y); \
357     next_new_state->data   = (z); \
358     next_new_state++; \
359     } \
360   else return PCRE2_ERROR_DFA_WSSIZE
361 
362 /* And now, here is the code */
363 
364 static int
365 internal_dfa_match(
366   dfa_match_block *mb,
367   PCRE2_SPTR this_start_code,
368   PCRE2_SPTR current_subject,
369   PCRE2_SIZE start_offset,
370   PCRE2_SIZE *offsets,
371   uint32_t offsetcount,
372   int *workspace,
373   int wscount,
374   int  rlevel)
375 {
376 stateblock *active_states, *new_states, *temp_states;
377 stateblock *next_active_state, *next_new_state;
378 
379 const uint8_t *ctypes, *lcc, *fcc;
380 PCRE2_SPTR ptr;
381 PCRE2_SPTR end_code;
382 PCRE2_SPTR first_op;
383 
384 dfa_recursion_info new_recursive;
385 
386 int active_count, new_count, match_count;
387 
388 /* Some fields in the mb block are frequently referenced, so we load them into
389 independent variables in the hope that this will perform better. */
390 
391 PCRE2_SPTR start_subject = mb->start_subject;
392 PCRE2_SPTR end_subject = mb->end_subject;
393 PCRE2_SPTR start_code = mb->start_code;
394 
395 #ifdef SUPPORT_UNICODE
396 BOOL utf = (mb->poptions & PCRE2_UTF) != 0;
397 #else
398 BOOL utf = FALSE;
399 #endif
400 
401 BOOL reset_could_continue = FALSE;
402 
403 rlevel++;
404 offsetcount &= (uint32_t)(-2);  /* Round down */
405 
406 wscount -= 2;
407 wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
408           (2 * INTS_PER_STATEBLOCK);
409 
410 ctypes = mb->tables + ctypes_offset;
411 lcc = mb->tables + lcc_offset;
412 fcc = mb->tables + fcc_offset;
413 
414 match_count = PCRE2_ERROR_NOMATCH;   /* A negative number */
415 
416 active_states = (stateblock *)(workspace + 2);
417 next_new_state = new_states = active_states + wscount;
418 new_count = 0;
419 
420 first_op = this_start_code + 1 + LINK_SIZE +
421   ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
422     *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
423     ? IMM2_SIZE:0);
424 
425 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
426 the alternative states onto the list, and find out where the end is. This
427 makes it possible to use this function recursively, when we want to stop at a
428 matching internal ket rather than at the end.
429 
430 If the first opcode in the first alternative is OP_REVERSE, we are dealing with
431 a backward assertion. In that case, we have to find out the maximum amount to
432 move back, and set up each alternative appropriately. */
433 
434 if (*first_op == OP_REVERSE)
435   {
436   size_t max_back = 0;
437   size_t gone_back;
438 
439   end_code = this_start_code;
440   do
441     {
442     size_t back = (size_t)GET(end_code, 2+LINK_SIZE);
443     if (back > max_back) max_back = back;
444     end_code += GET(end_code, 1);
445     }
446   while (*end_code == OP_ALT);
447 
448   /* If we can't go back the amount required for the longest lookbehind
449   pattern, go back as far as we can; some alternatives may still be viable. */
450 
451 #ifdef SUPPORT_UNICODE
452   /* In character mode we have to step back character by character */
453 
454   if (utf)
455     {
456     for (gone_back = 0; gone_back < max_back; gone_back++)
457       {
458       if (current_subject <= start_subject) break;
459       current_subject--;
460       ACROSSCHAR(current_subject > start_subject, *current_subject, current_subject--);
461       }
462     }
463   else
464 #endif
465 
466   /* In byte-mode we can do this quickly. */
467 
468     {
469     size_t current_offset = (size_t)(current_subject - start_subject);
470     gone_back = (current_offset < max_back)? current_offset : max_back;
471     current_subject -= gone_back;
472     }
473 
474   /* Save the earliest consulted character */
475 
476   if (current_subject < mb->start_used_ptr)
477     mb->start_used_ptr = current_subject;
478 
479   /* Now we can process the individual branches. */
480 
481   end_code = this_start_code;
482   do
483     {
484     size_t back = (size_t)GET(end_code, 2+LINK_SIZE);
485     if (back <= gone_back)
486       {
487       int bstate = (int)(end_code - start_code + 2 + 2*LINK_SIZE);
488       ADD_NEW_DATA(-bstate, 0, (int)(gone_back - back));
489       }
490     end_code += GET(end_code, 1);
491     }
492   while (*end_code == OP_ALT);
493  }
494 
495 /* This is the code for a "normal" subpattern (not a backward assertion). The
496 start of a whole pattern is always one of these. If we are at the top level,
497 we may be asked to restart matching from the same point that we reached for a
498 previous partial match. We still have to scan through the top-level branches to
499 find the end state. */
500 
501 else
502   {
503   end_code = this_start_code;
504 
505   /* Restarting */
506 
507   if (rlevel == 1 && (mb->moptions & PCRE2_DFA_RESTART) != 0)
508     {
509     do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
510     new_count = workspace[1];
511     if (!workspace[0])
512       memcpy(new_states, active_states, (size_t)new_count * sizeof(stateblock));
513     }
514 
515   /* Not restarting */
516 
517   else
518     {
519     int length = 1 + LINK_SIZE +
520       ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
521         *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
522         ? IMM2_SIZE:0);
523     do
524       {
525       ADD_NEW((int)(end_code - start_code + length), 0);
526       end_code += GET(end_code, 1);
527       length = 1 + LINK_SIZE;
528       }
529     while (*end_code == OP_ALT);
530     }
531   }
532 
533 workspace[0] = 0;    /* Bit indicating which vector is current */
534 
535 /* Loop for scanning the subject */
536 
537 ptr = current_subject;
538 for (;;)
539   {
540   int i, j;
541   int clen, dlen;
542   uint32_t c, d;
543   int forced_fail = 0;
544   BOOL partial_newline = FALSE;
545   BOOL could_continue = reset_could_continue;
546   reset_could_continue = FALSE;
547 
548   if (ptr > mb->last_used_ptr) mb->last_used_ptr = ptr;
549 
550   /* Make the new state list into the active state list and empty the
551   new state list. */
552 
553   temp_states = active_states;
554   active_states = new_states;
555   new_states = temp_states;
556   active_count = new_count;
557   new_count = 0;
558 
559   workspace[0] ^= 1;              /* Remember for the restarting feature */
560   workspace[1] = active_count;
561 
562   /* Set the pointers for adding new states */
563 
564   next_active_state = active_states + active_count;
565   next_new_state = new_states;
566 
567   /* Load the current character from the subject outside the loop, as many
568   different states may want to look at it, and we assume that at least one
569   will. */
570 
571   if (ptr < end_subject)
572     {
573     clen = 1;        /* Number of data items in the character */
574 #ifdef SUPPORT_UNICODE
575     GETCHARLENTEST(c, ptr, clen);
576 #else
577     c = *ptr;
578 #endif  /* SUPPORT_UNICODE */
579     }
580   else
581     {
582     clen = 0;        /* This indicates the end of the subject */
583     c = NOTACHAR;    /* This value should never actually be used */
584     }
585 
586   /* Scan up the active states and act on each one. The result of an action
587   may be to add more states to the currently active list (e.g. on hitting a
588   parenthesis) or it may be to put states on the new list, for considering
589   when we move the character pointer on. */
590 
591   for (i = 0; i < active_count; i++)
592     {
593     stateblock *current_state = active_states + i;
594     BOOL caseless = FALSE;
595     PCRE2_SPTR code;
596     uint32_t codevalue;
597     int state_offset = current_state->offset;
598     int rrc;
599     int count;
600 
601     /* A negative offset is a special case meaning "hold off going to this
602     (negated) state until the number of characters in the data field have
603     been skipped". If the could_continue flag was passed over from a previous
604     state, arrange for it to be passed on. */
605 
606     if (state_offset < 0)
607       {
608       if (current_state->data > 0)
609         {
610         ADD_NEW_DATA(state_offset, current_state->count,
611           current_state->data - 1);
612         if (could_continue) reset_could_continue = TRUE;
613         continue;
614         }
615       else
616         {
617         current_state->offset = state_offset = -state_offset;
618         }
619       }
620 
621     /* Check for a duplicate state with the same count, and skip if found.
622     See the note at the head of this module about the possibility of improving
623     performance here. */
624 
625     for (j = 0; j < i; j++)
626       {
627       if (active_states[j].offset == state_offset &&
628           active_states[j].count == current_state->count)
629         goto NEXT_ACTIVE_STATE;
630       }
631 
632     /* The state offset is the offset to the opcode */
633 
634     code = start_code + state_offset;
635     codevalue = *code;
636 
637     /* If this opcode inspects a character, but we are at the end of the
638     subject, remember the fact for use when testing for a partial match. */
639 
640     if (clen == 0 && poptable[codevalue] != 0)
641       could_continue = TRUE;
642 
643     /* If this opcode is followed by an inline character, load it. It is
644     tempting to test for the presence of a subject character here, but that
645     is wrong, because sometimes zero repetitions of the subject are
646     permitted.
647 
648     We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
649     argument that is not a data character - but is always one byte long because
650     the values are small. We have to take special action to deal with  \P, \p,
651     \H, \h, \V, \v and \X in this case. To keep the other cases fast, convert
652     these ones to new opcodes. */
653 
654     if (coptable[codevalue] > 0)
655       {
656       dlen = 1;
657 #ifdef SUPPORT_UNICODE
658       if (utf) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
659 #endif  /* SUPPORT_UNICODE */
660       d = code[coptable[codevalue]];
661       if (codevalue >= OP_TYPESTAR)
662         {
663         switch(d)
664           {
665           case OP_ANYBYTE: return PCRE2_ERROR_DFA_UITEM;
666           case OP_NOTPROP:
667           case OP_PROP: codevalue += OP_PROP_EXTRA; break;
668           case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
669           case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
670           case OP_NOT_HSPACE:
671           case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
672           case OP_NOT_VSPACE:
673           case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
674           default: break;
675           }
676         }
677       }
678     else
679       {
680       dlen = 0;         /* Not strictly necessary, but compilers moan */
681       d = NOTACHAR;     /* if these variables are not set. */
682       }
683 
684 
685     /* Now process the individual opcodes */
686 
687     switch (codevalue)
688       {
689 /* ========================================================================== */
690       /* These cases are never obeyed. This is a fudge that causes a compile-
691       time error if the vectors coptable or poptable, which are indexed by
692       opcode, are not the correct length. It seems to be the only way to do
693       such a check at compile time, as the sizeof() operator does not work
694       in the C preprocessor. */
695 
696       case OP_TABLE_LENGTH:
697       case OP_TABLE_LENGTH +
698         ((sizeof(coptable) == OP_TABLE_LENGTH) &&
699          (sizeof(poptable) == OP_TABLE_LENGTH)):
700       break;
701 
702 /* ========================================================================== */
703       /* Reached a closing bracket. If not at the end of the pattern, carry
704       on with the next opcode. For repeating opcodes, also add the repeat
705       state. Note that KETRPOS will always be encountered at the end of the
706       subpattern, because the possessive subpattern repeats are always handled
707       using recursive calls. Thus, it never adds any new states.
708 
709       At the end of the (sub)pattern, unless we have an empty string and
710       PCRE2_NOTEMPTY is set, or PCRE2_NOTEMPTY_ATSTART is set and we are at the
711       start of the subject, save the match data, shifting up all previous
712       matches so we always have the longest first. */
713 
714       case OP_KET:
715       case OP_KETRMIN:
716       case OP_KETRMAX:
717       case OP_KETRPOS:
718       if (code != end_code)
719         {
720         ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
721         if (codevalue != OP_KET)
722           {
723           ADD_ACTIVE(state_offset - (int)GET(code, 1), 0);
724           }
725         }
726       else
727         {
728         if (ptr > current_subject ||
729             ((mb->moptions & PCRE2_NOTEMPTY) == 0 &&
730               ((mb->moptions & PCRE2_NOTEMPTY_ATSTART) == 0 ||
731                 current_subject > start_subject + mb->start_offset)))
732           {
733           if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
734             else if (match_count > 0 && ++match_count * 2 > (int)offsetcount)
735               match_count = 0;
736           count = ((match_count == 0)? (int)offsetcount : match_count * 2) - 2;
737           if (count > 0) memmove(offsets + 2, offsets,
738             (size_t)count * sizeof(PCRE2_SIZE));
739           if (offsetcount >= 2)
740             {
741             offsets[0] = (PCRE2_SIZE)(current_subject - start_subject);
742             offsets[1] = (PCRE2_SIZE)(ptr - start_subject);
743             }
744           if ((mb->moptions & PCRE2_DFA_SHORTEST) != 0) return match_count;
745           }
746         }
747       break;
748 
749 /* ========================================================================== */
750       /* These opcodes add to the current list of states without looking
751       at the current character. */
752 
753       /*-----------------------------------------------------------------*/
754       case OP_ALT:
755       do { code += GET(code, 1); } while (*code == OP_ALT);
756       ADD_ACTIVE((int)(code - start_code), 0);
757       break;
758 
759       /*-----------------------------------------------------------------*/
760       case OP_BRA:
761       case OP_SBRA:
762       do
763         {
764         ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
765         code += GET(code, 1);
766         }
767       while (*code == OP_ALT);
768       break;
769 
770       /*-----------------------------------------------------------------*/
771       case OP_CBRA:
772       case OP_SCBRA:
773       ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE + IMM2_SIZE),  0);
774       code += GET(code, 1);
775       while (*code == OP_ALT)
776         {
777         ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE),  0);
778         code += GET(code, 1);
779         }
780       break;
781 
782       /*-----------------------------------------------------------------*/
783       case OP_BRAZERO:
784       case OP_BRAMINZERO:
785       ADD_ACTIVE(state_offset + 1, 0);
786       code += 1 + GET(code, 2);
787       while (*code == OP_ALT) code += GET(code, 1);
788       ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
789       break;
790 
791       /*-----------------------------------------------------------------*/
792       case OP_SKIPZERO:
793       code += 1 + GET(code, 2);
794       while (*code == OP_ALT) code += GET(code, 1);
795       ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
796       break;
797 
798       /*-----------------------------------------------------------------*/
799       case OP_CIRC:
800       if (ptr == start_subject && (mb->moptions & PCRE2_NOTBOL) == 0)
801         { ADD_ACTIVE(state_offset + 1, 0); }
802       break;
803 
804       /*-----------------------------------------------------------------*/
805       case OP_CIRCM:
806       if ((ptr == start_subject && (mb->moptions & PCRE2_NOTBOL) == 0) ||
807           ((ptr != end_subject || (mb->poptions & PCRE2_ALT_CIRCUMFLEX) != 0 )
808             && WAS_NEWLINE(ptr)))
809         { ADD_ACTIVE(state_offset + 1, 0); }
810       break;
811 
812       /*-----------------------------------------------------------------*/
813       case OP_EOD:
814       if (ptr >= end_subject)
815         {
816         if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
817           could_continue = TRUE;
818         else { ADD_ACTIVE(state_offset + 1, 0); }
819         }
820       break;
821 
822       /*-----------------------------------------------------------------*/
823       case OP_SOD:
824       if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
825       break;
826 
827       /*-----------------------------------------------------------------*/
828       case OP_SOM:
829       if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
830       break;
831 
832 
833 /* ========================================================================== */
834       /* These opcodes inspect the next subject character, and sometimes
835       the previous one as well, but do not have an argument. The variable
836       clen contains the length of the current character and is zero if we are
837       at the end of the subject. */
838 
839       /*-----------------------------------------------------------------*/
840       case OP_ANY:
841       if (clen > 0 && !IS_NEWLINE(ptr))
842         {
843         if (ptr + 1 >= mb->end_subject &&
844             (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
845             NLBLOCK->nltype == NLTYPE_FIXED &&
846             NLBLOCK->nllen == 2 &&
847             c == NLBLOCK->nl[0])
848           {
849           could_continue = partial_newline = TRUE;
850           }
851         else
852           {
853           ADD_NEW(state_offset + 1, 0);
854           }
855         }
856       break;
857 
858       /*-----------------------------------------------------------------*/
859       case OP_ALLANY:
860       if (clen > 0)
861         { ADD_NEW(state_offset + 1, 0); }
862       break;
863 
864       /*-----------------------------------------------------------------*/
865       case OP_EODN:
866       if (clen == 0 && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
867         could_continue = TRUE;
868       else if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - mb->nllen))
869         { ADD_ACTIVE(state_offset + 1, 0); }
870       break;
871 
872       /*-----------------------------------------------------------------*/
873       case OP_DOLL:
874       if ((mb->moptions & PCRE2_NOTEOL) == 0)
875         {
876         if (clen == 0 && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
877           could_continue = TRUE;
878         else if (clen == 0 ||
879             ((mb->poptions & PCRE2_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
880                (ptr == end_subject - mb->nllen)
881             ))
882           { ADD_ACTIVE(state_offset + 1, 0); }
883         else if (ptr + 1 >= mb->end_subject &&
884                  (mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0 &&
885                  NLBLOCK->nltype == NLTYPE_FIXED &&
886                  NLBLOCK->nllen == 2 &&
887                  c == NLBLOCK->nl[0])
888           {
889           if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
890             {
891             reset_could_continue = TRUE;
892             ADD_NEW_DATA(-(state_offset + 1), 0, 1);
893             }
894           else could_continue = partial_newline = TRUE;
895           }
896         }
897       break;
898 
899       /*-----------------------------------------------------------------*/
900       case OP_DOLLM:
901       if ((mb->moptions & PCRE2_NOTEOL) == 0)
902         {
903         if (clen == 0 && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
904           could_continue = TRUE;
905         else if (clen == 0 ||
906             ((mb->poptions & PCRE2_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr)))
907           { ADD_ACTIVE(state_offset + 1, 0); }
908         else if (ptr + 1 >= mb->end_subject &&
909                  (mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0 &&
910                  NLBLOCK->nltype == NLTYPE_FIXED &&
911                  NLBLOCK->nllen == 2 &&
912                  c == NLBLOCK->nl[0])
913           {
914           if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
915             {
916             reset_could_continue = TRUE;
917             ADD_NEW_DATA(-(state_offset + 1), 0, 1);
918             }
919           else could_continue = partial_newline = TRUE;
920           }
921         }
922       else if (IS_NEWLINE(ptr))
923         { ADD_ACTIVE(state_offset + 1, 0); }
924       break;
925 
926       /*-----------------------------------------------------------------*/
927 
928       case OP_DIGIT:
929       case OP_WHITESPACE:
930       case OP_WORDCHAR:
931       if (clen > 0 && c < 256 &&
932             ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
933         { ADD_NEW(state_offset + 1, 0); }
934       break;
935 
936       /*-----------------------------------------------------------------*/
937       case OP_NOT_DIGIT:
938       case OP_NOT_WHITESPACE:
939       case OP_NOT_WORDCHAR:
940       if (clen > 0 && (c >= 256 ||
941             ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
942         { ADD_NEW(state_offset + 1, 0); }
943       break;
944 
945       /*-----------------------------------------------------------------*/
946       case OP_WORD_BOUNDARY:
947       case OP_NOT_WORD_BOUNDARY:
948         {
949         int left_word, right_word;
950 
951         if (ptr > start_subject)
952           {
953           PCRE2_SPTR temp = ptr - 1;
954           if (temp < mb->start_used_ptr) mb->start_used_ptr = temp;
955 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
956           if (utf) { BACKCHAR(temp); }
957 #endif
958           GETCHARTEST(d, temp);
959 #ifdef SUPPORT_UNICODE
960           if ((mb->poptions & PCRE2_UCP) != 0)
961             {
962             if (d == '_') left_word = TRUE; else
963               {
964               uint32_t cat = UCD_CATEGORY(d);
965               left_word = (cat == ucp_L || cat == ucp_N);
966               }
967             }
968           else
969 #endif
970           left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
971           }
972         else left_word = FALSE;
973 
974         if (clen > 0)
975           {
976           if (ptr >= mb->last_used_ptr)
977             {
978             PCRE2_SPTR temp = ptr + 1;
979 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
980             if (utf) { FORWARDCHARTEST(temp, mb->end_subject); }
981 #endif
982             mb->last_used_ptr = temp;
983             }
984 #ifdef SUPPORT_UNICODE
985           if ((mb->poptions & PCRE2_UCP) != 0)
986             {
987             if (c == '_') right_word = TRUE; else
988               {
989               uint32_t cat = UCD_CATEGORY(c);
990               right_word = (cat == ucp_L || cat == ucp_N);
991               }
992             }
993           else
994 #endif
995           right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
996           }
997         else right_word = FALSE;
998 
999         if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
1000           { ADD_ACTIVE(state_offset + 1, 0); }
1001         }
1002       break;
1003 
1004 
1005       /*-----------------------------------------------------------------*/
1006       /* Check the next character by Unicode property. We will get here only
1007       if the support is in the binary; otherwise a compile-time error occurs.
1008       */
1009 
1010 #ifdef SUPPORT_UNICODE
1011       case OP_PROP:
1012       case OP_NOTPROP:
1013       if (clen > 0)
1014         {
1015         BOOL OK;
1016         const uint32_t *cp;
1017         const ucd_record * prop = GET_UCD(c);
1018         switch(code[1])
1019           {
1020           case PT_ANY:
1021           OK = TRUE;
1022           break;
1023 
1024           case PT_LAMP:
1025           OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1026                prop->chartype == ucp_Lt;
1027           break;
1028 
1029           case PT_GC:
1030           OK = PRIV(ucp_gentype)[prop->chartype] == code[2];
1031           break;
1032 
1033           case PT_PC:
1034           OK = prop->chartype == code[2];
1035           break;
1036 
1037           case PT_SC:
1038           OK = prop->script == code[2];
1039           break;
1040 
1041           /* These are specials for combination cases. */
1042 
1043           case PT_ALNUM:
1044           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1045                PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1046           break;
1047 
1048           /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1049           which means that Perl space and POSIX space are now identical. PCRE
1050           was changed at release 8.34. */
1051 
1052           case PT_SPACE:    /* Perl space */
1053           case PT_PXSPACE:  /* POSIX space */
1054           switch(c)
1055             {
1056             HSPACE_CASES:
1057             VSPACE_CASES:
1058             OK = TRUE;
1059             break;
1060 
1061             default:
1062             OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1063             break;
1064             }
1065           break;
1066 
1067           case PT_WORD:
1068           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1069                PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1070                c == CHAR_UNDERSCORE;
1071           break;
1072 
1073           case PT_CLIST:
1074           cp = PRIV(ucd_caseless_sets) + code[2];
1075           for (;;)
1076             {
1077             if (c < *cp) { OK = FALSE; break; }
1078             if (c == *cp++) { OK = TRUE; break; }
1079             }
1080           break;
1081 
1082           case PT_UCNC:
1083           OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1084                c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1085                c >= 0xe000;
1086           break;
1087 
1088           /* Should never occur, but keep compilers from grumbling. */
1089 
1090           default:
1091           OK = codevalue != OP_PROP;
1092           break;
1093           }
1094 
1095         if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
1096         }
1097       break;
1098 #endif
1099 
1100 
1101 
1102 /* ========================================================================== */
1103       /* These opcodes likewise inspect the subject character, but have an
1104       argument that is not a data character. It is one of these opcodes:
1105       OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE,
1106       OP_NOT_WHITESPACE, OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
1107 
1108       case OP_TYPEPLUS:
1109       case OP_TYPEMINPLUS:
1110       case OP_TYPEPOSPLUS:
1111       count = current_state->count;  /* Already matched */
1112       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1113       if (clen > 0)
1114         {
1115         if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1116             (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1117             NLBLOCK->nltype == NLTYPE_FIXED &&
1118             NLBLOCK->nllen == 2 &&
1119             c == NLBLOCK->nl[0])
1120           {
1121           could_continue = partial_newline = TRUE;
1122           }
1123         else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1124             (c < 256 &&
1125               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1126               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1127           {
1128           if (count > 0 && codevalue == OP_TYPEPOSPLUS)
1129             {
1130             active_count--;            /* Remove non-match possibility */
1131             next_active_state--;
1132             }
1133           count++;
1134           ADD_NEW(state_offset, count);
1135           }
1136         }
1137       break;
1138 
1139       /*-----------------------------------------------------------------*/
1140       case OP_TYPEQUERY:
1141       case OP_TYPEMINQUERY:
1142       case OP_TYPEPOSQUERY:
1143       ADD_ACTIVE(state_offset + 2, 0);
1144       if (clen > 0)
1145         {
1146         if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1147             (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1148             NLBLOCK->nltype == NLTYPE_FIXED &&
1149             NLBLOCK->nllen == 2 &&
1150             c == NLBLOCK->nl[0])
1151           {
1152           could_continue = partial_newline = TRUE;
1153           }
1154         else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1155             (c < 256 &&
1156               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1157               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1158           {
1159           if (codevalue == OP_TYPEPOSQUERY)
1160             {
1161             active_count--;            /* Remove non-match possibility */
1162             next_active_state--;
1163             }
1164           ADD_NEW(state_offset + 2, 0);
1165           }
1166         }
1167       break;
1168 
1169       /*-----------------------------------------------------------------*/
1170       case OP_TYPESTAR:
1171       case OP_TYPEMINSTAR:
1172       case OP_TYPEPOSSTAR:
1173       ADD_ACTIVE(state_offset + 2, 0);
1174       if (clen > 0)
1175         {
1176         if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1177             (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1178             NLBLOCK->nltype == NLTYPE_FIXED &&
1179             NLBLOCK->nllen == 2 &&
1180             c == NLBLOCK->nl[0])
1181           {
1182           could_continue = partial_newline = TRUE;
1183           }
1184         else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1185             (c < 256 &&
1186               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1187               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1188           {
1189           if (codevalue == OP_TYPEPOSSTAR)
1190             {
1191             active_count--;            /* Remove non-match possibility */
1192             next_active_state--;
1193             }
1194           ADD_NEW(state_offset, 0);
1195           }
1196         }
1197       break;
1198 
1199       /*-----------------------------------------------------------------*/
1200       case OP_TYPEEXACT:
1201       count = current_state->count;  /* Number already matched */
1202       if (clen > 0)
1203         {
1204         if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1205             (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1206             NLBLOCK->nltype == NLTYPE_FIXED &&
1207             NLBLOCK->nllen == 2 &&
1208             c == NLBLOCK->nl[0])
1209           {
1210           could_continue = partial_newline = TRUE;
1211           }
1212         else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1213             (c < 256 &&
1214               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1215               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1216           {
1217           if (++count >= (int)GET2(code, 1))
1218             { ADD_NEW(state_offset + 1 + IMM2_SIZE + 1, 0); }
1219           else
1220             { ADD_NEW(state_offset, count); }
1221           }
1222         }
1223       break;
1224 
1225       /*-----------------------------------------------------------------*/
1226       case OP_TYPEUPTO:
1227       case OP_TYPEMINUPTO:
1228       case OP_TYPEPOSUPTO:
1229       ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0);
1230       count = current_state->count;  /* Number already matched */
1231       if (clen > 0)
1232         {
1233         if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1234             (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1235             NLBLOCK->nltype == NLTYPE_FIXED &&
1236             NLBLOCK->nllen == 2 &&
1237             c == NLBLOCK->nl[0])
1238           {
1239           could_continue = partial_newline = TRUE;
1240           }
1241         else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1242             (c < 256 &&
1243               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1244               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1245           {
1246           if (codevalue == OP_TYPEPOSUPTO)
1247             {
1248             active_count--;           /* Remove non-match possibility */
1249             next_active_state--;
1250             }
1251           if (++count >= (int)GET2(code, 1))
1252             { ADD_NEW(state_offset + 2 + IMM2_SIZE, 0); }
1253           else
1254             { ADD_NEW(state_offset, count); }
1255           }
1256         }
1257       break;
1258 
1259 /* ========================================================================== */
1260       /* These are virtual opcodes that are used when something like
1261       OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, OP_EXTUNI, OP_HSPACE,
1262       OP_NOT_HSPACE, OP_VSPACE, or OP_NOT_VSPACE as its argument. It keeps the
1263       code above fast for the other cases. The argument is in the d variable. */
1264 
1265 #ifdef SUPPORT_UNICODE
1266       case OP_PROP_EXTRA + OP_TYPEPLUS:
1267       case OP_PROP_EXTRA + OP_TYPEMINPLUS:
1268       case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
1269       count = current_state->count;           /* Already matched */
1270       if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
1271       if (clen > 0)
1272         {
1273         BOOL OK;
1274         const uint32_t *cp;
1275         const ucd_record * prop = GET_UCD(c);
1276         switch(code[2])
1277           {
1278           case PT_ANY:
1279           OK = TRUE;
1280           break;
1281 
1282           case PT_LAMP:
1283           OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1284             prop->chartype == ucp_Lt;
1285           break;
1286 
1287           case PT_GC:
1288           OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1289           break;
1290 
1291           case PT_PC:
1292           OK = prop->chartype == code[3];
1293           break;
1294 
1295           case PT_SC:
1296           OK = prop->script == code[3];
1297           break;
1298 
1299           /* These are specials for combination cases. */
1300 
1301           case PT_ALNUM:
1302           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1303                PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1304           break;
1305 
1306           /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1307           which means that Perl space and POSIX space are now identical. PCRE
1308           was changed at release 8.34. */
1309 
1310           case PT_SPACE:    /* Perl space */
1311           case PT_PXSPACE:  /* POSIX space */
1312           switch(c)
1313             {
1314             HSPACE_CASES:
1315             VSPACE_CASES:
1316             OK = TRUE;
1317             break;
1318 
1319             default:
1320             OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1321             break;
1322             }
1323           break;
1324 
1325           case PT_WORD:
1326           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1327                PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1328                c == CHAR_UNDERSCORE;
1329           break;
1330 
1331           case PT_CLIST:
1332           cp = PRIV(ucd_caseless_sets) + code[3];
1333           for (;;)
1334             {
1335             if (c < *cp) { OK = FALSE; break; }
1336             if (c == *cp++) { OK = TRUE; break; }
1337             }
1338           break;
1339 
1340           case PT_UCNC:
1341           OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1342                c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1343                c >= 0xe000;
1344           break;
1345 
1346           /* Should never occur, but keep compilers from grumbling. */
1347 
1348           default:
1349           OK = codevalue != OP_PROP;
1350           break;
1351           }
1352 
1353         if (OK == (d == OP_PROP))
1354           {
1355           if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1356             {
1357             active_count--;           /* Remove non-match possibility */
1358             next_active_state--;
1359             }
1360           count++;
1361           ADD_NEW(state_offset, count);
1362           }
1363         }
1364       break;
1365 
1366       /*-----------------------------------------------------------------*/
1367       case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1368       case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1369       case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1370       count = current_state->count;  /* Already matched */
1371       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1372       if (clen > 0)
1373         {
1374         uint32_t lgb, rgb;
1375         PCRE2_SPTR nptr = ptr + clen;
1376         int ncount = 0;
1377         if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1378           {
1379           active_count--;           /* Remove non-match possibility */
1380           next_active_state--;
1381           }
1382         lgb = UCD_GRAPHBREAK(c);
1383         while (nptr < end_subject)
1384           {
1385           dlen = 1;
1386           if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
1387           rgb = UCD_GRAPHBREAK(d);
1388           if ((PRIV(ucp_gbtable)[lgb] & (1u << rgb)) == 0) break;
1389           ncount++;
1390           lgb = rgb;
1391           nptr += dlen;
1392           }
1393         count++;
1394         ADD_NEW_DATA(-state_offset, count, ncount);
1395         }
1396       break;
1397 #endif
1398 
1399       /*-----------------------------------------------------------------*/
1400       case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1401       case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1402       case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1403       count = current_state->count;  /* Already matched */
1404       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1405       if (clen > 0)
1406         {
1407         int ncount = 0;
1408         switch (c)
1409           {
1410           case CHAR_VT:
1411           case CHAR_FF:
1412           case CHAR_NEL:
1413 #ifndef EBCDIC
1414           case 0x2028:
1415           case 0x2029:
1416 #endif  /* Not EBCDIC */
1417           if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
1418           goto ANYNL01;
1419 
1420           case CHAR_CR:
1421           if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
1422           /* Fall through */
1423 
1424           ANYNL01:
1425           case CHAR_LF:
1426           if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1427             {
1428             active_count--;           /* Remove non-match possibility */
1429             next_active_state--;
1430             }
1431           count++;
1432           ADD_NEW_DATA(-state_offset, count, ncount);
1433           break;
1434 
1435           default:
1436           break;
1437           }
1438         }
1439       break;
1440 
1441       /*-----------------------------------------------------------------*/
1442       case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1443       case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1444       case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1445       count = current_state->count;  /* Already matched */
1446       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1447       if (clen > 0)
1448         {
1449         BOOL OK;
1450         switch (c)
1451           {
1452           VSPACE_CASES:
1453           OK = TRUE;
1454           break;
1455 
1456           default:
1457           OK = FALSE;
1458           break;
1459           }
1460 
1461         if (OK == (d == OP_VSPACE))
1462           {
1463           if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1464             {
1465             active_count--;           /* Remove non-match possibility */
1466             next_active_state--;
1467             }
1468           count++;
1469           ADD_NEW_DATA(-state_offset, count, 0);
1470           }
1471         }
1472       break;
1473 
1474       /*-----------------------------------------------------------------*/
1475       case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1476       case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1477       case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1478       count = current_state->count;  /* Already matched */
1479       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1480       if (clen > 0)
1481         {
1482         BOOL OK;
1483         switch (c)
1484           {
1485           HSPACE_CASES:
1486           OK = TRUE;
1487           break;
1488 
1489           default:
1490           OK = FALSE;
1491           break;
1492           }
1493 
1494         if (OK == (d == OP_HSPACE))
1495           {
1496           if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1497             {
1498             active_count--;           /* Remove non-match possibility */
1499             next_active_state--;
1500             }
1501           count++;
1502           ADD_NEW_DATA(-state_offset, count, 0);
1503           }
1504         }
1505       break;
1506 
1507       /*-----------------------------------------------------------------*/
1508 #ifdef SUPPORT_UNICODE
1509       case OP_PROP_EXTRA + OP_TYPEQUERY:
1510       case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1511       case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1512       count = 4;
1513       goto QS1;
1514 
1515       case OP_PROP_EXTRA + OP_TYPESTAR:
1516       case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1517       case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1518       count = 0;
1519 
1520       QS1:
1521 
1522       ADD_ACTIVE(state_offset + 4, 0);
1523       if (clen > 0)
1524         {
1525         BOOL OK;
1526         const uint32_t *cp;
1527         const ucd_record * prop = GET_UCD(c);
1528         switch(code[2])
1529           {
1530           case PT_ANY:
1531           OK = TRUE;
1532           break;
1533 
1534           case PT_LAMP:
1535           OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1536             prop->chartype == ucp_Lt;
1537           break;
1538 
1539           case PT_GC:
1540           OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1541           break;
1542 
1543           case PT_PC:
1544           OK = prop->chartype == code[3];
1545           break;
1546 
1547           case PT_SC:
1548           OK = prop->script == code[3];
1549           break;
1550 
1551           /* These are specials for combination cases. */
1552 
1553           case PT_ALNUM:
1554           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1555                PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1556           break;
1557 
1558           /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1559           which means that Perl space and POSIX space are now identical. PCRE
1560           was changed at release 8.34. */
1561 
1562           case PT_SPACE:    /* Perl space */
1563           case PT_PXSPACE:  /* POSIX space */
1564           switch(c)
1565             {
1566             HSPACE_CASES:
1567             VSPACE_CASES:
1568             OK = TRUE;
1569             break;
1570 
1571             default:
1572             OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1573             break;
1574             }
1575           break;
1576 
1577           case PT_WORD:
1578           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1579                PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1580                c == CHAR_UNDERSCORE;
1581           break;
1582 
1583           case PT_CLIST:
1584           cp = PRIV(ucd_caseless_sets) + code[3];
1585           for (;;)
1586             {
1587             if (c < *cp) { OK = FALSE; break; }
1588             if (c == *cp++) { OK = TRUE; break; }
1589             }
1590           break;
1591 
1592           case PT_UCNC:
1593           OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1594                c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1595                c >= 0xe000;
1596           break;
1597 
1598           /* Should never occur, but keep compilers from grumbling. */
1599 
1600           default:
1601           OK = codevalue != OP_PROP;
1602           break;
1603           }
1604 
1605         if (OK == (d == OP_PROP))
1606           {
1607           if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1608               codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1609             {
1610             active_count--;           /* Remove non-match possibility */
1611             next_active_state--;
1612             }
1613           ADD_NEW(state_offset + count, 0);
1614           }
1615         }
1616       break;
1617 
1618       /*-----------------------------------------------------------------*/
1619       case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1620       case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1621       case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1622       count = 2;
1623       goto QS2;
1624 
1625       case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1626       case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1627       case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1628       count = 0;
1629 
1630       QS2:
1631 
1632       ADD_ACTIVE(state_offset + 2, 0);
1633       if (clen > 0)
1634         {
1635         uint32_t lgb, rgb;
1636         PCRE2_SPTR nptr = ptr + clen;
1637         int ncount = 0;
1638         if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1639             codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1640           {
1641           active_count--;           /* Remove non-match possibility */
1642           next_active_state--;
1643           }
1644         lgb = UCD_GRAPHBREAK(c);
1645         while (nptr < end_subject)
1646           {
1647           dlen = 1;
1648           if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
1649           rgb = UCD_GRAPHBREAK(d);
1650           if ((PRIV(ucp_gbtable)[lgb] & (1u << rgb)) == 0) break;
1651           ncount++;
1652           lgb = rgb;
1653           nptr += dlen;
1654           }
1655         ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1656         }
1657       break;
1658 #endif
1659 
1660       /*-----------------------------------------------------------------*/
1661       case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1662       case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1663       case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1664       count = 2;
1665       goto QS3;
1666 
1667       case OP_ANYNL_EXTRA + OP_TYPESTAR:
1668       case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1669       case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1670       count = 0;
1671 
1672       QS3:
1673       ADD_ACTIVE(state_offset + 2, 0);
1674       if (clen > 0)
1675         {
1676         int ncount = 0;
1677         switch (c)
1678           {
1679           case CHAR_VT:
1680           case CHAR_FF:
1681           case CHAR_NEL:
1682 #ifndef EBCDIC
1683           case 0x2028:
1684           case 0x2029:
1685 #endif  /* Not EBCDIC */
1686           if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
1687           goto ANYNL02;
1688 
1689           case CHAR_CR:
1690           if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
1691           /* Fall through */
1692 
1693           ANYNL02:
1694           case CHAR_LF:
1695           if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1696               codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1697             {
1698             active_count--;           /* Remove non-match possibility */
1699             next_active_state--;
1700             }
1701           ADD_NEW_DATA(-(state_offset + (int)count), 0, ncount);
1702           break;
1703 
1704           default:
1705           break;
1706           }
1707         }
1708       break;
1709 
1710       /*-----------------------------------------------------------------*/
1711       case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1712       case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1713       case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1714       count = 2;
1715       goto QS4;
1716 
1717       case OP_VSPACE_EXTRA + OP_TYPESTAR:
1718       case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1719       case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1720       count = 0;
1721 
1722       QS4:
1723       ADD_ACTIVE(state_offset + 2, 0);
1724       if (clen > 0)
1725         {
1726         BOOL OK;
1727         switch (c)
1728           {
1729           VSPACE_CASES:
1730           OK = TRUE;
1731           break;
1732 
1733           default:
1734           OK = FALSE;
1735           break;
1736           }
1737         if (OK == (d == OP_VSPACE))
1738           {
1739           if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1740               codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1741             {
1742             active_count--;           /* Remove non-match possibility */
1743             next_active_state--;
1744             }
1745           ADD_NEW_DATA(-(state_offset + (int)count), 0, 0);
1746           }
1747         }
1748       break;
1749 
1750       /*-----------------------------------------------------------------*/
1751       case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1752       case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1753       case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1754       count = 2;
1755       goto QS5;
1756 
1757       case OP_HSPACE_EXTRA + OP_TYPESTAR:
1758       case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1759       case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1760       count = 0;
1761 
1762       QS5:
1763       ADD_ACTIVE(state_offset + 2, 0);
1764       if (clen > 0)
1765         {
1766         BOOL OK;
1767         switch (c)
1768           {
1769           HSPACE_CASES:
1770           OK = TRUE;
1771           break;
1772 
1773           default:
1774           OK = FALSE;
1775           break;
1776           }
1777 
1778         if (OK == (d == OP_HSPACE))
1779           {
1780           if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1781               codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1782             {
1783             active_count--;           /* Remove non-match possibility */
1784             next_active_state--;
1785             }
1786           ADD_NEW_DATA(-(state_offset + (int)count), 0, 0);
1787           }
1788         }
1789       break;
1790 
1791       /*-----------------------------------------------------------------*/
1792 #ifdef SUPPORT_UNICODE
1793       case OP_PROP_EXTRA + OP_TYPEEXACT:
1794       case OP_PROP_EXTRA + OP_TYPEUPTO:
1795       case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1796       case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1797       if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1798         { ADD_ACTIVE(state_offset + 1 + IMM2_SIZE + 3, 0); }
1799       count = current_state->count;  /* Number already matched */
1800       if (clen > 0)
1801         {
1802         BOOL OK;
1803         const uint32_t *cp;
1804         const ucd_record * prop = GET_UCD(c);
1805         switch(code[1 + IMM2_SIZE + 1])
1806           {
1807           case PT_ANY:
1808           OK = TRUE;
1809           break;
1810 
1811           case PT_LAMP:
1812           OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1813             prop->chartype == ucp_Lt;
1814           break;
1815 
1816           case PT_GC:
1817           OK = PRIV(ucp_gentype)[prop->chartype] == code[1 + IMM2_SIZE + 2];
1818           break;
1819 
1820           case PT_PC:
1821           OK = prop->chartype == code[1 + IMM2_SIZE + 2];
1822           break;
1823 
1824           case PT_SC:
1825           OK = prop->script == code[1 + IMM2_SIZE + 2];
1826           break;
1827 
1828           /* These are specials for combination cases. */
1829 
1830           case PT_ALNUM:
1831           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1832                PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1833           break;
1834 
1835           /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1836           which means that Perl space and POSIX space are now identical. PCRE
1837           was changed at release 8.34. */
1838 
1839           case PT_SPACE:    /* Perl space */
1840           case PT_PXSPACE:  /* POSIX space */
1841           switch(c)
1842             {
1843             HSPACE_CASES:
1844             VSPACE_CASES:
1845             OK = TRUE;
1846             break;
1847 
1848             default:
1849             OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1850             break;
1851             }
1852           break;
1853 
1854           case PT_WORD:
1855           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1856                PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1857                c == CHAR_UNDERSCORE;
1858           break;
1859 
1860           case PT_CLIST:
1861           cp = PRIV(ucd_caseless_sets) + code[1 + IMM2_SIZE + 2];
1862           for (;;)
1863             {
1864             if (c < *cp) { OK = FALSE; break; }
1865             if (c == *cp++) { OK = TRUE; break; }
1866             }
1867           break;
1868 
1869           case PT_UCNC:
1870           OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1871                c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1872                c >= 0xe000;
1873           break;
1874 
1875           /* Should never occur, but keep compilers from grumbling. */
1876 
1877           default:
1878           OK = codevalue != OP_PROP;
1879           break;
1880           }
1881 
1882         if (OK == (d == OP_PROP))
1883           {
1884           if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1885             {
1886             active_count--;           /* Remove non-match possibility */
1887             next_active_state--;
1888             }
1889           if (++count >= (int)GET2(code, 1))
1890             { ADD_NEW(state_offset + 1 + IMM2_SIZE + 3, 0); }
1891           else
1892             { ADD_NEW(state_offset, count); }
1893           }
1894         }
1895       break;
1896 
1897       /*-----------------------------------------------------------------*/
1898       case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1899       case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1900       case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1901       case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1902       if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1903         { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1904       count = current_state->count;  /* Number already matched */
1905       if (clen > 0)
1906         {
1907         uint32_t lgb, rgb;
1908         PCRE2_SPTR nptr = ptr + clen;
1909         int ncount = 0;
1910         if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1911           {
1912           active_count--;           /* Remove non-match possibility */
1913           next_active_state--;
1914           }
1915         lgb = UCD_GRAPHBREAK(c);
1916         while (nptr < end_subject)
1917           {
1918           dlen = 1;
1919           if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
1920           rgb = UCD_GRAPHBREAK(d);
1921           if ((PRIV(ucp_gbtable)[lgb] & (1u << rgb)) == 0) break;
1922           ncount++;
1923           lgb = rgb;
1924           nptr += dlen;
1925           }
1926         if (nptr >= end_subject && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1927             reset_could_continue = TRUE;
1928         if (++count >= (int)GET2(code, 1))
1929           { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
1930         else
1931           { ADD_NEW_DATA(-state_offset, count, ncount); }
1932         }
1933       break;
1934 #endif
1935 
1936       /*-----------------------------------------------------------------*/
1937       case OP_ANYNL_EXTRA + OP_TYPEEXACT:
1938       case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1939       case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1940       case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1941       if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1942         { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1943       count = current_state->count;  /* Number already matched */
1944       if (clen > 0)
1945         {
1946         int ncount = 0;
1947         switch (c)
1948           {
1949           case CHAR_VT:
1950           case CHAR_FF:
1951           case CHAR_NEL:
1952 #ifndef EBCDIC
1953           case 0x2028:
1954           case 0x2029:
1955 #endif  /* Not EBCDIC */
1956           if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
1957           goto ANYNL03;
1958 
1959           case CHAR_CR:
1960           if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
1961           /* Fall through */
1962 
1963           ANYNL03:
1964           case CHAR_LF:
1965           if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1966             {
1967             active_count--;           /* Remove non-match possibility */
1968             next_active_state--;
1969             }
1970           if (++count >= (int)GET2(code, 1))
1971             { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
1972           else
1973             { ADD_NEW_DATA(-state_offset, count, ncount); }
1974           break;
1975 
1976           default:
1977           break;
1978           }
1979         }
1980       break;
1981 
1982       /*-----------------------------------------------------------------*/
1983       case OP_VSPACE_EXTRA + OP_TYPEEXACT:
1984       case OP_VSPACE_EXTRA + OP_TYPEUPTO:
1985       case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
1986       case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
1987       if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
1988         { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1989       count = current_state->count;  /* Number already matched */
1990       if (clen > 0)
1991         {
1992         BOOL OK;
1993         switch (c)
1994           {
1995           VSPACE_CASES:
1996           OK = TRUE;
1997           break;
1998 
1999           default:
2000           OK = FALSE;
2001           }
2002 
2003         if (OK == (d == OP_VSPACE))
2004           {
2005           if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
2006             {
2007             active_count--;           /* Remove non-match possibility */
2008             next_active_state--;
2009             }
2010           if (++count >= (int)GET2(code, 1))
2011             { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2012           else
2013             { ADD_NEW_DATA(-state_offset, count, 0); }
2014           }
2015         }
2016       break;
2017 
2018       /*-----------------------------------------------------------------*/
2019       case OP_HSPACE_EXTRA + OP_TYPEEXACT:
2020       case OP_HSPACE_EXTRA + OP_TYPEUPTO:
2021       case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
2022       case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
2023       if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
2024         { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2025       count = current_state->count;  /* Number already matched */
2026       if (clen > 0)
2027         {
2028         BOOL OK;
2029         switch (c)
2030           {
2031           HSPACE_CASES:
2032           OK = TRUE;
2033           break;
2034 
2035           default:
2036           OK = FALSE;
2037           break;
2038           }
2039 
2040         if (OK == (d == OP_HSPACE))
2041           {
2042           if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
2043             {
2044             active_count--;           /* Remove non-match possibility */
2045             next_active_state--;
2046             }
2047           if (++count >= (int)GET2(code, 1))
2048             { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2049           else
2050             { ADD_NEW_DATA(-state_offset, count, 0); }
2051           }
2052         }
2053       break;
2054 
2055 /* ========================================================================== */
2056       /* These opcodes are followed by a character that is usually compared
2057       to the current subject character; it is loaded into d. We still get
2058       here even if there is no subject character, because in some cases zero
2059       repetitions are permitted. */
2060 
2061       /*-----------------------------------------------------------------*/
2062       case OP_CHAR:
2063       if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
2064       break;
2065 
2066       /*-----------------------------------------------------------------*/
2067       case OP_CHARI:
2068       if (clen == 0) break;
2069 
2070 #ifdef SUPPORT_UNICODE
2071       if (utf)
2072         {
2073         if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
2074           {
2075           unsigned int othercase;
2076           if (c < 128)
2077             othercase = fcc[c];
2078           else
2079             othercase = UCD_OTHERCASE(c);
2080           if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
2081           }
2082         }
2083       else
2084 #endif  /* SUPPORT_UNICODE */
2085       /* Not UTF mode */
2086         {
2087         if (TABLE_GET(c, lcc, c) == TABLE_GET(d, lcc, d))
2088           { ADD_NEW(state_offset + 2, 0); }
2089         }
2090       break;
2091 
2092 
2093 #ifdef SUPPORT_UNICODE
2094       /*-----------------------------------------------------------------*/
2095       /* This is a tricky one because it can match more than one character.
2096       Find out how many characters to skip, and then set up a negative state
2097       to wait for them to pass before continuing. */
2098 
2099       case OP_EXTUNI:
2100       if (clen > 0)
2101         {
2102         uint32_t lgb, rgb;
2103         PCRE2_SPTR nptr = ptr + clen;
2104         int ncount = 0;
2105         lgb = UCD_GRAPHBREAK(c);
2106         while (nptr < end_subject)
2107           {
2108           dlen = 1;
2109           if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
2110           rgb = UCD_GRAPHBREAK(d);
2111           if ((PRIV(ucp_gbtable)[lgb] & (1u << rgb)) == 0) break;
2112           ncount++;
2113           lgb = rgb;
2114           nptr += dlen;
2115           }
2116         if (nptr >= end_subject && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
2117             reset_could_continue = TRUE;
2118         ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
2119         }
2120       break;
2121 #endif
2122 
2123       /*-----------------------------------------------------------------*/
2124       /* This is tricky like EXTUNI because it too can match more than one
2125       character (when CR is followed by LF). In this case, set up a negative
2126       state to wait for one character to pass before continuing. */
2127 
2128       case OP_ANYNL:
2129       if (clen > 0) switch(c)
2130         {
2131         case CHAR_VT:
2132         case CHAR_FF:
2133         case CHAR_NEL:
2134 #ifndef EBCDIC
2135         case 0x2028:
2136         case 0x2029:
2137 #endif  /* Not EBCDIC */
2138         if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
2139 
2140         case CHAR_LF:
2141         ADD_NEW(state_offset + 1, 0);
2142         break;
2143 
2144         case CHAR_CR:
2145         if (ptr + 1 >= end_subject)
2146           {
2147           ADD_NEW(state_offset + 1, 0);
2148           if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
2149             reset_could_continue = TRUE;
2150           }
2151         else if (UCHAR21TEST(ptr + 1) == CHAR_LF)
2152           {
2153           ADD_NEW_DATA(-(state_offset + 1), 0, 1);
2154           }
2155         else
2156           {
2157           ADD_NEW(state_offset + 1, 0);
2158           }
2159         break;
2160         }
2161       break;
2162 
2163       /*-----------------------------------------------------------------*/
2164       case OP_NOT_VSPACE:
2165       if (clen > 0) switch(c)
2166         {
2167         VSPACE_CASES:
2168         break;
2169 
2170         default:
2171         ADD_NEW(state_offset + 1, 0);
2172         break;
2173         }
2174       break;
2175 
2176       /*-----------------------------------------------------------------*/
2177       case OP_VSPACE:
2178       if (clen > 0) switch(c)
2179         {
2180         VSPACE_CASES:
2181         ADD_NEW(state_offset + 1, 0);
2182         break;
2183 
2184         default:
2185         break;
2186         }
2187       break;
2188 
2189       /*-----------------------------------------------------------------*/
2190       case OP_NOT_HSPACE:
2191       if (clen > 0) switch(c)
2192         {
2193         HSPACE_CASES:
2194         break;
2195 
2196         default:
2197         ADD_NEW(state_offset + 1, 0);
2198         break;
2199         }
2200       break;
2201 
2202       /*-----------------------------------------------------------------*/
2203       case OP_HSPACE:
2204       if (clen > 0) switch(c)
2205         {
2206         HSPACE_CASES:
2207         ADD_NEW(state_offset + 1, 0);
2208         break;
2209 
2210         default:
2211         break;
2212         }
2213       break;
2214 
2215       /*-----------------------------------------------------------------*/
2216       /* Match a negated single character casefully. */
2217 
2218       case OP_NOT:
2219       if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); }
2220       break;
2221 
2222       /*-----------------------------------------------------------------*/
2223       /* Match a negated single character caselessly. */
2224 
2225       case OP_NOTI:
2226       if (clen > 0)
2227         {
2228         unsigned int otherd;
2229 #ifdef SUPPORT_UNICODE
2230         if (utf && d >= 128)
2231           otherd = UCD_OTHERCASE(d);
2232         else
2233 #endif  /* SUPPORT_UNICODE */
2234         otherd = TABLE_GET(d, fcc, d);
2235         if (c != d && c != otherd)
2236           { ADD_NEW(state_offset + dlen + 1, 0); }
2237         }
2238       break;
2239 
2240       /*-----------------------------------------------------------------*/
2241       case OP_PLUSI:
2242       case OP_MINPLUSI:
2243       case OP_POSPLUSI:
2244       case OP_NOTPLUSI:
2245       case OP_NOTMINPLUSI:
2246       case OP_NOTPOSPLUSI:
2247       caseless = TRUE;
2248       codevalue -= OP_STARI - OP_STAR;
2249 
2250       /* Fall through */
2251       case OP_PLUS:
2252       case OP_MINPLUS:
2253       case OP_POSPLUS:
2254       case OP_NOTPLUS:
2255       case OP_NOTMINPLUS:
2256       case OP_NOTPOSPLUS:
2257       count = current_state->count;  /* Already matched */
2258       if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
2259       if (clen > 0)
2260         {
2261         uint32_t otherd = NOTACHAR;
2262         if (caseless)
2263           {
2264 #ifdef SUPPORT_UNICODE
2265           if (utf && d >= 128)
2266             otherd = UCD_OTHERCASE(d);
2267           else
2268 #endif  /* SUPPORT_UNICODE */
2269           otherd = TABLE_GET(d, fcc, d);
2270           }
2271         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2272           {
2273           if (count > 0 &&
2274               (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
2275             {
2276             active_count--;             /* Remove non-match possibility */
2277             next_active_state--;
2278             }
2279           count++;
2280           ADD_NEW(state_offset, count);
2281           }
2282         }
2283       break;
2284 
2285       /*-----------------------------------------------------------------*/
2286       case OP_QUERYI:
2287       case OP_MINQUERYI:
2288       case OP_POSQUERYI:
2289       case OP_NOTQUERYI:
2290       case OP_NOTMINQUERYI:
2291       case OP_NOTPOSQUERYI:
2292       caseless = TRUE;
2293       codevalue -= OP_STARI - OP_STAR;
2294       /* Fall through */
2295       case OP_QUERY:
2296       case OP_MINQUERY:
2297       case OP_POSQUERY:
2298       case OP_NOTQUERY:
2299       case OP_NOTMINQUERY:
2300       case OP_NOTPOSQUERY:
2301       ADD_ACTIVE(state_offset + dlen + 1, 0);
2302       if (clen > 0)
2303         {
2304         uint32_t otherd = NOTACHAR;
2305         if (caseless)
2306           {
2307 #ifdef SUPPORT_UNICODE
2308           if (utf && d >= 128)
2309             otherd = UCD_OTHERCASE(d);
2310           else
2311 #endif  /* SUPPORT_UNICODE */
2312           otherd = TABLE_GET(d, fcc, d);
2313           }
2314         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2315           {
2316           if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
2317             {
2318             active_count--;            /* Remove non-match possibility */
2319             next_active_state--;
2320             }
2321           ADD_NEW(state_offset + dlen + 1, 0);
2322           }
2323         }
2324       break;
2325 
2326       /*-----------------------------------------------------------------*/
2327       case OP_STARI:
2328       case OP_MINSTARI:
2329       case OP_POSSTARI:
2330       case OP_NOTSTARI:
2331       case OP_NOTMINSTARI:
2332       case OP_NOTPOSSTARI:
2333       caseless = TRUE;
2334       codevalue -= OP_STARI - OP_STAR;
2335       /* Fall through */
2336       case OP_STAR:
2337       case OP_MINSTAR:
2338       case OP_POSSTAR:
2339       case OP_NOTSTAR:
2340       case OP_NOTMINSTAR:
2341       case OP_NOTPOSSTAR:
2342       ADD_ACTIVE(state_offset + dlen + 1, 0);
2343       if (clen > 0)
2344         {
2345         uint32_t otherd = NOTACHAR;
2346         if (caseless)
2347           {
2348 #ifdef SUPPORT_UNICODE
2349           if (utf && d >= 128)
2350             otherd = UCD_OTHERCASE(d);
2351           else
2352 #endif  /* SUPPORT_UNICODE */
2353           otherd = TABLE_GET(d, fcc, d);
2354           }
2355         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2356           {
2357           if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
2358             {
2359             active_count--;            /* Remove non-match possibility */
2360             next_active_state--;
2361             }
2362           ADD_NEW(state_offset, 0);
2363           }
2364         }
2365       break;
2366 
2367       /*-----------------------------------------------------------------*/
2368       case OP_EXACTI:
2369       case OP_NOTEXACTI:
2370       caseless = TRUE;
2371       codevalue -= OP_STARI - OP_STAR;
2372       /* Fall through */
2373       case OP_EXACT:
2374       case OP_NOTEXACT:
2375       count = current_state->count;  /* Number already matched */
2376       if (clen > 0)
2377         {
2378         uint32_t otherd = NOTACHAR;
2379         if (caseless)
2380           {
2381 #ifdef SUPPORT_UNICODE
2382           if (utf && d >= 128)
2383             otherd = UCD_OTHERCASE(d);
2384           else
2385 #endif  /* SUPPORT_UNICODE */
2386           otherd = TABLE_GET(d, fcc, d);
2387           }
2388         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2389           {
2390           if (++count >= (int)GET2(code, 1))
2391             { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2392           else
2393             { ADD_NEW(state_offset, count); }
2394           }
2395         }
2396       break;
2397 
2398       /*-----------------------------------------------------------------*/
2399       case OP_UPTOI:
2400       case OP_MINUPTOI:
2401       case OP_POSUPTOI:
2402       case OP_NOTUPTOI:
2403       case OP_NOTMINUPTOI:
2404       case OP_NOTPOSUPTOI:
2405       caseless = TRUE;
2406       codevalue -= OP_STARI - OP_STAR;
2407       /* Fall through */
2408       case OP_UPTO:
2409       case OP_MINUPTO:
2410       case OP_POSUPTO:
2411       case OP_NOTUPTO:
2412       case OP_NOTMINUPTO:
2413       case OP_NOTPOSUPTO:
2414       ADD_ACTIVE(state_offset + dlen + 1 + IMM2_SIZE, 0);
2415       count = current_state->count;  /* Number already matched */
2416       if (clen > 0)
2417         {
2418         uint32_t otherd = NOTACHAR;
2419         if (caseless)
2420           {
2421 #ifdef SUPPORT_UNICODE
2422           if (utf && d >= 128)
2423             otherd = UCD_OTHERCASE(d);
2424           else
2425 #endif  /* SUPPORT_UNICODE */
2426           otherd = TABLE_GET(d, fcc, d);
2427           }
2428         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2429           {
2430           if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2431             {
2432             active_count--;             /* Remove non-match possibility */
2433             next_active_state--;
2434             }
2435           if (++count >= (int)GET2(code, 1))
2436             { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2437           else
2438             { ADD_NEW(state_offset, count); }
2439           }
2440         }
2441       break;
2442 
2443 
2444 /* ========================================================================== */
2445       /* These are the class-handling opcodes */
2446 
2447       case OP_CLASS:
2448       case OP_NCLASS:
2449       case OP_XCLASS:
2450         {
2451         BOOL isinclass = FALSE;
2452         int next_state_offset;
2453         PCRE2_SPTR ecode;
2454 
2455         /* For a simple class, there is always just a 32-byte table, and we
2456         can set isinclass from it. */
2457 
2458         if (codevalue != OP_XCLASS)
2459           {
2460           ecode = code + 1 + (32 / sizeof(PCRE2_UCHAR));
2461           if (clen > 0)
2462             {
2463             isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2464               ((((uint8_t *)(code + 1))[c/8] & (1 << (c&7))) != 0);
2465             }
2466           }
2467 
2468         /* An extended class may have a table or a list of single characters,
2469         ranges, or both, and it may be positive or negative. There's a
2470         function that sorts all this out. */
2471 
2472         else
2473          {
2474          ecode = code + GET(code, 1);
2475          if (clen > 0) isinclass = PRIV(xclass)(c, code + 1 + LINK_SIZE, utf);
2476          }
2477 
2478         /* At this point, isinclass is set for all kinds of class, and ecode
2479         points to the byte after the end of the class. If there is a
2480         quantifier, this is where it will be. */
2481 
2482         next_state_offset = (int)(ecode - start_code);
2483 
2484         switch (*ecode)
2485           {
2486           case OP_CRSTAR:
2487           case OP_CRMINSTAR:
2488           case OP_CRPOSSTAR:
2489           ADD_ACTIVE(next_state_offset + 1, 0);
2490           if (isinclass)
2491             {
2492             if (*ecode == OP_CRPOSSTAR)
2493               {
2494               active_count--;           /* Remove non-match possibility */
2495               next_active_state--;
2496               }
2497             ADD_NEW(state_offset, 0);
2498             }
2499           break;
2500 
2501           case OP_CRPLUS:
2502           case OP_CRMINPLUS:
2503           case OP_CRPOSPLUS:
2504           count = current_state->count;  /* Already matched */
2505           if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2506           if (isinclass)
2507             {
2508             if (count > 0 && *ecode == OP_CRPOSPLUS)
2509               {
2510               active_count--;           /* Remove non-match possibility */
2511               next_active_state--;
2512               }
2513             count++;
2514             ADD_NEW(state_offset, count);
2515             }
2516           break;
2517 
2518           case OP_CRQUERY:
2519           case OP_CRMINQUERY:
2520           case OP_CRPOSQUERY:
2521           ADD_ACTIVE(next_state_offset + 1, 0);
2522           if (isinclass)
2523             {
2524             if (*ecode == OP_CRPOSQUERY)
2525               {
2526               active_count--;           /* Remove non-match possibility */
2527               next_active_state--;
2528               }
2529             ADD_NEW(next_state_offset + 1, 0);
2530             }
2531           break;
2532 
2533           case OP_CRRANGE:
2534           case OP_CRMINRANGE:
2535           case OP_CRPOSRANGE:
2536           count = current_state->count;  /* Already matched */
2537           if (count >= (int)GET2(ecode, 1))
2538             { ADD_ACTIVE(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2539           if (isinclass)
2540             {
2541             int max = (int)GET2(ecode, 1 + IMM2_SIZE);
2542             if (*ecode == OP_CRPOSRANGE)
2543               {
2544               active_count--;           /* Remove non-match possibility */
2545               next_active_state--;
2546               }
2547             if (++count >= max && max != 0)   /* Max 0 => no limit */
2548               { ADD_NEW(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2549             else
2550               { ADD_NEW(state_offset, count); }
2551             }
2552           break;
2553 
2554           default:
2555           if (isinclass) { ADD_NEW(next_state_offset, 0); }
2556           break;
2557           }
2558         }
2559       break;
2560 
2561 /* ========================================================================== */
2562       /* These are the opcodes for fancy brackets of various kinds. We have
2563       to use recursion in order to handle them. The "always failing" assertion
2564       (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2565       though the other "backtracking verbs" are not supported. */
2566 
2567       case OP_FAIL:
2568       forced_fail++;    /* Count FAILs for multiple states */
2569       break;
2570 
2571       case OP_ASSERT:
2572       case OP_ASSERT_NOT:
2573       case OP_ASSERTBACK:
2574       case OP_ASSERTBACK_NOT:
2575         {
2576         PCRE2_SPTR endasscode = code + GET(code, 1);
2577         PCRE2_SIZE local_offsets[2];
2578         int rc;
2579         int local_workspace[1000];
2580 
2581         while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2582 
2583         rc = internal_dfa_match(
2584           mb,                                   /* static match data */
2585           code,                                 /* this subexpression's code */
2586           ptr,                                  /* where we currently are */
2587           (PCRE2_SIZE)(ptr - start_subject),    /* start offset */
2588           local_offsets,                        /* offset vector */
2589           sizeof(local_offsets)/sizeof(PCRE2_SIZE), /* size of same */
2590           local_workspace,                      /* workspace vector */
2591           sizeof(local_workspace)/sizeof(int),  /* size of same */
2592           rlevel);                              /* function recursion level */
2593 
2594         if (rc == PCRE2_ERROR_DFA_UITEM) return rc;
2595         if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2596             { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2597         }
2598       break;
2599 
2600       /*-----------------------------------------------------------------*/
2601       case OP_COND:
2602       case OP_SCOND:
2603         {
2604         PCRE2_SIZE local_offsets[1000];
2605         int local_workspace[1000];
2606         int codelink = (int)GET(code, 1);
2607         PCRE2_UCHAR condcode;
2608 
2609         /* Because of the way auto-callout works during compile, a callout item
2610         is inserted between OP_COND and an assertion condition. This does not
2611         happen for the other conditions. */
2612 
2613         if (code[LINK_SIZE + 1] == OP_CALLOUT
2614             || code[LINK_SIZE + 1] == OP_CALLOUT_STR)
2615           {
2616           PCRE2_SIZE callout_length = (code[LINK_SIZE + 1] == OP_CALLOUT)?
2617             (PCRE2_SIZE)PRIV(OP_lengths)[OP_CALLOUT] :
2618             (PCRE2_SIZE)GET(code, 2 + 3*LINK_SIZE);
2619 
2620           rrc = 0;
2621           if (mb->callout != NULL)
2622             {
2623             pcre2_callout_block cb;
2624             cb.version          = 1;
2625             cb.capture_top      = 1;
2626             cb.capture_last     = 0;
2627             cb.offset_vector    = offsets;
2628             cb.mark             = NULL;   /* No (*MARK) support */
2629             cb.subject          = start_subject;
2630             cb.subject_length   = (PCRE2_SIZE)(end_subject - start_subject);
2631             cb.start_match      = (PCRE2_SIZE)(current_subject - start_subject);
2632             cb.current_position = (PCRE2_SIZE)(ptr - start_subject);
2633             cb.pattern_position = GET(code, LINK_SIZE + 2);
2634             cb.next_item_length = GET(code, LINK_SIZE + 2 + LINK_SIZE);
2635 
2636             if (code[LINK_SIZE + 1] == OP_CALLOUT)
2637               {
2638               cb.callout_number = code[2 + 3*LINK_SIZE];
2639               cb.callout_string_offset = 0;
2640               cb.callout_string = NULL;
2641               cb.callout_string_length = 0;
2642               }
2643             else
2644               {
2645               cb.callout_number = 0;
2646               cb.callout_string_offset = GET(code, 2 + 4*LINK_SIZE);
2647               cb.callout_string = code + (2 + 5*LINK_SIZE) + 1;
2648               cb.callout_string_length =
2649                 callout_length - (1 + 4*LINK_SIZE) - 2;
2650               }
2651 
2652             if ((rrc = (mb->callout)(&cb, mb->callout_data)) < 0)
2653               return rrc;   /* Abandon */
2654             }
2655           if (rrc > 0) break;                      /* Fail this thread */
2656           code += callout_length;                  /* Skip callout data */
2657           }
2658 
2659         condcode = code[LINK_SIZE+1];
2660 
2661         /* Back reference conditions and duplicate named recursion conditions
2662         are not supported */
2663 
2664         if (condcode == OP_CREF || condcode == OP_DNCREF ||
2665             condcode == OP_DNRREF)
2666           return PCRE2_ERROR_DFA_UCOND;
2667 
2668         /* The DEFINE condition is always false, and the assertion (?!) is
2669         converted to OP_FAIL. */
2670 
2671         if (condcode == OP_FALSE || condcode == OP_FAIL)
2672           { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2673 
2674         /* There is also an always-true condition */
2675 
2676         else if (condcode == OP_TRUE)
2677           { ADD_ACTIVE(state_offset + LINK_SIZE + 2 + IMM2_SIZE, 0); }
2678 
2679         /* The only supported version of OP_RREF is for the value RREF_ANY,
2680         which means "test if in any recursion". We can't test for specifically
2681         recursed groups. */
2682 
2683         else if (condcode == OP_RREF)
2684           {
2685           unsigned int value = GET2(code, LINK_SIZE + 2);
2686           if (value != RREF_ANY) return PCRE2_ERROR_DFA_UCOND;
2687           if (mb->recursive != NULL)
2688             { ADD_ACTIVE(state_offset + LINK_SIZE + 2 + IMM2_SIZE, 0); }
2689           else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2690           }
2691 
2692         /* Otherwise, the condition is an assertion */
2693 
2694         else
2695           {
2696           int rc;
2697           PCRE2_SPTR asscode = code + LINK_SIZE + 1;
2698           PCRE2_SPTR endasscode = asscode + GET(asscode, 1);
2699 
2700           while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2701 
2702           rc = internal_dfa_match(
2703             mb,                                   /* fixed match data */
2704             asscode,                              /* this subexpression's code */
2705             ptr,                                  /* where we currently are */
2706             (PCRE2_SIZE)(ptr - start_subject),    /* start offset */
2707             local_offsets,                        /* offset vector */
2708             sizeof(local_offsets)/sizeof(PCRE2_SIZE), /* size of same */
2709             local_workspace,                      /* workspace vector */
2710             sizeof(local_workspace)/sizeof(int),  /* size of same */
2711             rlevel);                              /* function recursion level */
2712 
2713           if (rc == PCRE2_ERROR_DFA_UITEM) return rc;
2714           if ((rc >= 0) ==
2715                 (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2716             { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2717           else
2718             { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2719           }
2720         }
2721       break;
2722 
2723       /*-----------------------------------------------------------------*/
2724       case OP_RECURSE:
2725         {
2726         dfa_recursion_info *ri;
2727         PCRE2_SIZE local_offsets[1000];
2728         int local_workspace[1000];
2729         PCRE2_SPTR callpat = start_code + GET(code, 1);
2730         uint32_t recno = (callpat == mb->start_code)? 0 :
2731           GET2(callpat, 1 + LINK_SIZE);
2732         int rc;
2733 
2734         /* Check for repeating a recursion without advancing the subject
2735         pointer. This should catch convoluted mutual recursions. (Some simple
2736         cases are caught at compile time.) */
2737 
2738         for (ri = mb->recursive; ri != NULL; ri = ri->prevrec)
2739           if (recno == ri->group_num && ptr == ri->subject_position)
2740             return PCRE2_ERROR_RECURSELOOP;
2741 
2742         /* Remember this recursion and where we started it so as to
2743         catch infinite loops. */
2744 
2745         new_recursive.group_num = recno;
2746         new_recursive.subject_position = ptr;
2747         new_recursive.prevrec = mb->recursive;
2748         mb->recursive = &new_recursive;
2749 
2750         rc = internal_dfa_match(
2751           mb,                                   /* fixed match data */
2752           callpat,                              /* this subexpression's code */
2753           ptr,                                  /* where we currently are */
2754           (PCRE2_SIZE)(ptr - start_subject),    /* start offset */
2755           local_offsets,                        /* offset vector */
2756           sizeof(local_offsets)/sizeof(PCRE2_SIZE), /* size of same */
2757           local_workspace,                      /* workspace vector */
2758           sizeof(local_workspace)/sizeof(int),  /* size of same */
2759           rlevel);                              /* function recursion level */
2760 
2761         mb->recursive = new_recursive.prevrec;  /* Done this recursion */
2762 
2763         /* Ran out of internal offsets */
2764 
2765         if (rc == 0) return PCRE2_ERROR_DFA_RECURSE;
2766 
2767         /* For each successful matched substring, set up the next state with a
2768         count of characters to skip before trying it. Note that the count is in
2769         characters, not bytes. */
2770 
2771         if (rc > 0)
2772           {
2773           for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2774             {
2775             PCRE2_SIZE charcount = local_offsets[rc+1] - local_offsets[rc];
2776 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
2777             if (utf)
2778               {
2779               PCRE2_SPTR p = start_subject + local_offsets[rc];
2780               PCRE2_SPTR pp = start_subject + local_offsets[rc+1];
2781               while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--;
2782               }
2783 #endif
2784             if (charcount > 0)
2785               {
2786               ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0,
2787                 (int)(charcount - 1));
2788               }
2789             else
2790               {
2791               ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2792               }
2793             }
2794           }
2795         else if (rc != PCRE2_ERROR_NOMATCH) return rc;
2796         }
2797       break;
2798 
2799       /*-----------------------------------------------------------------*/
2800       case OP_BRAPOS:
2801       case OP_SBRAPOS:
2802       case OP_CBRAPOS:
2803       case OP_SCBRAPOS:
2804       case OP_BRAPOSZERO:
2805         {
2806         PCRE2_SIZE charcount, matched_count;
2807         PCRE2_SPTR local_ptr = ptr;
2808         BOOL allow_zero;
2809 
2810         if (codevalue == OP_BRAPOSZERO)
2811           {
2812           allow_zero = TRUE;
2813           codevalue = *(++code);  /* Codevalue will be one of above BRAs */
2814           }
2815         else allow_zero = FALSE;
2816 
2817         /* Loop to match the subpattern as many times as possible as if it were
2818         a complete pattern. */
2819 
2820         for (matched_count = 0;; matched_count++)
2821           {
2822           PCRE2_SIZE local_offsets[2];
2823           int local_workspace[1000];
2824 
2825           int rc = internal_dfa_match(
2826             mb,                                   /* fixed match data */
2827             code,                                 /* this subexpression's code */
2828             local_ptr,                            /* where we currently are */
2829             (PCRE2_SIZE)(ptr - start_subject),    /* start offset */
2830             local_offsets,                        /* offset vector */
2831             sizeof(local_offsets)/sizeof(PCRE2_SIZE), /* size of same */
2832             local_workspace,                      /* workspace vector */
2833             sizeof(local_workspace)/sizeof(int),  /* size of same */
2834             rlevel);                              /* function recursion level */
2835 
2836           /* Failed to match */
2837 
2838           if (rc < 0)
2839             {
2840             if (rc != PCRE2_ERROR_NOMATCH) return rc;
2841             break;
2842             }
2843 
2844           /* Matched: break the loop if zero characters matched. */
2845 
2846           charcount = local_offsets[1] - local_offsets[0];
2847           if (charcount == 0) break;
2848           local_ptr += charcount;    /* Advance temporary position ptr */
2849           }
2850 
2851         /* At this point we have matched the subpattern matched_count
2852         times, and local_ptr is pointing to the character after the end of the
2853         last match. */
2854 
2855         if (matched_count > 0 || allow_zero)
2856           {
2857           PCRE2_SPTR end_subpattern = code;
2858           int next_state_offset;
2859 
2860           do { end_subpattern += GET(end_subpattern, 1); }
2861             while (*end_subpattern == OP_ALT);
2862           next_state_offset =
2863             (int)(end_subpattern - start_code + LINK_SIZE + 1);
2864 
2865           /* Optimization: if there are no more active states, and there
2866           are no new states yet set up, then skip over the subject string
2867           right here, to save looping. Otherwise, set up the new state to swing
2868           into action when the end of the matched substring is reached. */
2869 
2870           if (i + 1 >= active_count && new_count == 0)
2871             {
2872             ptr = local_ptr;
2873             clen = 0;
2874             ADD_NEW(next_state_offset, 0);
2875             }
2876           else
2877             {
2878             PCRE2_SPTR p = ptr;
2879             PCRE2_SPTR pp = local_ptr;
2880             charcount = (PCRE2_SIZE)(pp - p);
2881 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
2882             if (utf) while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--;
2883 #endif
2884             ADD_NEW_DATA(-next_state_offset, 0, (int)(charcount - 1));
2885             }
2886           }
2887         }
2888       break;
2889 
2890       /*-----------------------------------------------------------------*/
2891       case OP_ONCE:
2892       case OP_ONCE_NC:
2893         {
2894         PCRE2_SIZE local_offsets[2];
2895         int local_workspace[1000];
2896 
2897         int rc = internal_dfa_match(
2898           mb,                                   /* fixed match data */
2899           code,                                 /* this subexpression's code */
2900           ptr,                                  /* where we currently are */
2901           (PCRE2_SIZE)(ptr - start_subject),    /* start offset */
2902           local_offsets,                        /* offset vector */
2903           sizeof(local_offsets)/sizeof(PCRE2_SIZE), /* size of same */
2904           local_workspace,                      /* workspace vector */
2905           sizeof(local_workspace)/sizeof(int),  /* size of same */
2906           rlevel);                              /* function recursion level */
2907 
2908         if (rc >= 0)
2909           {
2910           PCRE2_SPTR end_subpattern = code;
2911           PCRE2_SIZE charcount = local_offsets[1] - local_offsets[0];
2912           int next_state_offset, repeat_state_offset;
2913 
2914           do { end_subpattern += GET(end_subpattern, 1); }
2915             while (*end_subpattern == OP_ALT);
2916           next_state_offset =
2917             (int)(end_subpattern - start_code + LINK_SIZE + 1);
2918 
2919           /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2920           arrange for the repeat state also to be added to the relevant list.
2921           Calculate the offset, or set -1 for no repeat. */
2922 
2923           repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2924                                  *end_subpattern == OP_KETRMIN)?
2925             (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1;
2926 
2927           /* If we have matched an empty string, add the next state at the
2928           current character pointer. This is important so that the duplicate
2929           checking kicks in, which is what breaks infinite loops that match an
2930           empty string. */
2931 
2932           if (charcount == 0)
2933             {
2934             ADD_ACTIVE(next_state_offset, 0);
2935             }
2936 
2937           /* Optimization: if there are no more active states, and there
2938           are no new states yet set up, then skip over the subject string
2939           right here, to save looping. Otherwise, set up the new state to swing
2940           into action when the end of the matched substring is reached. */
2941 
2942           else if (i + 1 >= active_count && new_count == 0)
2943             {
2944             ptr += charcount;
2945             clen = 0;
2946             ADD_NEW(next_state_offset, 0);
2947 
2948             /* If we are adding a repeat state at the new character position,
2949             we must fudge things so that it is the only current state.
2950             Otherwise, it might be a duplicate of one we processed before, and
2951             that would cause it to be skipped. */
2952 
2953             if (repeat_state_offset >= 0)
2954               {
2955               next_active_state = active_states;
2956               active_count = 0;
2957               i = -1;
2958               ADD_ACTIVE(repeat_state_offset, 0);
2959               }
2960             }
2961           else
2962             {
2963 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
2964             if (utf)
2965               {
2966               PCRE2_SPTR p = start_subject + local_offsets[0];
2967               PCRE2_SPTR pp = start_subject + local_offsets[1];
2968               while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--;
2969               }
2970 #endif
2971             ADD_NEW_DATA(-next_state_offset, 0, (int)(charcount - 1));
2972             if (repeat_state_offset >= 0)
2973               { ADD_NEW_DATA(-repeat_state_offset, 0, (int)(charcount - 1)); }
2974             }
2975           }
2976         else if (rc != PCRE2_ERROR_NOMATCH) return rc;
2977         }
2978       break;
2979 
2980 
2981 /* ========================================================================== */
2982       /* Handle callouts */
2983 
2984       case OP_CALLOUT:
2985       case OP_CALLOUT_STR:
2986         {
2987         unsigned int callout_length = (*code == OP_CALLOUT)
2988             ? PRIV(OP_lengths)[OP_CALLOUT] : GET(code, 1 + 2*LINK_SIZE);
2989         rrc = 0;
2990 
2991         if (mb->callout != NULL)
2992           {
2993           pcre2_callout_block cb;
2994           cb.version          = 1;
2995           cb.capture_top      = 1;
2996           cb.capture_last     = 0;
2997           cb.offset_vector    = offsets;
2998           cb.mark             = NULL;   /* No (*MARK) support */
2999           cb.subject          = start_subject;
3000           cb.subject_length   = (PCRE2_SIZE)(end_subject - start_subject);
3001           cb.start_match      = (PCRE2_SIZE)(current_subject - start_subject);
3002           cb.current_position = (PCRE2_SIZE)(ptr - start_subject);
3003           cb.pattern_position = GET(code, 1);
3004           cb.next_item_length = GET(code, 1 + LINK_SIZE);
3005 
3006           if (*code == OP_CALLOUT)
3007             {
3008             cb.callout_number = code[1 + 2*LINK_SIZE];
3009             cb.callout_string_offset = 0;
3010             cb.callout_string = NULL;
3011             cb.callout_string_length = 0;
3012             }
3013           else
3014             {
3015             cb.callout_number = 0;
3016             cb.callout_string_offset = GET(code, 1 + 3*LINK_SIZE);
3017             cb.callout_string = code + (1 + 4*LINK_SIZE) + 1;
3018             cb.callout_string_length =
3019               callout_length - (1 + 4*LINK_SIZE) - 2;
3020             }
3021 
3022           if ((rrc = (mb->callout)(&cb, mb->callout_data)) < 0)
3023             return rrc;   /* Abandon */
3024           }
3025         if (rrc == 0)
3026           { ADD_ACTIVE(state_offset + (int)callout_length, 0); }
3027         }
3028       break;
3029 
3030 
3031 /* ========================================================================== */
3032       default:        /* Unsupported opcode */
3033       return PCRE2_ERROR_DFA_UITEM;
3034       }
3035 
3036     NEXT_ACTIVE_STATE: continue;
3037 
3038     }      /* End of loop scanning active states */
3039 
3040   /* We have finished the processing at the current subject character. If no
3041   new states have been set for the next character, we have found all the
3042   matches that we are going to find. If we are at the top level and partial
3043   matching has been requested, check for appropriate conditions.
3044 
3045   The "forced_fail" variable counts the number of (*F) encountered for the
3046   character. If it is equal to the original active_count (saved in
3047   workspace[1]) it means that (*F) was found on every active state. In this
3048   case we don't want to give a partial match.
3049 
3050   The "could_continue" variable is true if a state could have continued but
3051   for the fact that the end of the subject was reached. */
3052 
3053   if (new_count <= 0)
3054     {
3055     if (rlevel == 1 &&                               /* Top level, and */
3056         could_continue &&                            /* Some could go on, and */
3057         forced_fail != workspace[1] &&               /* Not all forced fail & */
3058         (                                            /* either... */
3059         (mb->moptions & PCRE2_PARTIAL_HARD) != 0      /* Hard partial */
3060         ||                                           /* or... */
3061         ((mb->moptions & PCRE2_PARTIAL_SOFT) != 0 &&  /* Soft partial and */
3062          match_count < 0)                            /* no matches */
3063         ) &&                                         /* And... */
3064         (
3065         partial_newline ||                           /* Either partial NL */
3066           (                                          /* or ... */
3067           ptr >= end_subject &&                /* End of subject and */
3068           ptr > mb->start_used_ptr)            /* Inspected non-empty string */
3069           )
3070         )
3071       match_count = PCRE2_ERROR_PARTIAL;
3072     break;        /* In effect, "return", but see the comment below */
3073     }
3074 
3075   /* One or more states are active for the next character. */
3076 
3077   ptr += clen;    /* Advance to next subject character */
3078   }               /* Loop to move along the subject string */
3079 
3080 /* Control gets here from "break" a few lines above. We do it this way because
3081 if we use "return" above, we have compiler trouble. Some compilers warn if
3082 there's nothing here because they think the function doesn't return a value. On
3083 the other hand, if we put a dummy statement here, some more clever compilers
3084 complain that it can't be reached. Sigh. */
3085 
3086 return match_count;
3087 }
3088 
3089 
3090 
3091 /*************************************************
3092 *     Match a pattern using the DFA algorithm    *
3093 *************************************************/
3094 
3095 /* This function matches a compiled pattern to a subject string, using the
3096 alternate matching algorithm that finds all matches at once.
3097 
3098 Arguments:
3099   code          points to the compiled pattern
3100   subject       subject string
3101   length        length of subject string
3102   start_offset  where to start matching in the subject
3103   options       option bits
3104   match_data    points to a match data structure
3105   mcontext      points to a match context
3106   workspace     pointer to workspace
3107   wscount       size of workspace
3108 
3109 Returns:        > 0 => number of match offset pairs placed in offsets
3110                 = 0 => offsets overflowed; longest matches are present
3111                  -1 => failed to match
3112                < -1 => some kind of unexpected problem
3113 */
3114 
3115 PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
pcre2_dfa_match(const pcre2_code * code,PCRE2_SPTR subject,PCRE2_SIZE length,PCRE2_SIZE start_offset,uint32_t options,pcre2_match_data * match_data,pcre2_match_context * mcontext,int * workspace,size_t wscount)3116 pcre2_dfa_match(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length,
3117   PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data,
3118   pcre2_match_context *mcontext, int *workspace, size_t wscount)
3119 {
3120 const pcre2_real_code *re = (const pcre2_real_code *)code;
3121 
3122 PCRE2_SPTR start_match;
3123 PCRE2_SPTR end_subject;
3124 PCRE2_SPTR bumpalong_limit;
3125 PCRE2_SPTR req_cu_ptr;
3126 
3127 BOOL utf, anchored, startline, firstline;
3128 
3129 BOOL has_first_cu = FALSE;
3130 BOOL has_req_cu = FALSE;
3131 PCRE2_UCHAR first_cu = 0;
3132 PCRE2_UCHAR first_cu2 = 0;
3133 PCRE2_UCHAR req_cu = 0;
3134 PCRE2_UCHAR req_cu2 = 0;
3135 
3136 const uint8_t *start_bits = NULL;
3137 
3138 /* We need to have mb pointing to a match block, because the IS_NEWLINE macro
3139 is used below, and it expects NLBLOCK to be defined as a pointer. */
3140 
3141 dfa_match_block actual_match_block;
3142 dfa_match_block *mb = &actual_match_block;
3143 
3144 /* A length equal to PCRE2_ZERO_TERMINATED implies a zero-terminated
3145 subject string. */
3146 
3147 if (length == PCRE2_ZERO_TERMINATED) length = PRIV(strlen)(subject);
3148 
3149 /* Plausibility checks */
3150 
3151 if ((options & ~PUBLIC_DFA_MATCH_OPTIONS) != 0) return PCRE2_ERROR_BADOPTION;
3152 if (re == NULL || subject == NULL || workspace == NULL || match_data == NULL)
3153   return PCRE2_ERROR_NULL;
3154 if (wscount < 20) return PCRE2_ERROR_DFA_WSSIZE;
3155 if (start_offset > length) return PCRE2_ERROR_BADOFFSET;
3156 
3157 /* Check that the first field in the block is the magic number. If it is not,
3158 return with PCRE2_ERROR_BADMAGIC. */
3159 
3160 if (re->magic_number != MAGIC_NUMBER) return PCRE2_ERROR_BADMAGIC;
3161 
3162 /* Check the code unit width. */
3163 
3164 if ((re->flags & PCRE2_MODE_MASK) != PCRE2_CODE_UNIT_WIDTH/8)
3165   return PCRE2_ERROR_BADMODE;
3166 
3167 /* PCRE2_NOTEMPTY and PCRE2_NOTEMPTY_ATSTART are match-time flags in the
3168 options variable for this function. Users of PCRE2 who are not calling the
3169 function directly would like to have a way of setting these flags, in the same
3170 way that they can set pcre2_compile() flags like PCRE2_NO_AUTOPOSSESS with
3171 constructions like (*NO_AUTOPOSSESS). To enable this, (*NOTEMPTY) and
3172 (*NOTEMPTY_ATSTART) set bits in the pattern's "flags" field which can now be
3173 transferred to the options for this function. The bits are guaranteed to be
3174 adjacent, but do not have the same values. This bit of Boolean trickery assumes
3175 that the match-time bits are not more significant than the flag bits. If by
3176 accident this is not the case, a compile-time division by zero error will
3177 occur. */
3178 
3179 #define FF (PCRE2_NOTEMPTY_SET|PCRE2_NE_ATST_SET)
3180 #define OO (PCRE2_NOTEMPTY|PCRE2_NOTEMPTY_ATSTART)
3181 options |= (re->flags & FF) / ((FF & (~FF+1)) / (OO & (~OO+1)));
3182 #undef FF
3183 #undef OO
3184 
3185 /* If restarting after a partial match, do some sanity checks on the contents
3186 of the workspace. */
3187 
3188 if ((options & PCRE2_DFA_RESTART) != 0)
3189   {
3190   if ((workspace[0] & (-2)) != 0 || workspace[1] < 1 ||
3191     workspace[1] > (int)((wscount - 2)/INTS_PER_STATEBLOCK))
3192       return PCRE2_ERROR_DFA_BADRESTART;
3193   }
3194 
3195 /* Set some local values */
3196 
3197 utf = (re->overall_options & PCRE2_UTF) != 0;
3198 start_match = subject + start_offset;
3199 end_subject = subject + length;
3200 req_cu_ptr = start_match - 1;
3201 anchored = (options & (PCRE2_ANCHORED|PCRE2_DFA_RESTART)) != 0 ||
3202   (re->overall_options & PCRE2_ANCHORED) != 0;
3203 
3204 /* The "must be at the start of a line" flags are used in a loop when finding
3205 where to start. */
3206 
3207 startline = (re->flags & PCRE2_STARTLINE) != 0;
3208 firstline = (re->overall_options & PCRE2_FIRSTLINE) != 0;
3209 bumpalong_limit = end_subject;
3210 
3211 /* Get data from the match context, if present, and fill in the fields in the
3212 match block. It is an error to set an offset limit without setting the flag at
3213 compile time. */
3214 
3215 if (mcontext == NULL)
3216   {
3217   mb->callout = NULL;
3218   mb->memctl = re->memctl;
3219   }
3220 else
3221   {
3222   if (mcontext->offset_limit != PCRE2_UNSET)
3223     {
3224     if ((re->overall_options & PCRE2_USE_OFFSET_LIMIT) == 0)
3225       return PCRE2_ERROR_BADOFFSETLIMIT;
3226     bumpalong_limit = subject + mcontext->offset_limit;
3227     }
3228   mb->callout = mcontext->callout;
3229   mb->callout_data = mcontext->callout_data;
3230   mb->memctl = mcontext->memctl;
3231   }
3232 
3233 mb->start_code = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)) +
3234   re->name_count * re->name_entry_size;
3235 mb->tables = re->tables;
3236 mb->start_subject = subject;
3237 mb->end_subject = end_subject;
3238 mb->start_offset = start_offset;
3239 mb->moptions = options;
3240 mb->poptions = re->overall_options;
3241 
3242 /* Process the \R and newline settings. */
3243 
3244 mb->bsr_convention = re->bsr_convention;
3245 mb->nltype = NLTYPE_FIXED;
3246 switch(re->newline_convention)
3247   {
3248   case PCRE2_NEWLINE_CR:
3249   mb->nllen = 1;
3250   mb->nl[0] = CHAR_CR;
3251   break;
3252 
3253   case PCRE2_NEWLINE_LF:
3254   mb->nllen = 1;
3255   mb->nl[0] = CHAR_NL;
3256   break;
3257 
3258   case PCRE2_NEWLINE_CRLF:
3259   mb->nllen = 2;
3260   mb->nl[0] = CHAR_CR;
3261   mb->nl[1] = CHAR_NL;
3262   break;
3263 
3264   case PCRE2_NEWLINE_ANY:
3265   mb->nltype = NLTYPE_ANY;
3266   break;
3267 
3268   case PCRE2_NEWLINE_ANYCRLF:
3269   mb->nltype = NLTYPE_ANYCRLF;
3270   break;
3271 
3272   default: return PCRE2_ERROR_INTERNAL;
3273   }
3274 
3275 /* Check a UTF string for validity if required. For 8-bit and 16-bit strings,
3276 we must also check that a starting offset does not point into the middle of a
3277 multiunit character. We check only the portion of the subject that is going to
3278 be inspected during matching - from the offset minus the maximum lookbehind
3279 to the given length. This saves time when a small part of a large subject is
3280 being matched by the use of a starting offset. Note that the maximum lookbehind
3281 is a number of characters, not code units. */
3282 
3283 #ifdef SUPPORT_UNICODE
3284 if (utf && (options & PCRE2_NO_UTF_CHECK) == 0)
3285   {
3286   PCRE2_SPTR check_subject = start_match;  /* start_match includes offset */
3287 
3288   if (start_offset > 0)
3289     {
3290 #if PCRE2_CODE_UNIT_WIDTH != 32
3291     unsigned int i;
3292     if (start_match < end_subject && NOT_FIRSTCU(*start_match))
3293       return PCRE2_ERROR_BADUTFOFFSET;
3294     for (i = re->max_lookbehind; i > 0 && check_subject > subject; i--)
3295       {
3296       check_subject--;
3297       while (check_subject > subject &&
3298 #if PCRE2_CODE_UNIT_WIDTH == 8
3299       (*check_subject & 0xc0) == 0x80)
3300 #else  /* 16-bit */
3301       (*check_subject & 0xfc00) == 0xdc00)
3302 #endif /* PCRE2_CODE_UNIT_WIDTH == 8 */
3303         check_subject--;
3304       }
3305 #else   /* In the 32-bit library, one code unit equals one character. */
3306     check_subject -= re->max_lookbehind;
3307     if (check_subject < subject) check_subject = subject;
3308 #endif  /* PCRE2_CODE_UNIT_WIDTH != 32 */
3309     }
3310 
3311   /* Validate the relevant portion of the subject. After an error, adjust the
3312   offset to be an absolute offset in the whole string. */
3313 
3314   match_data->rc = PRIV(valid_utf)(check_subject,
3315     length - (PCRE2_SIZE)(check_subject - subject), &(match_data->startchar));
3316   if (match_data->rc != 0)
3317     {
3318     match_data->startchar += (PCRE2_SIZE)(check_subject - subject);
3319     return match_data->rc;
3320     }
3321   }
3322 #endif  /* SUPPORT_UNICODE */
3323 
3324 /* Set up the first code unit to match, if available. The first_codeunit value
3325 is never set for an anchored regular expression, but the anchoring may be
3326 forced at run time, so we have to test for anchoring. The first code unit may
3327 be unset for an unanchored pattern, of course. If there's no first code unit
3328 there may be a bitmap of possible first characters. */
3329 
3330 if (!anchored)
3331   {
3332   if ((re->flags & PCRE2_FIRSTSET) != 0)
3333     {
3334     has_first_cu = TRUE;
3335     first_cu = first_cu2 = (PCRE2_UCHAR)(re->first_codeunit);
3336     if ((re->flags & PCRE2_FIRSTCASELESS) != 0)
3337       {
3338       first_cu2 = TABLE_GET(first_cu, mb->tables + fcc_offset, first_cu);
3339 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
3340       if (utf && first_cu > 127)
3341         first_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(first_cu);
3342 #endif
3343       }
3344     }
3345   else
3346     if (!startline && (re->flags & PCRE2_FIRSTMAPSET) != 0)
3347       start_bits = re->start_bitmap;
3348   }
3349 
3350 /* For anchored or unanchored matches, there may be a "last known required
3351 character" set. */
3352 
3353 if ((re->flags & PCRE2_LASTSET) != 0)
3354   {
3355   has_req_cu = TRUE;
3356   req_cu = req_cu2 = (PCRE2_UCHAR)(re->last_codeunit);
3357   if ((re->flags & PCRE2_LASTCASELESS) != 0)
3358     {
3359     req_cu2 = TABLE_GET(req_cu, mb->tables + fcc_offset, req_cu);
3360 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
3361     if (utf && req_cu > 127) req_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(req_cu);
3362 #endif
3363     }
3364   }
3365 
3366 /* Fill in fields that are always returned in the match data. */
3367 
3368 match_data->code = re;
3369 match_data->subject = subject;
3370 match_data->mark = NULL;
3371 match_data->matchedby = PCRE2_MATCHEDBY_DFA_INTERPRETER;
3372 
3373 /* Call the main matching function, looping for a non-anchored regex after a
3374 failed match. If not restarting, perform certain optimizations at the start of
3375 a match. */
3376 
3377 for (;;)
3378   {
3379   int rc;
3380 
3381   /* ----------------- Start of match optimizations ---------------- */
3382 
3383   /* There are some optimizations that avoid running the match if a known
3384   starting point is not found, or if a known later code unit is not present.
3385   However, there is an option (settable at compile time) that disables
3386   these, for testing and for ensuring that all callouts do actually occur.
3387   The optimizations must also be avoided when restarting a DFA match. */
3388 
3389   if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0 &&
3390       (options & PCRE2_DFA_RESTART) == 0)
3391     {
3392     PCRE2_SPTR save_end_subject = end_subject;
3393 
3394     /* If firstline is TRUE, the start of the match is constrained to the first
3395     line of a multiline string. That is, the match must be before or at the
3396     first newline. Implement this by temporarily adjusting end_subject so that
3397     we stop the optimization scans at a newline. If the match fails at the
3398     newline, later code breaks this loop. */
3399 
3400     if (firstline)
3401       {
3402       PCRE2_SPTR t = start_match;
3403 #ifdef SUPPORT_UNICODE
3404       if (utf)
3405         {
3406         while (t < mb->end_subject && !IS_NEWLINE(t))
3407           {
3408           t++;
3409           ACROSSCHAR(t < end_subject, *t, t++);
3410           }
3411         }
3412       else
3413 #endif
3414       while (t < mb->end_subject && !IS_NEWLINE(t)) t++;
3415       end_subject = t;
3416       }
3417 
3418     /* Advance to a unique first code unit if there is one. */
3419 
3420     if (has_first_cu)
3421       {
3422       PCRE2_UCHAR smc;
3423       if (first_cu != first_cu2)
3424         while (start_match < end_subject &&
3425           (smc = UCHAR21TEST(start_match)) != first_cu && smc != first_cu2)
3426           start_match++;
3427       else
3428         while (start_match < end_subject && UCHAR21TEST(start_match) != first_cu)
3429           start_match++;
3430       }
3431 
3432     /* Or to just after a linebreak for a multiline match */
3433 
3434     else if (startline)
3435       {
3436       if (start_match > mb->start_subject + start_offset)
3437         {
3438 #ifdef SUPPORT_UNICODE
3439         if (utf)
3440           {
3441           while (start_match < end_subject && !WAS_NEWLINE(start_match))
3442             {
3443             start_match++;
3444             ACROSSCHAR(start_match < end_subject, *start_match,
3445               start_match++);
3446             }
3447           }
3448         else
3449 #endif
3450         while (start_match < end_subject && !WAS_NEWLINE(start_match))
3451           start_match++;
3452 
3453         /* If we have just passed a CR and the newline option is ANY or
3454         ANYCRLF, and we are now at a LF, advance the match position by one more
3455         code unit. */
3456 
3457         if (start_match[-1] == CHAR_CR &&
3458              (mb->nltype == NLTYPE_ANY || mb->nltype == NLTYPE_ANYCRLF) &&
3459              start_match < end_subject &&
3460              UCHAR21TEST(start_match) == CHAR_NL)
3461           start_match++;
3462         }
3463       }
3464 
3465     /* Or to a non-unique first code unit if any have been identified. The
3466     bitmap contains only 256 bits. When code units are 16 or 32 bits wide, all
3467     code units greater than 254 set the 255 bit. */
3468 
3469     else if (start_bits != NULL)
3470       {
3471       while (start_match < end_subject)
3472         {
3473         register uint32_t c = UCHAR21TEST(start_match);
3474 #if PCRE2_CODE_UNIT_WIDTH != 8
3475         if (c > 255) c = 255;
3476 #endif
3477         if ((start_bits[c/8] & (1 << (c&7))) != 0) break;
3478         start_match++;
3479         }
3480       }
3481 
3482     /* Restore fudged end_subject */
3483 
3484     end_subject = save_end_subject;
3485 
3486     /* The following two optimizations are disabled for partial matching. */
3487 
3488     if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0)
3489       {
3490       /* The minimum matching length is a lower bound; it is possible that no
3491       string of exactly that length matches the pattern. Although the value
3492       is, strictly, in characters, we treat it as code units to avoid
3493       spending too much time in this optimization. */
3494 
3495       if (end_subject - start_match < re->minlength) return PCRE2_ERROR_NOMATCH;
3496 
3497       /* If req_cu is set, we know that that code unit must appear in the
3498       subject for the match to succeed. If the first code unit is set, req_cu
3499       must be later in the subject; otherwise the test starts at the match
3500       point. This optimization can save a huge amount of backtracking in
3501       patterns with nested unlimited repeats that aren't going to match.
3502       Writing separate code for cased/caseless versions makes it go faster, as
3503       does using an autoincrement and backing off on a match.
3504 
3505       HOWEVER: when the subject string is very, very long, searching to its end
3506       can take a long time, and give bad performance on quite ordinary
3507       patterns. This showed up when somebody was matching something like
3508       /^\d+C/ on a 32-megabyte string... so we don't do this when the string is
3509       sufficiently long. */
3510 
3511       if (has_req_cu && end_subject - start_match < REQ_CU_MAX)
3512         {
3513         register PCRE2_SPTR p = start_match + (has_first_cu? 1:0);
3514 
3515         /* We don't need to repeat the search if we haven't yet reached the
3516         place we found it at last time. */
3517 
3518         if (p > req_cu_ptr)
3519           {
3520           if (req_cu != req_cu2)
3521             {
3522             while (p < end_subject)
3523               {
3524               register uint32_t pp = UCHAR21INCTEST(p);
3525               if (pp == req_cu || pp == req_cu2) { p--; break; }
3526               }
3527             }
3528           else
3529             {
3530             while (p < end_subject)
3531               {
3532               if (UCHAR21INCTEST(p) == req_cu) { p--; break; }
3533               }
3534             }
3535 
3536           /* If we can't find the required code unit, break the matching loop,
3537           forcing a match failure. */
3538 
3539           if (p >= end_subject) break;
3540 
3541           /* If we have found the required code unit, save the point where we
3542           found it, so that we don't search again next time round the loop if
3543           the start hasn't passed this code unit yet. */
3544 
3545           req_cu_ptr = p;
3546           }
3547         }
3548       }
3549     }
3550 
3551   /* ------------ End of start of match optimizations ------------ */
3552 
3553   /* Give no match if we have passed the bumpalong limit. */
3554 
3555   if (start_match > bumpalong_limit) break;
3556 
3557   /* OK, now we can do the business */
3558 
3559   mb->start_used_ptr = start_match;
3560   mb->last_used_ptr = start_match;
3561   mb->recursive = NULL;
3562 
3563   rc = internal_dfa_match(
3564     mb,                           /* fixed match data */
3565     mb->start_code,               /* this subexpression's code */
3566     start_match,                  /* where we currently are */
3567     start_offset,                 /* start offset in subject */
3568     match_data->ovector,          /* offset vector */
3569     (uint32_t)match_data->oveccount * 2,  /* actual size of same */
3570     workspace,                    /* workspace vector */
3571     (int)wscount,                 /* size of same */
3572     0);                           /* function recurse level */
3573 
3574   /* Anything other than "no match" means we are done, always; otherwise, carry
3575   on only if not anchored. */
3576 
3577   if (rc != PCRE2_ERROR_NOMATCH || anchored)
3578     {
3579     if (rc == PCRE2_ERROR_PARTIAL && match_data->oveccount > 0)
3580       {
3581       match_data->ovector[0] = (PCRE2_SIZE)(start_match - subject);
3582       match_data->ovector[1] = (PCRE2_SIZE)(end_subject - subject);
3583       }
3584     match_data->leftchar = (PCRE2_SIZE)(mb->start_used_ptr - subject);
3585     match_data->rightchar = (PCRE2_SIZE)( mb->last_used_ptr - subject);
3586     match_data->startchar = (PCRE2_SIZE)(start_match - subject);
3587     match_data->rc = rc;
3588     return rc;
3589     }
3590 
3591   /* Advance to the next subject character unless we are at the end of a line
3592   and firstline is set. */
3593 
3594   if (firstline && IS_NEWLINE(start_match)) break;
3595   start_match++;
3596 #ifdef SUPPORT_UNICODE
3597   if (utf)
3598     {
3599     ACROSSCHAR(start_match < end_subject, *start_match,
3600       start_match++);
3601     }
3602 #endif
3603   if (start_match > end_subject) break;
3604 
3605   /* If we have just passed a CR and we are now at a LF, and the pattern does
3606   not contain any explicit matches for \r or \n, and the newline option is CRLF
3607   or ANY or ANYCRLF, advance the match position by one more character. */
3608 
3609   if (UCHAR21TEST(start_match - 1) == CHAR_CR &&
3610       start_match < end_subject &&
3611       UCHAR21TEST(start_match) == CHAR_NL &&
3612       (re->flags & PCRE2_HASCRORLF) == 0 &&
3613         (mb->nltype == NLTYPE_ANY ||
3614          mb->nltype == NLTYPE_ANYCRLF ||
3615          mb->nllen == 2))
3616     start_match++;
3617 
3618   }   /* "Bumpalong" loop */
3619 
3620 
3621 return PCRE2_ERROR_NOMATCH;
3622 }
3623 
3624 /* End of pcre2_dfa_match.c */
3625