• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*************************************************
2 *      Perl-Compatible Regular Expressions       *
3 *************************************************/
4 
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7 
8                        Written by Philip Hazel
9      Original API code Copyright (c) 1997-2012 University of Cambridge
10           New API code Copyright (c) 2016-2019 University of Cambridge
11 
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
15 
16     * Redistributions of source code must retain the above copyright notice,
17       this list of conditions and the following disclaimer.
18 
19     * Redistributions in binary form must reproduce the above copyright
20       notice, this list of conditions and the following disclaimer in the
21       documentation and/or other materials provided with the distribution.
22 
23     * Neither the name of the University of Cambridge nor the names of its
24       contributors may be used to endorse or promote products derived from
25       this software without specific prior written permission.
26 
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
39 */
40 
41 
42 /* This module contains the external function pcre2_dfa_match(), which is an
43 alternative matching function that uses a sort of DFA algorithm (not a true
44 FSM). This is NOT Perl-compatible, but it has advantages in certain
45 applications. */
46 
47 
48 /* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
49 the performance of his patterns greatly. I could not use it as it stood, as it
50 was not thread safe, and made assumptions about pattern sizes. Also, it caused
51 test 7 to loop, and test 9 to crash with a segfault.
52 
53 The issue is the check for duplicate states, which is done by a simple linear
54 search up the state list. (Grep for "duplicate" below to find the code.) For
55 many patterns, there will never be many states active at one time, so a simple
56 linear search is fine. In patterns that have many active states, it might be a
57 bottleneck. The suggested code used an indexing scheme to remember which states
58 had previously been used for each character, and avoided the linear search when
59 it knew there was no chance of a duplicate. This was implemented when adding
60 states to the state lists.
61 
62 I wrote some thread-safe, not-limited code to try something similar at the time
63 of checking for duplicates (instead of when adding states), using index vectors
64 on the stack. It did give a 13% improvement with one specially constructed
65 pattern for certain subject strings, but on other strings and on many of the
66 simpler patterns in the test suite it did worse. The major problem, I think,
67 was the extra time to initialize the index. This had to be done for each call
68 of internal_dfa_match(). (The supplied patch used a static vector, initialized
69 only once - I suspect this was the cause of the problems with the tests.)
70 
71 Overall, I concluded that the gains in some cases did not outweigh the losses
72 in others, so I abandoned this code. */
73 
74 
75 #ifdef HAVE_CONFIG_H
76 #include "config.h"
77 #endif
78 
/* Block containing newline information */
#define NLBLOCK mb

/* Field containing processed string start */
#define PSSTART start_subject

/* Field containing processed string end */
#define PSEND end_subject
82 
83 #include "pcre2_internal.h"
84 
/* Mask built from the PCRE2_xxx option bits listed below. NOTE(review):
presumably this is the set of options a caller is allowed to pass to
pcre2_dfa_match(); the check that uses it is not in this part of the file. */

#define PUBLIC_DFA_MATCH_OPTIONS \
  (PCRE2_ANCHORED | \
   PCRE2_ENDANCHORED | \
   PCRE2_NOTBOL | \
   PCRE2_NOTEOL | \
   PCRE2_NOTEMPTY | \
   PCRE2_NOTEMPTY_ATSTART | \
   PCRE2_NO_UTF_CHECK | \
   PCRE2_PARTIAL_HARD | \
   PCRE2_PARTIAL_SOFT | \
   PCRE2_DFA_SHORTEST | \
   PCRE2_DFA_RESTART | \
   PCRE2_COPY_MATCHED_SUBJECT)
90 
91 
92 /*************************************************
93 *      Code parameters and static tables         *
94 *************************************************/
95 
/* Offsets that are added to OP_TYPESTAR and friends to turn them into other
opcodes under special conditions. The blocks are spaced 20 apart, which should
be enough. The resulting values are never stored, so they need not be less
than 256; they are pushed well clear of the normal opcodes. */

#define OP_PROP_EXTRA       300
#define OP_EXTUNI_EXTRA     (OP_PROP_EXTRA + 20)
#define OP_ANYNL_EXTRA      (OP_PROP_EXTRA + 40)
#define OP_HSPACE_EXTRA     (OP_PROP_EXTRA + 60)
#define OP_VSPACE_EXTRA     (OP_PROP_EXTRA + 80)
106 
107 
108 /* This table identifies those opcodes that are followed immediately by a
109 character that is to be tested in some way. This makes it possible to
110 centralize the loading of these characters. In the case of Type * etc, the
111 "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
112 small value. Non-zero values in the table are the offsets from the opcode where
113 the character is to be found. ***NOTE*** If the start of this table is
114 modified, the three tables that follow must also be modified. */
115 
116 static const uint8_t coptable[] = {
117   0,                             /* End                                    */
118   0, 0, 0, 0, 0,                 /* \A, \G, \K, \B, \b                     */
119   0, 0, 0, 0, 0, 0,              /* \D, \d, \S, \s, \W, \w                 */
120   0, 0, 0,                       /* Any, AllAny, Anybyte                   */
121   0, 0,                          /* \P, \p                                 */
122   0, 0, 0, 0, 0,                 /* \R, \H, \h, \V, \v                     */
123   0,                             /* \X                                     */
124   0, 0, 0, 0, 0, 0,              /* \Z, \z, $, $M, ^, ^M                   */
125   1,                             /* Char                                   */
126   1,                             /* Chari                                  */
127   1,                             /* not                                    */
128   1,                             /* noti                                   */
129   /* Positive single-char repeats                                          */
130   1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
131   1+IMM2_SIZE, 1+IMM2_SIZE,      /* upto, minupto                          */
132   1+IMM2_SIZE,                   /* exact                                  */
133   1, 1, 1, 1+IMM2_SIZE,          /* *+, ++, ?+, upto+                      */
134   1, 1, 1, 1, 1, 1,              /* *I, *?I, +I, +?I, ?I, ??I              */
135   1+IMM2_SIZE, 1+IMM2_SIZE,      /* upto I, minupto I                      */
136   1+IMM2_SIZE,                   /* exact I                                */
137   1, 1, 1, 1+IMM2_SIZE,          /* *+I, ++I, ?+I, upto+I                  */
138   /* Negative single-char repeats - only for chars < 256                   */
139   1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
140   1+IMM2_SIZE, 1+IMM2_SIZE,      /* NOT upto, minupto                      */
141   1+IMM2_SIZE,                   /* NOT exact                              */
142   1, 1, 1, 1+IMM2_SIZE,          /* NOT *+, ++, ?+, upto+                  */
143   1, 1, 1, 1, 1, 1,              /* NOT *I, *?I, +I, +?I, ?I, ??I          */
144   1+IMM2_SIZE, 1+IMM2_SIZE,      /* NOT upto I, minupto I                  */
145   1+IMM2_SIZE,                   /* NOT exact I                            */
146   1, 1, 1, 1+IMM2_SIZE,          /* NOT *+I, ++I, ?+I, upto+I              */
147   /* Positive type repeats                                                 */
148   1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
149   1+IMM2_SIZE, 1+IMM2_SIZE,      /* Type upto, minupto                     */
150   1+IMM2_SIZE,                   /* Type exact                             */
151   1, 1, 1, 1+IMM2_SIZE,          /* Type *+, ++, ?+, upto+                 */
152   /* Character class & ref repeats                                         */
153   0, 0, 0, 0, 0, 0,              /* *, *?, +, +?, ?, ??                    */
154   0, 0,                          /* CRRANGE, CRMINRANGE                    */
155   0, 0, 0, 0,                    /* Possessive *+, ++, ?+, CRPOSRANGE      */
156   0,                             /* CLASS                                  */
157   0,                             /* NCLASS                                 */
158   0,                             /* XCLASS - variable length               */
159   0,                             /* REF                                    */
160   0,                             /* REFI                                   */
161   0,                             /* DNREF                                  */
162   0,                             /* DNREFI                                 */
163   0,                             /* RECURSE                                */
164   0,                             /* CALLOUT                                */
165   0,                             /* CALLOUT_STR                            */
166   0,                             /* Alt                                    */
167   0,                             /* Ket                                    */
168   0,                             /* KetRmax                                */
169   0,                             /* KetRmin                                */
170   0,                             /* KetRpos                                */
171   0,                             /* Reverse                                */
172   0,                             /* Assert                                 */
173   0,                             /* Assert not                             */
174   0,                             /* Assert behind                          */
175   0,                             /* Assert behind not                      */
176   0,                             /* ONCE                                   */
177   0,                             /* SCRIPT_RUN                             */
178   0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND       */
179   0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND  */
180   0, 0,                          /* CREF, DNCREF                           */
181   0, 0,                          /* RREF, DNRREF                           */
182   0, 0,                          /* FALSE, TRUE                            */
183   0, 0, 0,                       /* BRAZERO, BRAMINZERO, BRAPOSZERO        */
184   0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG                 */
185   0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG         */
186   0, 0,                          /* COMMIT, COMMIT_ARG                     */
187   0, 0, 0,                       /* FAIL, ACCEPT, ASSERT_ACCEPT            */
188   0, 0, 0                        /* CLOSE, SKIPZERO, DEFINE                */
189 };
190 
/* Table flagging, per opcode, whether the opcode inspects a character (1) or
not (0). It is used to remember the fact that a character could have been
inspected when the end of the subject is reached. ***NOTE*** If the start of
this table is modified, the two tables that follow must also be modified. */

static const uint8_t poptable[] = {
  /* Simple items */
  0,                             /* End                                    */
  0, 0, 0, 1, 1,                 /* \A, \G, \K, \B, \b                     */
  1, 1, 1, 1, 1, 1,              /* \D, \d, \S, \s, \W, \w                 */
  1, 1, 1,                       /* Any, AllAny, Anybyte                   */
  1, 1,                          /* \P, \p                                 */
  1, 1, 1, 1, 1,                 /* \R, \H, \h, \V, \v                     */
  1,                             /* \X                                     */
  0, 0, 0, 0, 0, 0,              /* \Z, \z, $, $M, ^, ^M                   */

  /* Single characters */
  1, 1, 1, 1,                    /* Char, Chari, not, noti                 */

  /* Positive single-char repeats */
  1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
  1, 1, 1,                       /* upto, minupto, exact                   */
  1, 1, 1, 1,                    /* *+, ++, ?+, upto+                      */
  1, 1, 1, 1, 1, 1,              /* *I, *?I, +I, +?I, ?I, ??I              */
  1, 1, 1,                       /* upto I, minupto I, exact I             */
  1, 1, 1, 1,                    /* *+I, ++I, ?+I, upto+I                  */

  /* Negative single-char repeats - only for chars < 256 */
  1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
  1, 1, 1,                       /* NOT upto, minupto, exact               */
  1, 1, 1, 1,                    /* NOT *+, ++, ?+, upto+                  */
  1, 1, 1, 1, 1, 1,              /* NOT *I, *?I, +I, +?I, ?I, ??I          */
  1, 1, 1,                       /* NOT upto I, minupto I, exact I         */
  1, 1, 1, 1,                    /* NOT *+I, ++I, ?+I, upto+I              */

  /* Positive type repeats */
  1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
  1, 1, 1,                       /* Type upto, minupto, exact              */
  1, 1, 1, 1,                    /* Type *+, ++, ?+, upto+                 */

  /* Character class & ref repeats */
  1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
  1, 1,                          /* CRRANGE, CRMINRANGE                    */
  1, 1, 1, 1,                    /* Possessive *+, ++, ?+, CRPOSRANGE      */

  /* Classes */
  1, 1, 1,                       /* CLASS, NCLASS, XCLASS (variable length) */

  /* Everything else */
  0, 0, 0, 0,                    /* REF, REFI, DNREF, DNREFI               */
  0,                             /* RECURSE                                */
  0, 0,                          /* CALLOUT, CALLOUT_STR                   */
  0, 0, 0, 0, 0,                 /* Alt, Ket, KetRmax, KetRmin, KetRpos    */
  0,                             /* Reverse                                */
  0, 0, 0, 0,                    /* Assert, Assert not, Assert behind, Assert behind not */
  0, 0,                          /* ONCE, SCRIPT_RUN                       */
  0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND       */
  0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND  */
  0, 0,                          /* CREF, DNCREF                           */
  0, 0,                          /* RREF, DNRREF                           */
  0, 0,                          /* FALSE, TRUE                            */
  0, 0, 0,                       /* BRAZERO, BRAMINZERO, BRAPOSZERO        */
  0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG                 */
  0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG         */
  0, 0,                          /* COMMIT, COMMIT_ARG                     */
  0, 0, 0,                       /* FAIL, ACCEPT, ASSERT_ACCEPT            */
  0, 0, 0                        /* CLOSE, SKIPZERO, DEFINE                */
};
265 
266 /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
267 and \w */
268 
269 static const uint8_t toptable1[] = {
270   0, 0, 0, 0, 0, 0,
271   ctype_digit, ctype_digit,
272   ctype_space, ctype_space,
273   ctype_word,  ctype_word,
274   0, 0                            /* OP_ANY, OP_ALLANY */
275 };
276 
277 static const uint8_t toptable2[] = {
278   0, 0, 0, 0, 0, 0,
279   ctype_digit, 0,
280   ctype_space, 0,
281   ctype_word,  0,
282   1, 1                            /* OP_ANY, OP_ALLANY */
283 };
284 
285 
286 /* Structure for holding data about a particular state, which is in effect the
287 current data for an active path through the match tree. It must consist
288 entirely of ints because the working vector we are passed, and which we put
289 these structures in, is a vector of ints. */
290 
typedef struct stateblock {
  /* Offset to opcode (-ve has meaning) */
  int offset;
  /* Count for repeats */
  int count;
  /* Some use extra data */
  int data;
} stateblock;

/* Size of one stateblock, expressed as a number of ints. */
#define INTS_PER_STATEBLOCK  ((int)(sizeof(stateblock)/sizeof(int)))
298 
299 
300 /* Before version 10.32 the recursive calls of internal_dfa_match() were passed
301 local working space and output vectors that were created on the stack. This has
302 caused issues for some patterns, especially in small-stack environments such as
303 Windows. A new scheme is now in use which sets up a vector on the stack, but if
304 this is too small, heap memory is used, up to the heap_limit. The main
305 parameters are all numbers of ints because the workspace is a vector of ints.
306 
307 The size of the starting stack vector, DFA_START_RWS_SIZE, is in bytes, and is
308 defined in pcre2_internal.h so as to be available to pcre2test when it is
309 finding the minimum heap requirement for a match. */
310 
/* Number of ints taken up by one PCRE2_SIZE value. */
#define OVEC_UNIT  (sizeof(PCRE2_SIZE)/sizeof(int))

/* Stack vector */
#define RWS_BASE_SIZE   (DFA_START_RWS_SIZE/sizeof(int))

/* Work size for recursion */
#define RWS_RSIZE       1000

/* Ovector for recursion */
#define RWS_OVEC_RSIZE  (1000*OVEC_UNIT)

/* Ovector in other cases */
#define RWS_OVEC_OSIZE  (2*OVEC_UNIT)
317 
/* Header placed at the start of each workspace block. Blocks are linked
through the next field; size and free are both counted in ints. */

typedef struct RWS_anchor {
  struct RWS_anchor *next;
  /* Number of ints */
  uint32_t size;
  /* Number of ints */
  uint32_t free;
} RWS_anchor;

/* Size of the header itself, as a number of ints. */
#define RWS_ANCHOR_SIZE (sizeof(RWS_anchor)/sizeof(int))
327 
328 
329 
330 /*************************************************
331 *               Process a callout                *
332 *************************************************/
333 
334 /* This function is called to perform a callout.
335 
336 Arguments:
337   code              current code pointer
338   offsets           points to current capture offsets
339   current_subject   start of current subject match
340   ptr               current position in subject
341   mb                the match block
342   extracode         extra code offset when called from condition
343   lengthptr         where to return the callout length
344 
345 Returns:            the return from the callout
346 */
347 
348 static int
do_callout(PCRE2_SPTR code,PCRE2_SIZE * offsets,PCRE2_SPTR current_subject,PCRE2_SPTR ptr,dfa_match_block * mb,PCRE2_SIZE extracode,PCRE2_SIZE * lengthptr)349 do_callout(PCRE2_SPTR code, PCRE2_SIZE *offsets, PCRE2_SPTR current_subject,
350   PCRE2_SPTR ptr, dfa_match_block *mb, PCRE2_SIZE extracode,
351   PCRE2_SIZE *lengthptr)
352 {
353 pcre2_callout_block *cb = mb->cb;
354 
355 *lengthptr = (code[extracode] == OP_CALLOUT)?
356   (PCRE2_SIZE)PRIV(OP_lengths)[OP_CALLOUT] :
357   (PCRE2_SIZE)GET(code, 1 + 2*LINK_SIZE + extracode);
358 
359 if (mb->callout == NULL) return 0;    /* No callout provided */
360 
361 /* Fixed fields in the callout block are set once and for all at the start of
362 matching. */
363 
364 cb->offset_vector    = offsets;
365 cb->start_match      = (PCRE2_SIZE)(current_subject - mb->start_subject);
366 cb->current_position = (PCRE2_SIZE)(ptr - mb->start_subject);
367 cb->pattern_position = GET(code, 1 + extracode);
368 cb->next_item_length = GET(code, 1 + LINK_SIZE + extracode);
369 
370 if (code[extracode] == OP_CALLOUT)
371   {
372   cb->callout_number = code[1 + 2*LINK_SIZE + extracode];
373   cb->callout_string_offset = 0;
374   cb->callout_string = NULL;
375   cb->callout_string_length = 0;
376   }
377 else
378   {
379   cb->callout_number = 0;
380   cb->callout_string_offset = GET(code, 1 + 3*LINK_SIZE + extracode);
381   cb->callout_string = code + (1 + 4*LINK_SIZE + extracode) + 1;
382   cb->callout_string_length = *lengthptr - (1 + 4*LINK_SIZE) - 2;
383   }
384 
385 return (mb->callout)(cb, mb->callout_data);
386 }
387 
388 
389 
390 /*************************************************
391 *         Expand local workspace memory          *
392 *************************************************/
393 
394 /* This function is called when internal_dfa_match() is about to be called
395 recursively and there is insufficient working space left in the current
396 workspace block. If there's an existing next block, use it; otherwise get a new
397 block unless the heap limit is reached.
398 
399 Arguments:
400   rwsptr     pointer to block pointer (updated)
401   ovecsize   space needed for an ovector
402   mb         the match block
403 
404 Returns:     0 rwsptr has been updated
405             !0 an error code
406 */
407 
408 static int
more_workspace(RWS_anchor ** rwsptr,unsigned int ovecsize,dfa_match_block * mb)409 more_workspace(RWS_anchor **rwsptr, unsigned int ovecsize, dfa_match_block *mb)
410 {
411 RWS_anchor *rws = *rwsptr;
412 RWS_anchor *new;
413 
414 if (rws->next != NULL)
415   {
416   new = rws->next;
417   }
418 
419 /* Sizes in the RWS_anchor blocks are in units of sizeof(int), but
420 mb->heap_limit and mb->heap_used are in kibibytes. Play carefully, to avoid
421 overflow. */
422 
423 else
424   {
425   uint32_t newsize = (rws->size >= UINT32_MAX/2)? UINT32_MAX/2 : rws->size * 2;
426   uint32_t newsizeK = newsize/(1024/sizeof(int));
427 
428   if (newsizeK + mb->heap_used > mb->heap_limit)
429     newsizeK = (uint32_t)(mb->heap_limit - mb->heap_used);
430   newsize = newsizeK*(1024/sizeof(int));
431 
432   if (newsize < RWS_RSIZE + ovecsize + RWS_ANCHOR_SIZE)
433     return PCRE2_ERROR_HEAPLIMIT;
434   new = mb->memctl.malloc(newsize*sizeof(int), mb->memctl.memory_data);
435   if (new == NULL) return PCRE2_ERROR_NOMEMORY;
436   mb->heap_used += newsizeK;
437   new->next = NULL;
438   new->size = newsize;
439   rws->next = new;
440   }
441 
442 new->free = new->size - RWS_ANCHOR_SIZE;
443 *rwsptr = new;
444 return 0;
445 }
446 
447 
448 
449 /*************************************************
450 *     Match a Regular Expression - DFA engine    *
451 *************************************************/
452 
453 /* This internal function applies a compiled pattern to a subject string,
454 starting at a given point, using a DFA engine. This function is called from the
455 external one, possibly multiple times if the pattern is not anchored. The
456 function calls itself recursively for some kinds of subpattern.
457 
458 Arguments:
459   mb                the match_data block with fixed information
460   this_start_code   the opening bracket of this subexpression's code
461   current_subject   where we currently are in the subject string
462   start_offset      start offset in the subject string
463   offsets           vector to contain the matching string offsets
464   offsetcount       size of same
465   workspace         vector of workspace
466   wscount           size of same
467   rlevel            function call recursion level
468 
469 Returns:            > 0 => number of match offset pairs placed in offsets
470                     = 0 => offsets overflowed; longest matches are present
471                      -1 => failed to match
472                    < -1 => some kind of unexpected problem
473 
474 The following macros are used for adding states to the two state vectors (one
475 for the current character, one for the following character). */
476 
/* Each of these macros expands to an if/else statement that refers to
variables of the enclosing function: wscount, plus active_count and
next_active_state (ADD_ACTIVE*) or new_count and next_new_state (ADD_NEW*). The
counter is incremented whether or not there is room. When there is no room,
the expansion executes "return PCRE2_ERROR_DFA_WSSIZE", so the macros can be
used only inside a function that returns int. The caller supplies the
terminating semicolon. The parameter names are chosen so as not to clash with
the stateblock field names used in the expansions. */

#define ADD_ACTIVE(st_off,st_cnt) \
  if (active_count++ < wscount) \
    { \
    next_active_state->offset = (st_off); \
    next_active_state->count  = (st_cnt); \
    next_active_state++; \
    } \
  else return PCRE2_ERROR_DFA_WSSIZE

/* As ADD_ACTIVE, but also sets the data field. */

#define ADD_ACTIVE_DATA(st_off,st_cnt,st_dat) \
  if (active_count++ < wscount) \
    { \
    next_active_state->offset = (st_off); \
    next_active_state->count  = (st_cnt); \
    next_active_state->data   = (st_dat); \
    next_active_state++; \
    } \
  else return PCRE2_ERROR_DFA_WSSIZE

/* Add a state to the new-state vector. */

#define ADD_NEW(st_off,st_cnt) \
  if (new_count++ < wscount) \
    { \
    next_new_state->offset = (st_off); \
    next_new_state->count  = (st_cnt); \
    next_new_state++; \
    } \
  else return PCRE2_ERROR_DFA_WSSIZE

/* As ADD_NEW, but also sets the data field. */

#define ADD_NEW_DATA(st_off,st_cnt,st_dat) \
  if (new_count++ < wscount) \
    { \
    next_new_state->offset = (st_off); \
    next_new_state->count  = (st_cnt); \
    next_new_state->data   = (st_dat); \
    next_new_state++; \
    } \
  else return PCRE2_ERROR_DFA_WSSIZE
514 
515 /* And now, here is the code */
516 
517 static int
518 internal_dfa_match(
519   dfa_match_block *mb,
520   PCRE2_SPTR this_start_code,
521   PCRE2_SPTR current_subject,
522   PCRE2_SIZE start_offset,
523   PCRE2_SIZE *offsets,
524   uint32_t offsetcount,
525   int *workspace,
526   int wscount,
527   uint32_t rlevel,
528   int *RWS)
529 {
530 stateblock *active_states, *new_states, *temp_states;
531 stateblock *next_active_state, *next_new_state;
532 const uint8_t *ctypes, *lcc, *fcc;
533 PCRE2_SPTR ptr;
534 PCRE2_SPTR end_code;
535 dfa_recursion_info new_recursive;
536 int active_count, new_count, match_count;
537 
538 /* Some fields in the mb block are frequently referenced, so we load them into
539 independent variables in the hope that this will perform better. */
540 
541 PCRE2_SPTR start_subject = mb->start_subject;
542 PCRE2_SPTR end_subject = mb->end_subject;
543 PCRE2_SPTR start_code = mb->start_code;
544 
545 #ifdef SUPPORT_UNICODE
546 BOOL utf = (mb->poptions & PCRE2_UTF) != 0;
547 #else
548 BOOL utf = FALSE;
549 #endif
550 
551 BOOL reset_could_continue = FALSE;
552 
553 if (mb->match_call_count++ >= mb->match_limit) return PCRE2_ERROR_MATCHLIMIT;
554 if (rlevel++ > mb->match_limit_depth) return PCRE2_ERROR_DEPTHLIMIT;
555 offsetcount &= (uint32_t)(-2);  /* Round down */
556 
557 wscount -= 2;
558 wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
559           (2 * INTS_PER_STATEBLOCK);
560 
561 ctypes = mb->tables + ctypes_offset;
562 lcc = mb->tables + lcc_offset;
563 fcc = mb->tables + fcc_offset;
564 
565 match_count = PCRE2_ERROR_NOMATCH;   /* A negative number */
566 
567 active_states = (stateblock *)(workspace + 2);
568 next_new_state = new_states = active_states + wscount;
569 new_count = 0;
570 
571 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
572 the alternative states onto the list, and find out where the end is. This
573 makes it possible to use this function recursively, when we want to stop at a
574 matching internal ket rather than at the end.
575 
576 If we are dealing with a backward assertion we have to find out the maximum
577 amount to move back, and set up each alternative appropriately. */
578 
579 if (*this_start_code == OP_ASSERTBACK || *this_start_code == OP_ASSERTBACK_NOT)
580   {
581   size_t max_back = 0;
582   size_t gone_back;
583 
584   end_code = this_start_code;
585   do
586     {
587     size_t back = (size_t)GET(end_code, 2+LINK_SIZE);
588     if (back > max_back) max_back = back;
589     end_code += GET(end_code, 1);
590     }
591   while (*end_code == OP_ALT);
592 
593   /* If we can't go back the amount required for the longest lookbehind
594   pattern, go back as far as we can; some alternatives may still be viable. */
595 
596 #ifdef SUPPORT_UNICODE
597   /* In character mode we have to step back character by character */
598 
599   if (utf)
600     {
601     for (gone_back = 0; gone_back < max_back; gone_back++)
602       {
603       if (current_subject <= start_subject) break;
604       current_subject--;
605       ACROSSCHAR(current_subject > start_subject, current_subject,
606         current_subject--);
607       }
608     }
609   else
610 #endif
611 
612   /* In byte-mode we can do this quickly. */
613 
614     {
615     size_t current_offset = (size_t)(current_subject - start_subject);
616     gone_back = (current_offset < max_back)? current_offset : max_back;
617     current_subject -= gone_back;
618     }
619 
620   /* Save the earliest consulted character */
621 
622   if (current_subject < mb->start_used_ptr)
623     mb->start_used_ptr = current_subject;
624 
625   /* Now we can process the individual branches. There will be an OP_REVERSE at
626   the start of each branch, except when the length of the branch is zero. */
627 
628   end_code = this_start_code;
629   do
630     {
631     uint32_t revlen = (end_code[1+LINK_SIZE] == OP_REVERSE)? 1 + LINK_SIZE : 0;
632     size_t back = (revlen == 0)? 0 : (size_t)GET(end_code, 2+LINK_SIZE);
633     if (back <= gone_back)
634       {
635       int bstate = (int)(end_code - start_code + 1 + LINK_SIZE + revlen);
636       ADD_NEW_DATA(-bstate, 0, (int)(gone_back - back));
637       }
638     end_code += GET(end_code, 1);
639     }
640   while (*end_code == OP_ALT);
641  }
642 
643 /* This is the code for a "normal" subpattern (not a backward assertion). The
644 start of a whole pattern is always one of these. If we are at the top level,
645 we may be asked to restart matching from the same point that we reached for a
646 previous partial match. We still have to scan through the top-level branches to
647 find the end state. */
648 
649 else
650   {
651   end_code = this_start_code;
652 
653   /* Restarting */
654 
655   if (rlevel == 1 && (mb->moptions & PCRE2_DFA_RESTART) != 0)
656     {
657     do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
658     new_count = workspace[1];
659     if (!workspace[0])
660       memcpy(new_states, active_states, (size_t)new_count * sizeof(stateblock));
661     }
662 
663   /* Not restarting */
664 
665   else
666     {
667     int length = 1 + LINK_SIZE +
668       ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
669         *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
670         ? IMM2_SIZE:0);
671     do
672       {
673       ADD_NEW((int)(end_code - start_code + length), 0);
674       end_code += GET(end_code, 1);
675       length = 1 + LINK_SIZE;
676       }
677     while (*end_code == OP_ALT);
678     }
679   }
680 
681 workspace[0] = 0;    /* Bit indicating which vector is current */
682 
683 /* Loop for scanning the subject */
684 
685 ptr = current_subject;
686 for (;;)
687   {
688   int i, j;
689   int clen, dlen;
690   uint32_t c, d;
691   int forced_fail = 0;
692   BOOL partial_newline = FALSE;
693   BOOL could_continue = reset_could_continue;
694   reset_could_continue = FALSE;
695 
696   if (ptr > mb->last_used_ptr) mb->last_used_ptr = ptr;
697 
698   /* Make the new state list into the active state list and empty the
699   new state list. */
700 
701   temp_states = active_states;
702   active_states = new_states;
703   new_states = temp_states;
704   active_count = new_count;
705   new_count = 0;
706 
707   workspace[0] ^= 1;              /* Remember for the restarting feature */
708   workspace[1] = active_count;
709 
710   /* Set the pointers for adding new states */
711 
712   next_active_state = active_states + active_count;
713   next_new_state = new_states;
714 
715   /* Load the current character from the subject outside the loop, as many
716   different states may want to look at it, and we assume that at least one
717   will. */
718 
719   if (ptr < end_subject)
720     {
721     clen = 1;        /* Number of data items in the character */
722 #ifdef SUPPORT_UNICODE
723     GETCHARLENTEST(c, ptr, clen);
724 #else
725     c = *ptr;
726 #endif  /* SUPPORT_UNICODE */
727     }
728   else
729     {
730     clen = 0;        /* This indicates the end of the subject */
731     c = NOTACHAR;    /* This value should never actually be used */
732     }
733 
734   /* Scan up the active states and act on each one. The result of an action
735   may be to add more states to the currently active list (e.g. on hitting a
736   parenthesis) or it may be to put states on the new list, for considering
737   when we move the character pointer on. */
738 
739   for (i = 0; i < active_count; i++)
740     {
741     stateblock *current_state = active_states + i;
742     BOOL caseless = FALSE;
743     PCRE2_SPTR code;
744     uint32_t codevalue;
745     int state_offset = current_state->offset;
746     int rrc;
747     int count;
748 
749     /* A negative offset is a special case meaning "hold off going to this
750     (negated) state until the number of characters in the data field have
751     been skipped". If the could_continue flag was passed over from a previous
752     state, arrange for it to be passed on. */
753 
754     if (state_offset < 0)
755       {
756       if (current_state->data > 0)
757         {
758         ADD_NEW_DATA(state_offset, current_state->count,
759           current_state->data - 1);
760         if (could_continue) reset_could_continue = TRUE;
761         continue;
762         }
763       else
764         {
765         current_state->offset = state_offset = -state_offset;
766         }
767       }
768 
769     /* Check for a duplicate state with the same count, and skip if found.
770     See the note at the head of this module about the possibility of improving
771     performance here. */
772 
773     for (j = 0; j < i; j++)
774       {
775       if (active_states[j].offset == state_offset &&
776           active_states[j].count == current_state->count)
777         goto NEXT_ACTIVE_STATE;
778       }
779 
780     /* The state offset is the offset to the opcode */
781 
782     code = start_code + state_offset;
783     codevalue = *code;
784 
785     /* If this opcode inspects a character, but we are at the end of the
786     subject, remember the fact for use when testing for a partial match. */
787 
788     if (clen == 0 && poptable[codevalue] != 0)
789       could_continue = TRUE;
790 
791     /* If this opcode is followed by an inline character, load it. It is
792     tempting to test for the presence of a subject character here, but that
793     is wrong, because sometimes zero repetitions of the subject are
794     permitted.
795 
796     We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
797     argument that is not a data character - but is always one byte long because
798     the values are small. We have to take special action to deal with \P, \p,
799     \H, \h, \V, \v and \X in this case. To keep the other cases fast, convert
800     these ones to new opcodes. */
801 
802     if (coptable[codevalue] > 0)
803       {
804       dlen = 1;
805 #ifdef SUPPORT_UNICODE
806       if (utf) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
807 #endif  /* SUPPORT_UNICODE */
808       d = code[coptable[codevalue]];
809       if (codevalue >= OP_TYPESTAR)
810         {
811         switch(d)
812           {
813           case OP_ANYBYTE: return PCRE2_ERROR_DFA_UITEM;
814           case OP_NOTPROP:
815           case OP_PROP: codevalue += OP_PROP_EXTRA; break;
816           case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
817           case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
818           case OP_NOT_HSPACE:
819           case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
820           case OP_NOT_VSPACE:
821           case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
822           default: break;
823           }
824         }
825       }
826     else
827       {
828       dlen = 0;         /* Not strictly necessary, but compilers moan */
829       d = NOTACHAR;     /* if these variables are not set. */
830       }
831 
832 
833     /* Now process the individual opcodes */
834 
835     switch (codevalue)
836       {
837 /* ========================================================================== */
838       /* These cases are never obeyed. This is a fudge that causes a compile-
839       time error if the vectors coptable or poptable, which are indexed by
840       opcode, are not the correct length. It seems to be the only way to do
841       such a check at compile time, as the sizeof() operator does not work
842       in the C preprocessor. */
843 
844       case OP_TABLE_LENGTH:
845       case OP_TABLE_LENGTH +
846         ((sizeof(coptable) == OP_TABLE_LENGTH) &&
847          (sizeof(poptable) == OP_TABLE_LENGTH)):
848       return 0;
849 
850 /* ========================================================================== */
851       /* Reached a closing bracket. If not at the end of the pattern, carry
852       on with the next opcode. For repeating opcodes, also add the repeat
853       state. Note that KETRPOS will always be encountered at the end of the
854       subpattern, because the possessive subpattern repeats are always handled
855       using recursive calls. Thus, it never adds any new states.
856 
857       At the end of the (sub)pattern, unless we have an empty string and
858       PCRE2_NOTEMPTY is set, or PCRE2_NOTEMPTY_ATSTART is set and we are at the
859       start of the subject, save the match data, shifting up all previous
860       matches so we always have the longest first. */
861 
862       case OP_KET:
863       case OP_KETRMIN:
864       case OP_KETRMAX:
865       case OP_KETRPOS:
866       if (code != end_code)
867         {
868         ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
869         if (codevalue != OP_KET)
870           {
871           ADD_ACTIVE(state_offset - (int)GET(code, 1), 0);
872           }
873         }
874       else
875         {
876         if (ptr > current_subject ||
877             ((mb->moptions & PCRE2_NOTEMPTY) == 0 &&
878               ((mb->moptions & PCRE2_NOTEMPTY_ATSTART) == 0 ||
879                 current_subject > start_subject + mb->start_offset)))
880           {
881           if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
882             else if (match_count > 0 && ++match_count * 2 > (int)offsetcount)
883               match_count = 0;
884           count = ((match_count == 0)? (int)offsetcount : match_count * 2) - 2;
885           if (count > 0) (void)memmove(offsets + 2, offsets,
886             (size_t)count * sizeof(PCRE2_SIZE));
887           if (offsetcount >= 2)
888             {
889             offsets[0] = (PCRE2_SIZE)(current_subject - start_subject);
890             offsets[1] = (PCRE2_SIZE)(ptr - start_subject);
891             }
892           if ((mb->moptions & PCRE2_DFA_SHORTEST) != 0) return match_count;
893           }
894         }
895       break;
896 
897 /* ========================================================================== */
898       /* These opcodes add to the current list of states without looking
899       at the current character. */
900 
901       /*-----------------------------------------------------------------*/
902       case OP_ALT:
903       do { code += GET(code, 1); } while (*code == OP_ALT);
904       ADD_ACTIVE((int)(code - start_code), 0);
905       break;
906 
907       /*-----------------------------------------------------------------*/
908       case OP_BRA:
909       case OP_SBRA:
910       do
911         {
912         ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
913         code += GET(code, 1);
914         }
915       while (*code == OP_ALT);
916       break;
917 
918       /*-----------------------------------------------------------------*/
919       case OP_CBRA:
920       case OP_SCBRA:
921       ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE + IMM2_SIZE),  0);
922       code += GET(code, 1);
923       while (*code == OP_ALT)
924         {
925         ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE),  0);
926         code += GET(code, 1);
927         }
928       break;
929 
930       /*-----------------------------------------------------------------*/
931       case OP_BRAZERO:
932       case OP_BRAMINZERO:
933       ADD_ACTIVE(state_offset + 1, 0);
934       code += 1 + GET(code, 2);
935       while (*code == OP_ALT) code += GET(code, 1);
936       ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
937       break;
938 
939       /*-----------------------------------------------------------------*/
940       case OP_SKIPZERO:
941       code += 1 + GET(code, 2);
942       while (*code == OP_ALT) code += GET(code, 1);
943       ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
944       break;
945 
946       /*-----------------------------------------------------------------*/
947       case OP_CIRC:
948       if (ptr == start_subject && (mb->moptions & PCRE2_NOTBOL) == 0)
949         { ADD_ACTIVE(state_offset + 1, 0); }
950       break;
951 
952       /*-----------------------------------------------------------------*/
953       case OP_CIRCM:
954       if ((ptr == start_subject && (mb->moptions & PCRE2_NOTBOL) == 0) ||
955           ((ptr != end_subject || (mb->poptions & PCRE2_ALT_CIRCUMFLEX) != 0 )
956             && WAS_NEWLINE(ptr)))
957         { ADD_ACTIVE(state_offset + 1, 0); }
958       break;
959 
960       /*-----------------------------------------------------------------*/
961       case OP_EOD:
962       if (ptr >= end_subject)
963         {
964         if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
965           could_continue = TRUE;
966         else { ADD_ACTIVE(state_offset + 1, 0); }
967         }
968       break;
969 
970       /*-----------------------------------------------------------------*/
971       case OP_SOD:
972       if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
973       break;
974 
975       /*-----------------------------------------------------------------*/
976       case OP_SOM:
977       if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
978       break;
979 
980 
981 /* ========================================================================== */
982       /* These opcodes inspect the next subject character, and sometimes
983       the previous one as well, but do not have an argument. The variable
984       clen contains the length of the current character and is zero if we are
985       at the end of the subject. */
986 
987       /*-----------------------------------------------------------------*/
988       case OP_ANY:
989       if (clen > 0 && !IS_NEWLINE(ptr))
990         {
991         if (ptr + 1 >= mb->end_subject &&
992             (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
993             NLBLOCK->nltype == NLTYPE_FIXED &&
994             NLBLOCK->nllen == 2 &&
995             c == NLBLOCK->nl[0])
996           {
997           could_continue = partial_newline = TRUE;
998           }
999         else
1000           {
1001           ADD_NEW(state_offset + 1, 0);
1002           }
1003         }
1004       break;
1005 
1006       /*-----------------------------------------------------------------*/
1007       case OP_ALLANY:
1008       if (clen > 0)
1009         { ADD_NEW(state_offset + 1, 0); }
1010       break;
1011 
1012       /*-----------------------------------------------------------------*/
1013       case OP_EODN:
1014       if (clen == 0 && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1015         could_continue = TRUE;
1016       else if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - mb->nllen))
1017         { ADD_ACTIVE(state_offset + 1, 0); }
1018       break;
1019 
1020       /*-----------------------------------------------------------------*/
1021       case OP_DOLL:
1022       if ((mb->moptions & PCRE2_NOTEOL) == 0)
1023         {
1024         if (clen == 0 && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1025           could_continue = TRUE;
1026         else if (clen == 0 ||
1027             ((mb->poptions & PCRE2_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
1028                (ptr == end_subject - mb->nllen)
1029             ))
1030           { ADD_ACTIVE(state_offset + 1, 0); }
1031         else if (ptr + 1 >= mb->end_subject &&
1032                  (mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0 &&
1033                  NLBLOCK->nltype == NLTYPE_FIXED &&
1034                  NLBLOCK->nllen == 2 &&
1035                  c == NLBLOCK->nl[0])
1036           {
1037           if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1038             {
1039             reset_could_continue = TRUE;
1040             ADD_NEW_DATA(-(state_offset + 1), 0, 1);
1041             }
1042           else could_continue = partial_newline = TRUE;
1043           }
1044         }
1045       break;
1046 
1047       /*-----------------------------------------------------------------*/
1048       case OP_DOLLM:
1049       if ((mb->moptions & PCRE2_NOTEOL) == 0)
1050         {
1051         if (clen == 0 && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1052           could_continue = TRUE;
1053         else if (clen == 0 ||
1054             ((mb->poptions & PCRE2_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr)))
1055           { ADD_ACTIVE(state_offset + 1, 0); }
1056         else if (ptr + 1 >= mb->end_subject &&
1057                  (mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0 &&
1058                  NLBLOCK->nltype == NLTYPE_FIXED &&
1059                  NLBLOCK->nllen == 2 &&
1060                  c == NLBLOCK->nl[0])
1061           {
1062           if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1063             {
1064             reset_could_continue = TRUE;
1065             ADD_NEW_DATA(-(state_offset + 1), 0, 1);
1066             }
1067           else could_continue = partial_newline = TRUE;
1068           }
1069         }
1070       else if (IS_NEWLINE(ptr))
1071         { ADD_ACTIVE(state_offset + 1, 0); }
1072       break;
1073 
1074       /*-----------------------------------------------------------------*/
1075 
1076       case OP_DIGIT:
1077       case OP_WHITESPACE:
1078       case OP_WORDCHAR:
1079       if (clen > 0 && c < 256 &&
1080             ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
1081         { ADD_NEW(state_offset + 1, 0); }
1082       break;
1083 
1084       /*-----------------------------------------------------------------*/
1085       case OP_NOT_DIGIT:
1086       case OP_NOT_WHITESPACE:
1087       case OP_NOT_WORDCHAR:
1088       if (clen > 0 && (c >= 256 ||
1089             ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
1090         { ADD_NEW(state_offset + 1, 0); }
1091       break;
1092 
1093       /*-----------------------------------------------------------------*/
1094       case OP_WORD_BOUNDARY:
1095       case OP_NOT_WORD_BOUNDARY:
1096         {
1097         int left_word, right_word;
1098 
1099         if (ptr > start_subject)
1100           {
1101           PCRE2_SPTR temp = ptr - 1;
1102           if (temp < mb->start_used_ptr) mb->start_used_ptr = temp;
1103 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1104           if (utf) { BACKCHAR(temp); }
1105 #endif
1106           GETCHARTEST(d, temp);
1107 #ifdef SUPPORT_UNICODE
1108           if ((mb->poptions & PCRE2_UCP) != 0)
1109             {
1110             if (d == '_') left_word = TRUE; else
1111               {
1112               uint32_t cat = UCD_CATEGORY(d);
1113               left_word = (cat == ucp_L || cat == ucp_N);
1114               }
1115             }
1116           else
1117 #endif
1118           left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
1119           }
1120         else left_word = FALSE;
1121 
1122         if (clen > 0)
1123           {
1124           if (ptr >= mb->last_used_ptr)
1125             {
1126             PCRE2_SPTR temp = ptr + 1;
1127 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1128             if (utf) { FORWARDCHARTEST(temp, mb->end_subject); }
1129 #endif
1130             mb->last_used_ptr = temp;
1131             }
1132 #ifdef SUPPORT_UNICODE
1133           if ((mb->poptions & PCRE2_UCP) != 0)
1134             {
1135             if (c == '_') right_word = TRUE; else
1136               {
1137               uint32_t cat = UCD_CATEGORY(c);
1138               right_word = (cat == ucp_L || cat == ucp_N);
1139               }
1140             }
1141           else
1142 #endif
1143           right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
1144           }
1145         else right_word = FALSE;
1146 
1147         if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
1148           { ADD_ACTIVE(state_offset + 1, 0); }
1149         }
1150       break;
1151 
1152 
1153       /*-----------------------------------------------------------------*/
1154       /* Check the next character by Unicode property. We will get here only
1155       if the support is in the binary; otherwise a compile-time error occurs.
1156       */
1157 
1158 #ifdef SUPPORT_UNICODE
1159       case OP_PROP:
1160       case OP_NOTPROP:
1161       if (clen > 0)
1162         {
1163         BOOL OK;
1164         const uint32_t *cp;
1165         const ucd_record * prop = GET_UCD(c);
1166         switch(code[1])
1167           {
1168           case PT_ANY:
1169           OK = TRUE;
1170           break;
1171 
1172           case PT_LAMP:
1173           OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1174                prop->chartype == ucp_Lt;
1175           break;
1176 
1177           case PT_GC:
1178           OK = PRIV(ucp_gentype)[prop->chartype] == code[2];
1179           break;
1180 
1181           case PT_PC:
1182           OK = prop->chartype == code[2];
1183           break;
1184 
1185           case PT_SC:
1186           OK = prop->script == code[2];
1187           break;
1188 
1189           /* These are specials for combination cases. */
1190 
1191           case PT_ALNUM:
1192           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1193                PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1194           break;
1195 
1196           /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1197           which means that Perl space and POSIX space are now identical. PCRE
1198           was changed at release 8.34. */
1199 
1200           case PT_SPACE:    /* Perl space */
1201           case PT_PXSPACE:  /* POSIX space */
1202           switch(c)
1203             {
1204             HSPACE_CASES:
1205             VSPACE_CASES:
1206             OK = TRUE;
1207             break;
1208 
1209             default:
1210             OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1211             break;
1212             }
1213           break;
1214 
1215           case PT_WORD:
1216           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1217                PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1218                c == CHAR_UNDERSCORE;
1219           break;
1220 
1221           case PT_CLIST:
1222           cp = PRIV(ucd_caseless_sets) + code[2];
1223           for (;;)
1224             {
1225             if (c < *cp) { OK = FALSE; break; }
1226             if (c == *cp++) { OK = TRUE; break; }
1227             }
1228           break;
1229 
1230           case PT_UCNC:
1231           OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1232                c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1233                c >= 0xe000;
1234           break;
1235 
1236           /* Should never occur, but keep compilers from grumbling. */
1237 
1238           default:
1239           OK = codevalue != OP_PROP;
1240           break;
1241           }
1242 
1243         if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
1244         }
1245       break;
1246 #endif
1247 
1248 
1249 
1250 /* ========================================================================== */
1251       /* These opcodes likewise inspect the subject character, but have an
1252       argument that is not a data character. It is one of these opcodes:
1253       OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE,
1254       OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
1255 
1256       case OP_TYPEPLUS:
1257       case OP_TYPEMINPLUS:
1258       case OP_TYPEPOSPLUS:
1259       count = current_state->count;  /* Already matched */
1260       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1261       if (clen > 0)
1262         {
1263         if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1264             (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1265             NLBLOCK->nltype == NLTYPE_FIXED &&
1266             NLBLOCK->nllen == 2 &&
1267             c == NLBLOCK->nl[0])
1268           {
1269           could_continue = partial_newline = TRUE;
1270           }
1271         else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1272             (c < 256 &&
1273               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1274               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1275           {
1276           if (count > 0 && codevalue == OP_TYPEPOSPLUS)
1277             {
1278             active_count--;            /* Remove non-match possibility */
1279             next_active_state--;
1280             }
1281           count++;
1282           ADD_NEW(state_offset, count);
1283           }
1284         }
1285       break;
1286 
1287       /*-----------------------------------------------------------------*/
1288       case OP_TYPEQUERY:
1289       case OP_TYPEMINQUERY:
1290       case OP_TYPEPOSQUERY:
1291       ADD_ACTIVE(state_offset + 2, 0);
1292       if (clen > 0)
1293         {
1294         if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1295             (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1296             NLBLOCK->nltype == NLTYPE_FIXED &&
1297             NLBLOCK->nllen == 2 &&
1298             c == NLBLOCK->nl[0])
1299           {
1300           could_continue = partial_newline = TRUE;
1301           }
1302         else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1303             (c < 256 &&
1304               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1305               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1306           {
1307           if (codevalue == OP_TYPEPOSQUERY)
1308             {
1309             active_count--;            /* Remove non-match possibility */
1310             next_active_state--;
1311             }
1312           ADD_NEW(state_offset + 2, 0);
1313           }
1314         }
1315       break;
1316 
1317       /*-----------------------------------------------------------------*/
1318       case OP_TYPESTAR:
1319       case OP_TYPEMINSTAR:
1320       case OP_TYPEPOSSTAR:
1321       ADD_ACTIVE(state_offset + 2, 0);
1322       if (clen > 0)
1323         {
1324         if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1325             (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1326             NLBLOCK->nltype == NLTYPE_FIXED &&
1327             NLBLOCK->nllen == 2 &&
1328             c == NLBLOCK->nl[0])
1329           {
1330           could_continue = partial_newline = TRUE;
1331           }
1332         else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1333             (c < 256 &&
1334               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1335               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1336           {
1337           if (codevalue == OP_TYPEPOSSTAR)
1338             {
1339             active_count--;            /* Remove non-match possibility */
1340             next_active_state--;
1341             }
1342           ADD_NEW(state_offset, 0);
1343           }
1344         }
1345       break;
1346 
1347       /*-----------------------------------------------------------------*/
1348       case OP_TYPEEXACT:
1349       count = current_state->count;  /* Number already matched */
1350       if (clen > 0)
1351         {
1352         if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1353             (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1354             NLBLOCK->nltype == NLTYPE_FIXED &&
1355             NLBLOCK->nllen == 2 &&
1356             c == NLBLOCK->nl[0])
1357           {
1358           could_continue = partial_newline = TRUE;
1359           }
1360         else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1361             (c < 256 &&
1362               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1363               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1364           {
1365           if (++count >= (int)GET2(code, 1))
1366             { ADD_NEW(state_offset + 1 + IMM2_SIZE + 1, 0); }
1367           else
1368             { ADD_NEW(state_offset, count); }
1369           }
1370         }
1371       break;
1372 
1373       /*-----------------------------------------------------------------*/
1374       case OP_TYPEUPTO:
1375       case OP_TYPEMINUPTO:
1376       case OP_TYPEPOSUPTO:
1377       ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0);
1378       count = current_state->count;  /* Number already matched */
1379       if (clen > 0)
1380         {
1381         if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1382             (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1383             NLBLOCK->nltype == NLTYPE_FIXED &&
1384             NLBLOCK->nllen == 2 &&
1385             c == NLBLOCK->nl[0])
1386           {
1387           could_continue = partial_newline = TRUE;
1388           }
1389         else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1390             (c < 256 &&
1391               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1392               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1393           {
1394           if (codevalue == OP_TYPEPOSUPTO)
1395             {
1396             active_count--;           /* Remove non-match possibility */
1397             next_active_state--;
1398             }
1399           if (++count >= (int)GET2(code, 1))
1400             { ADD_NEW(state_offset + 2 + IMM2_SIZE, 0); }
1401           else
1402             { ADD_NEW(state_offset, count); }
1403           }
1404         }
1405       break;
1406 
1407 /* ========================================================================== */
1408       /* These are virtual opcodes that are used when something like
1409       OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, OP_EXTUNI, OP_HSPACE,
1410       OP_NOT_HSPACE, OP_VSPACE, or OP_NOT_VSPACE as its argument. It keeps the
1411       code above fast for the other cases. The argument is in the d variable. */
1412 
1413 #ifdef SUPPORT_UNICODE
1414       case OP_PROP_EXTRA + OP_TYPEPLUS:
1415       case OP_PROP_EXTRA + OP_TYPEMINPLUS:
1416       case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
1417       count = current_state->count;           /* Already matched */
1418       if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
1419       if (clen > 0)
1420         {
1421         BOOL OK;
1422         const uint32_t *cp;
1423         const ucd_record * prop = GET_UCD(c);
1424         switch(code[2])
1425           {
1426           case PT_ANY:
1427           OK = TRUE;
1428           break;
1429 
1430           case PT_LAMP:
1431           OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1432             prop->chartype == ucp_Lt;
1433           break;
1434 
1435           case PT_GC:
1436           OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1437           break;
1438 
1439           case PT_PC:
1440           OK = prop->chartype == code[3];
1441           break;
1442 
1443           case PT_SC:
1444           OK = prop->script == code[3];
1445           break;
1446 
1447           /* These are specials for combination cases. */
1448 
1449           case PT_ALNUM:
1450           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1451                PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1452           break;
1453 
1454           /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1455           which means that Perl space and POSIX space are now identical. PCRE
1456           was changed at release 8.34. */
1457 
1458           case PT_SPACE:    /* Perl space */
1459           case PT_PXSPACE:  /* POSIX space */
1460           switch(c)
1461             {
1462             HSPACE_CASES:
1463             VSPACE_CASES:
1464             OK = TRUE;
1465             break;
1466 
1467             default:
1468             OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1469             break;
1470             }
1471           break;
1472 
1473           case PT_WORD:
1474           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1475                PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1476                c == CHAR_UNDERSCORE;
1477           break;
1478 
1479           case PT_CLIST:
1480           cp = PRIV(ucd_caseless_sets) + code[3];
1481           for (;;)
1482             {
1483             if (c < *cp) { OK = FALSE; break; }
1484             if (c == *cp++) { OK = TRUE; break; }
1485             }
1486           break;
1487 
1488           case PT_UCNC:
1489           OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1490                c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1491                c >= 0xe000;
1492           break;
1493 
1494           /* Should never occur, but keep compilers from grumbling. */
1495 
1496           default:
1497           OK = codevalue != OP_PROP;
1498           break;
1499           }
1500 
1501         if (OK == (d == OP_PROP))
1502           {
1503           if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1504             {
1505             active_count--;           /* Remove non-match possibility */
1506             next_active_state--;
1507             }
1508           count++;
1509           ADD_NEW(state_offset, count);
1510           }
1511         }
1512       break;
1513 
1514       /*-----------------------------------------------------------------*/
1515       case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1516       case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1517       case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1518       count = current_state->count;  /* Already matched */
1519       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1520       if (clen > 0)
1521         {
1522         int ncount = 0;
1523         if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1524           {
1525           active_count--;           /* Remove non-match possibility */
1526           next_active_state--;
1527           }
1528         (void)PRIV(extuni)(c, ptr + clen, mb->start_subject, end_subject, utf,
1529           &ncount);
1530         count++;
1531         ADD_NEW_DATA(-state_offset, count, ncount);
1532         }
1533       break;
1534 #endif
1535 
1536       /*-----------------------------------------------------------------*/
1537       case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1538       case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1539       case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1540       count = current_state->count;  /* Already matched */
1541       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1542       if (clen > 0)
1543         {
1544         int ncount = 0;
1545         switch (c)
1546           {
1547           case CHAR_VT:
1548           case CHAR_FF:
1549           case CHAR_NEL:
1550 #ifndef EBCDIC
1551           case 0x2028:
1552           case 0x2029:
1553 #endif  /* Not EBCDIC */
1554           if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
1555           goto ANYNL01;
1556 
1557           case CHAR_CR:
1558           if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
1559           /* Fall through */
1560 
1561           ANYNL01:
1562           case CHAR_LF:
1563           if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1564             {
1565             active_count--;           /* Remove non-match possibility */
1566             next_active_state--;
1567             }
1568           count++;
1569           ADD_NEW_DATA(-state_offset, count, ncount);
1570           break;
1571 
1572           default:
1573           break;
1574           }
1575         }
1576       break;
1577 
1578       /*-----------------------------------------------------------------*/
1579       case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1580       case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1581       case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1582       count = current_state->count;  /* Already matched */
1583       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1584       if (clen > 0)
1585         {
1586         BOOL OK;
1587         switch (c)
1588           {
1589           VSPACE_CASES:
1590           OK = TRUE;
1591           break;
1592 
1593           default:
1594           OK = FALSE;
1595           break;
1596           }
1597 
1598         if (OK == (d == OP_VSPACE))
1599           {
1600           if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1601             {
1602             active_count--;           /* Remove non-match possibility */
1603             next_active_state--;
1604             }
1605           count++;
1606           ADD_NEW_DATA(-state_offset, count, 0);
1607           }
1608         }
1609       break;
1610 
1611       /*-----------------------------------------------------------------*/
1612       case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1613       case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1614       case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1615       count = current_state->count;  /* Already matched */
1616       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1617       if (clen > 0)
1618         {
1619         BOOL OK;
1620         switch (c)
1621           {
1622           HSPACE_CASES:
1623           OK = TRUE;
1624           break;
1625 
1626           default:
1627           OK = FALSE;
1628           break;
1629           }
1630 
1631         if (OK == (d == OP_HSPACE))
1632           {
1633           if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1634             {
1635             active_count--;           /* Remove non-match possibility */
1636             next_active_state--;
1637             }
1638           count++;
1639           ADD_NEW_DATA(-state_offset, count, 0);
1640           }
1641         }
1642       break;
1643 
1644       /*-----------------------------------------------------------------*/
1645 #ifdef SUPPORT_UNICODE
1646       case OP_PROP_EXTRA + OP_TYPEQUERY:
1647       case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1648       case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1649       count = 4;
1650       goto QS1;
1651 
1652       case OP_PROP_EXTRA + OP_TYPESTAR:
1653       case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1654       case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1655       count = 0;
1656 
1657       QS1:
1658 
1659       ADD_ACTIVE(state_offset + 4, 0);
1660       if (clen > 0)
1661         {
1662         BOOL OK;
1663         const uint32_t *cp;
1664         const ucd_record * prop = GET_UCD(c);
1665         switch(code[2])
1666           {
1667           case PT_ANY:
1668           OK = TRUE;
1669           break;
1670 
1671           case PT_LAMP:
1672           OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1673             prop->chartype == ucp_Lt;
1674           break;
1675 
1676           case PT_GC:
1677           OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1678           break;
1679 
1680           case PT_PC:
1681           OK = prop->chartype == code[3];
1682           break;
1683 
1684           case PT_SC:
1685           OK = prop->script == code[3];
1686           break;
1687 
1688           /* These are specials for combination cases. */
1689 
1690           case PT_ALNUM:
1691           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1692                PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1693           break;
1694 
1695           /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1696           which means that Perl space and POSIX space are now identical. PCRE
1697           was changed at release 8.34. */
1698 
1699           case PT_SPACE:    /* Perl space */
1700           case PT_PXSPACE:  /* POSIX space */
1701           switch(c)
1702             {
1703             HSPACE_CASES:
1704             VSPACE_CASES:
1705             OK = TRUE;
1706             break;
1707 
1708             default:
1709             OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1710             break;
1711             }
1712           break;
1713 
1714           case PT_WORD:
1715           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1716                PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1717                c == CHAR_UNDERSCORE;
1718           break;
1719 
1720           case PT_CLIST:
1721           cp = PRIV(ucd_caseless_sets) + code[3];
1722           for (;;)
1723             {
1724             if (c < *cp) { OK = FALSE; break; }
1725             if (c == *cp++) { OK = TRUE; break; }
1726             }
1727           break;
1728 
1729           case PT_UCNC:
1730           OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1731                c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1732                c >= 0xe000;
1733           break;
1734 
1735           /* Should never occur, but keep compilers from grumbling. */
1736 
1737           default:
1738           OK = codevalue != OP_PROP;
1739           break;
1740           }
1741 
1742         if (OK == (d == OP_PROP))
1743           {
1744           if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1745               codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1746             {
1747             active_count--;           /* Remove non-match possibility */
1748             next_active_state--;
1749             }
1750           ADD_NEW(state_offset + count, 0);
1751           }
1752         }
1753       break;
1754 
1755       /*-----------------------------------------------------------------*/
1756       case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1757       case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1758       case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1759       count = 2;
1760       goto QS2;
1761 
1762       case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1763       case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1764       case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1765       count = 0;
1766 
1767       QS2:
1768 
1769       ADD_ACTIVE(state_offset + 2, 0);
1770       if (clen > 0)
1771         {
1772         int ncount = 0;
1773         if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1774             codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1775           {
1776           active_count--;           /* Remove non-match possibility */
1777           next_active_state--;
1778           }
1779         (void)PRIV(extuni)(c, ptr + clen, mb->start_subject, end_subject, utf,
1780           &ncount);
1781         ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1782         }
1783       break;
1784 #endif
1785 
1786       /*-----------------------------------------------------------------*/
1787       case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1788       case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1789       case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1790       count = 2;
1791       goto QS3;
1792 
1793       case OP_ANYNL_EXTRA + OP_TYPESTAR:
1794       case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1795       case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1796       count = 0;
1797 
1798       QS3:
1799       ADD_ACTIVE(state_offset + 2, 0);
1800       if (clen > 0)
1801         {
1802         int ncount = 0;
1803         switch (c)
1804           {
1805           case CHAR_VT:
1806           case CHAR_FF:
1807           case CHAR_NEL:
1808 #ifndef EBCDIC
1809           case 0x2028:
1810           case 0x2029:
1811 #endif  /* Not EBCDIC */
1812           if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
1813           goto ANYNL02;
1814 
1815           case CHAR_CR:
1816           if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
1817           /* Fall through */
1818 
1819           ANYNL02:
1820           case CHAR_LF:
1821           if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1822               codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1823             {
1824             active_count--;           /* Remove non-match possibility */
1825             next_active_state--;
1826             }
1827           ADD_NEW_DATA(-(state_offset + (int)count), 0, ncount);
1828           break;
1829 
1830           default:
1831           break;
1832           }
1833         }
1834       break;
1835 
1836       /*-----------------------------------------------------------------*/
1837       case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1838       case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1839       case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1840       count = 2;
1841       goto QS4;
1842 
1843       case OP_VSPACE_EXTRA + OP_TYPESTAR:
1844       case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1845       case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1846       count = 0;
1847 
1848       QS4:
1849       ADD_ACTIVE(state_offset + 2, 0);
1850       if (clen > 0)
1851         {
1852         BOOL OK;
1853         switch (c)
1854           {
1855           VSPACE_CASES:
1856           OK = TRUE;
1857           break;
1858 
1859           default:
1860           OK = FALSE;
1861           break;
1862           }
1863         if (OK == (d == OP_VSPACE))
1864           {
1865           if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1866               codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1867             {
1868             active_count--;           /* Remove non-match possibility */
1869             next_active_state--;
1870             }
1871           ADD_NEW_DATA(-(state_offset + (int)count), 0, 0);
1872           }
1873         }
1874       break;
1875 
1876       /*-----------------------------------------------------------------*/
1877       case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1878       case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1879       case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1880       count = 2;
1881       goto QS5;
1882 
1883       case OP_HSPACE_EXTRA + OP_TYPESTAR:
1884       case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1885       case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1886       count = 0;
1887 
1888       QS5:
1889       ADD_ACTIVE(state_offset + 2, 0);
1890       if (clen > 0)
1891         {
1892         BOOL OK;
1893         switch (c)
1894           {
1895           HSPACE_CASES:
1896           OK = TRUE;
1897           break;
1898 
1899           default:
1900           OK = FALSE;
1901           break;
1902           }
1903 
1904         if (OK == (d == OP_HSPACE))
1905           {
1906           if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1907               codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1908             {
1909             active_count--;           /* Remove non-match possibility */
1910             next_active_state--;
1911             }
1912           ADD_NEW_DATA(-(state_offset + (int)count), 0, 0);
1913           }
1914         }
1915       break;
1916 
1917       /*-----------------------------------------------------------------*/
1918 #ifdef SUPPORT_UNICODE
1919       case OP_PROP_EXTRA + OP_TYPEEXACT:
1920       case OP_PROP_EXTRA + OP_TYPEUPTO:
1921       case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1922       case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1923       if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1924         { ADD_ACTIVE(state_offset + 1 + IMM2_SIZE + 3, 0); }
1925       count = current_state->count;  /* Number already matched */
1926       if (clen > 0)
1927         {
1928         BOOL OK;
1929         const uint32_t *cp;
1930         const ucd_record * prop = GET_UCD(c);
1931         switch(code[1 + IMM2_SIZE + 1])
1932           {
1933           case PT_ANY:
1934           OK = TRUE;
1935           break;
1936 
1937           case PT_LAMP:
1938           OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1939             prop->chartype == ucp_Lt;
1940           break;
1941 
1942           case PT_GC:
1943           OK = PRIV(ucp_gentype)[prop->chartype] == code[1 + IMM2_SIZE + 2];
1944           break;
1945 
1946           case PT_PC:
1947           OK = prop->chartype == code[1 + IMM2_SIZE + 2];
1948           break;
1949 
1950           case PT_SC:
1951           OK = prop->script == code[1 + IMM2_SIZE + 2];
1952           break;
1953 
1954           /* These are specials for combination cases. */
1955 
1956           case PT_ALNUM:
1957           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1958                PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1959           break;
1960 
1961           /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1962           which means that Perl space and POSIX space are now identical. PCRE
1963           was changed at release 8.34. */
1964 
1965           case PT_SPACE:    /* Perl space */
1966           case PT_PXSPACE:  /* POSIX space */
1967           switch(c)
1968             {
1969             HSPACE_CASES:
1970             VSPACE_CASES:
1971             OK = TRUE;
1972             break;
1973 
1974             default:
1975             OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1976             break;
1977             }
1978           break;
1979 
1980           case PT_WORD:
1981           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1982                PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1983                c == CHAR_UNDERSCORE;
1984           break;
1985 
1986           case PT_CLIST:
1987           cp = PRIV(ucd_caseless_sets) + code[1 + IMM2_SIZE + 2];
1988           for (;;)
1989             {
1990             if (c < *cp) { OK = FALSE; break; }
1991             if (c == *cp++) { OK = TRUE; break; }
1992             }
1993           break;
1994 
1995           case PT_UCNC:
1996           OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1997                c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1998                c >= 0xe000;
1999           break;
2000 
2001           /* Should never occur, but keep compilers from grumbling. */
2002 
2003           default:
2004           OK = codevalue != OP_PROP;
2005           break;
2006           }
2007 
2008         if (OK == (d == OP_PROP))
2009           {
2010           if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
2011             {
2012             active_count--;           /* Remove non-match possibility */
2013             next_active_state--;
2014             }
2015           if (++count >= (int)GET2(code, 1))
2016             { ADD_NEW(state_offset + 1 + IMM2_SIZE + 3, 0); }
2017           else
2018             { ADD_NEW(state_offset, count); }
2019           }
2020         }
2021       break;
2022 
2023       /*-----------------------------------------------------------------*/
2024       case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
2025       case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
2026       case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
2027       case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
2028       if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
2029         { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2030       count = current_state->count;  /* Number already matched */
2031       if (clen > 0)
2032         {
2033         PCRE2_SPTR nptr;
2034         int ncount = 0;
2035         if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
2036           {
2037           active_count--;           /* Remove non-match possibility */
2038           next_active_state--;
2039           }
2040         nptr = PRIV(extuni)(c, ptr + clen, mb->start_subject, end_subject, utf,
2041           &ncount);
2042         if (nptr >= end_subject && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
2043             reset_could_continue = TRUE;
2044         if (++count >= (int)GET2(code, 1))
2045           { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
2046         else
2047           { ADD_NEW_DATA(-state_offset, count, ncount); }
2048         }
2049       break;
2050 #endif
2051 
2052       /*-----------------------------------------------------------------*/
2053       case OP_ANYNL_EXTRA + OP_TYPEEXACT:
2054       case OP_ANYNL_EXTRA + OP_TYPEUPTO:
2055       case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
2056       case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
2057       if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
2058         { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2059       count = current_state->count;  /* Number already matched */
2060       if (clen > 0)
2061         {
2062         int ncount = 0;
2063         switch (c)
2064           {
2065           case CHAR_VT:
2066           case CHAR_FF:
2067           case CHAR_NEL:
2068 #ifndef EBCDIC
2069           case 0x2028:
2070           case 0x2029:
2071 #endif  /* Not EBCDIC */
2072           if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
2073           goto ANYNL03;
2074 
2075           case CHAR_CR:
2076           if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
2077           /* Fall through */
2078 
2079           ANYNL03:
2080           case CHAR_LF:
2081           if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
2082             {
2083             active_count--;           /* Remove non-match possibility */
2084             next_active_state--;
2085             }
2086           if (++count >= (int)GET2(code, 1))
2087             { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
2088           else
2089             { ADD_NEW_DATA(-state_offset, count, ncount); }
2090           break;
2091 
2092           default:
2093           break;
2094           }
2095         }
2096       break;
2097 
2098       /*-----------------------------------------------------------------*/
2099       case OP_VSPACE_EXTRA + OP_TYPEEXACT:
2100       case OP_VSPACE_EXTRA + OP_TYPEUPTO:
2101       case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
2102       case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
2103       if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
2104         { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2105       count = current_state->count;  /* Number already matched */
2106       if (clen > 0)
2107         {
2108         BOOL OK;
2109         switch (c)
2110           {
2111           VSPACE_CASES:
2112           OK = TRUE;
2113           break;
2114 
2115           default:
2116           OK = FALSE;
2117           }
2118 
2119         if (OK == (d == OP_VSPACE))
2120           {
2121           if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
2122             {
2123             active_count--;           /* Remove non-match possibility */
2124             next_active_state--;
2125             }
2126           if (++count >= (int)GET2(code, 1))
2127             { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2128           else
2129             { ADD_NEW_DATA(-state_offset, count, 0); }
2130           }
2131         }
2132       break;
2133 
2134       /*-----------------------------------------------------------------*/
2135       case OP_HSPACE_EXTRA + OP_TYPEEXACT:
2136       case OP_HSPACE_EXTRA + OP_TYPEUPTO:
2137       case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
2138       case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
2139       if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
2140         { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2141       count = current_state->count;  /* Number already matched */
2142       if (clen > 0)
2143         {
2144         BOOL OK;
2145         switch (c)
2146           {
2147           HSPACE_CASES:
2148           OK = TRUE;
2149           break;
2150 
2151           default:
2152           OK = FALSE;
2153           break;
2154           }
2155 
2156         if (OK == (d == OP_HSPACE))
2157           {
2158           if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
2159             {
2160             active_count--;           /* Remove non-match possibility */
2161             next_active_state--;
2162             }
2163           if (++count >= (int)GET2(code, 1))
2164             { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2165           else
2166             { ADD_NEW_DATA(-state_offset, count, 0); }
2167           }
2168         }
2169       break;
2170 
2171 /* ========================================================================== */
2172       /* These opcodes are followed by a character that is usually compared
2173       to the current subject character; it is loaded into d. We still get
2174       here even if there is no subject character, because in some cases zero
2175       repetitions are permitted. */
2176 
2177       /*-----------------------------------------------------------------*/
2178       case OP_CHAR:
2179       if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
2180       break;
2181 
2182       /*-----------------------------------------------------------------*/
2183       case OP_CHARI:
2184       if (clen == 0) break;
2185 
2186 #ifdef SUPPORT_UNICODE
2187       if (utf)
2188         {
2189         if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
2190           {
2191           unsigned int othercase;
2192           if (c < 128)
2193             othercase = fcc[c];
2194           else
2195             othercase = UCD_OTHERCASE(c);
2196           if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
2197           }
2198         }
2199       else
2200 #endif  /* SUPPORT_UNICODE */
2201       /* Not UTF mode */
2202         {
2203         if (TABLE_GET(c, lcc, c) == TABLE_GET(d, lcc, d))
2204           { ADD_NEW(state_offset + 2, 0); }
2205         }
2206       break;
2207 
2208 
2209 #ifdef SUPPORT_UNICODE
2210       /*-----------------------------------------------------------------*/
2211       /* This is a tricky one because it can match more than one character.
2212       Find out how many characters to skip, and then set up a negative state
2213       to wait for them to pass before continuing. */
2214 
2215       case OP_EXTUNI:
2216       if (clen > 0)
2217         {
2218         int ncount = 0;
2219         PCRE2_SPTR nptr = PRIV(extuni)(c, ptr + clen, mb->start_subject,
2220           end_subject, utf, &ncount);
2221         if (nptr >= end_subject && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
2222             reset_could_continue = TRUE;
2223         ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
2224         }
2225       break;
2226 #endif
2227 
2228       /*-----------------------------------------------------------------*/
2229       /* This is tricky, like EXTUNI, because it too can match more than one
2230       character (when CR is followed by LF). In this case, set up a negative
2231       state to wait for one character to pass before continuing. */
2232 
2233       case OP_ANYNL:
2234       if (clen > 0) switch(c)
2235         {
2236         case CHAR_VT:
2237         case CHAR_FF:
2238         case CHAR_NEL:
2239 #ifndef EBCDIC
2240         case 0x2028:
2241         case 0x2029:
2242 #endif  /* Not EBCDIC */
2243         if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
2244         /* Fall through */
2245 
2246         case CHAR_LF:
2247         ADD_NEW(state_offset + 1, 0);
2248         break;
2249 
2250         case CHAR_CR:
2251         if (ptr + 1 >= end_subject)
2252           {
2253           ADD_NEW(state_offset + 1, 0);
2254           if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
2255             reset_could_continue = TRUE;
2256           }
2257         else if (UCHAR21TEST(ptr + 1) == CHAR_LF)
2258           {
2259           ADD_NEW_DATA(-(state_offset + 1), 0, 1);
2260           }
2261         else
2262           {
2263           ADD_NEW(state_offset + 1, 0);
2264           }
2265         break;
2266         }
2267       break;
2268 
2269       /*-----------------------------------------------------------------*/
2270       case OP_NOT_VSPACE:
2271       if (clen > 0) switch(c)
2272         {
2273         VSPACE_CASES:
2274         break;
2275 
2276         default:
2277         ADD_NEW(state_offset + 1, 0);
2278         break;
2279         }
2280       break;
2281 
2282       /*-----------------------------------------------------------------*/
2283       case OP_VSPACE:
2284       if (clen > 0) switch(c)
2285         {
2286         VSPACE_CASES:
2287         ADD_NEW(state_offset + 1, 0);
2288         break;
2289 
2290         default:
2291         break;
2292         }
2293       break;
2294 
2295       /*-----------------------------------------------------------------*/
2296       case OP_NOT_HSPACE:
2297       if (clen > 0) switch(c)
2298         {
2299         HSPACE_CASES:
2300         break;
2301 
2302         default:
2303         ADD_NEW(state_offset + 1, 0);
2304         break;
2305         }
2306       break;
2307 
2308       /*-----------------------------------------------------------------*/
2309       case OP_HSPACE:
2310       if (clen > 0) switch(c)
2311         {
2312         HSPACE_CASES:
2313         ADD_NEW(state_offset + 1, 0);
2314         break;
2315 
2316         default:
2317         break;
2318         }
2319       break;
2320 
2321       /*-----------------------------------------------------------------*/
2322       /* Match a negated single character casefully. */
2323 
2324       case OP_NOT:
2325       if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); }
2326       break;
2327 
2328       /*-----------------------------------------------------------------*/
2329       /* Match a negated single character caselessly. */
2330 
2331       case OP_NOTI:
2332       if (clen > 0)
2333         {
2334         uint32_t otherd;
2335 #ifdef SUPPORT_UNICODE
2336         if (utf && d >= 128)
2337           otherd = UCD_OTHERCASE(d);
2338         else
2339 #endif  /* SUPPORT_UNICODE */
2340         otherd = TABLE_GET(d, fcc, d);
2341         if (c != d && c != otherd)
2342           { ADD_NEW(state_offset + dlen + 1, 0); }
2343         }
2344       break;
2345 
2346       /*-----------------------------------------------------------------*/
2347       case OP_PLUSI:
2348       case OP_MINPLUSI:
2349       case OP_POSPLUSI:
2350       case OP_NOTPLUSI:
2351       case OP_NOTMINPLUSI:
2352       case OP_NOTPOSPLUSI:
2353       caseless = TRUE;
2354       codevalue -= OP_STARI - OP_STAR;
2355 
2356       /* Fall through */
2357       case OP_PLUS:
2358       case OP_MINPLUS:
2359       case OP_POSPLUS:
2360       case OP_NOTPLUS:
2361       case OP_NOTMINPLUS:
2362       case OP_NOTPOSPLUS:
2363       count = current_state->count;  /* Already matched */
2364       if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
2365       if (clen > 0)
2366         {
2367         uint32_t otherd = NOTACHAR;
2368         if (caseless)
2369           {
2370 #ifdef SUPPORT_UNICODE
2371           if (utf && d >= 128)
2372             otherd = UCD_OTHERCASE(d);
2373           else
2374 #endif  /* SUPPORT_UNICODE */
2375           otherd = TABLE_GET(d, fcc, d);
2376           }
2377         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2378           {
2379           if (count > 0 &&
2380               (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
2381             {
2382             active_count--;             /* Remove non-match possibility */
2383             next_active_state--;
2384             }
2385           count++;
2386           ADD_NEW(state_offset, count);
2387           }
2388         }
2389       break;
2390 
2391       /*-----------------------------------------------------------------*/
2392       case OP_QUERYI:
2393       case OP_MINQUERYI:
2394       case OP_POSQUERYI:
2395       case OP_NOTQUERYI:
2396       case OP_NOTMINQUERYI:
2397       case OP_NOTPOSQUERYI:
2398       caseless = TRUE;
2399       codevalue -= OP_STARI - OP_STAR;
2400       /* Fall through */
2401       case OP_QUERY:
2402       case OP_MINQUERY:
2403       case OP_POSQUERY:
2404       case OP_NOTQUERY:
2405       case OP_NOTMINQUERY:
2406       case OP_NOTPOSQUERY:
2407       ADD_ACTIVE(state_offset + dlen + 1, 0);
2408       if (clen > 0)
2409         {
2410         uint32_t otherd = NOTACHAR;
2411         if (caseless)
2412           {
2413 #ifdef SUPPORT_UNICODE
2414           if (utf && d >= 128)
2415             otherd = UCD_OTHERCASE(d);
2416           else
2417 #endif  /* SUPPORT_UNICODE */
2418           otherd = TABLE_GET(d, fcc, d);
2419           }
2420         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2421           {
2422           if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
2423             {
2424             active_count--;            /* Remove non-match possibility */
2425             next_active_state--;
2426             }
2427           ADD_NEW(state_offset + dlen + 1, 0);
2428           }
2429         }
2430       break;
2431 
2432       /*-----------------------------------------------------------------*/
2433       case OP_STARI:
2434       case OP_MINSTARI:
2435       case OP_POSSTARI:
2436       case OP_NOTSTARI:
2437       case OP_NOTMINSTARI:
2438       case OP_NOTPOSSTARI:
2439       caseless = TRUE;
2440       codevalue -= OP_STARI - OP_STAR;
2441       /* Fall through */
2442       case OP_STAR:
2443       case OP_MINSTAR:
2444       case OP_POSSTAR:
2445       case OP_NOTSTAR:
2446       case OP_NOTMINSTAR:
2447       case OP_NOTPOSSTAR:
2448       ADD_ACTIVE(state_offset + dlen + 1, 0);
2449       if (clen > 0)
2450         {
2451         uint32_t otherd = NOTACHAR;
2452         if (caseless)
2453           {
2454 #ifdef SUPPORT_UNICODE
2455           if (utf && d >= 128)
2456             otherd = UCD_OTHERCASE(d);
2457           else
2458 #endif  /* SUPPORT_UNICODE */
2459           otherd = TABLE_GET(d, fcc, d);
2460           }
2461         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2462           {
2463           if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
2464             {
2465             active_count--;            /* Remove non-match possibility */
2466             next_active_state--;
2467             }
2468           ADD_NEW(state_offset, 0);
2469           }
2470         }
2471       break;
2472 
2473       /*-----------------------------------------------------------------*/
2474       case OP_EXACTI:
2475       case OP_NOTEXACTI:
2476       caseless = TRUE;
2477       codevalue -= OP_STARI - OP_STAR;
2478       /* Fall through */
2479       case OP_EXACT:
2480       case OP_NOTEXACT:
2481       count = current_state->count;  /* Number already matched */
2482       if (clen > 0)
2483         {
2484         uint32_t otherd = NOTACHAR;
2485         if (caseless)
2486           {
2487 #ifdef SUPPORT_UNICODE
2488           if (utf && d >= 128)
2489             otherd = UCD_OTHERCASE(d);
2490           else
2491 #endif  /* SUPPORT_UNICODE */
2492           otherd = TABLE_GET(d, fcc, d);
2493           }
2494         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2495           {
2496           if (++count >= (int)GET2(code, 1))
2497             { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2498           else
2499             { ADD_NEW(state_offset, count); }
2500           }
2501         }
2502       break;
2503 
2504       /*-----------------------------------------------------------------*/
2505       case OP_UPTOI:
2506       case OP_MINUPTOI:
2507       case OP_POSUPTOI:
2508       case OP_NOTUPTOI:
2509       case OP_NOTMINUPTOI:
2510       case OP_NOTPOSUPTOI:
2511       caseless = TRUE;
2512       codevalue -= OP_STARI - OP_STAR;
2513       /* Fall through */
2514       case OP_UPTO:
2515       case OP_MINUPTO:
2516       case OP_POSUPTO:
2517       case OP_NOTUPTO:
2518       case OP_NOTMINUPTO:
2519       case OP_NOTPOSUPTO:
2520       ADD_ACTIVE(state_offset + dlen + 1 + IMM2_SIZE, 0);
2521       count = current_state->count;  /* Number already matched */
2522       if (clen > 0)
2523         {
2524         uint32_t otherd = NOTACHAR;
2525         if (caseless)
2526           {
2527 #ifdef SUPPORT_UNICODE
2528           if (utf && d >= 128)
2529             otherd = UCD_OTHERCASE(d);
2530           else
2531 #endif  /* SUPPORT_UNICODE */
2532           otherd = TABLE_GET(d, fcc, d);
2533           }
2534         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2535           {
2536           if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2537             {
2538             active_count--;             /* Remove non-match possibility */
2539             next_active_state--;
2540             }
2541           if (++count >= (int)GET2(code, 1))
2542             { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2543           else
2544             { ADD_NEW(state_offset, count); }
2545           }
2546         }
2547       break;
2548 
2549 
2550 /* ========================================================================== */
2551       /* These are the class-handling opcodes */
2552 
2553       case OP_CLASS:
2554       case OP_NCLASS:
2555       case OP_XCLASS:
2556         {
2557         BOOL isinclass = FALSE;
2558         int next_state_offset;
2559         PCRE2_SPTR ecode;
2560 
2561         /* For a simple class, there is always just a 32-byte table, and we
2562         can set isinclass from it. */
2563 
2564         if (codevalue != OP_XCLASS)
2565           {
2566           ecode = code + 1 + (32 / sizeof(PCRE2_UCHAR));
2567           if (clen > 0)
2568             {
2569             isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2570               ((((uint8_t *)(code + 1))[c/8] & (1u << (c&7))) != 0);
2571             }
2572           }
2573 
2574         /* An extended class may have a table or a list of single characters,
2575         ranges, or both, and it may be positive or negative. There's a
2576         function that sorts all this out. */
2577 
2578         else
2579          {
2580          ecode = code + GET(code, 1);
2581          if (clen > 0) isinclass = PRIV(xclass)(c, code + 1 + LINK_SIZE, utf);
2582          }
2583 
2584         /* At this point, isinclass is set for all kinds of class, and ecode
2585         points to the byte after the end of the class. If there is a
2586         quantifier, this is where it will be. */
2587 
2588         next_state_offset = (int)(ecode - start_code);
2589 
2590         switch (*ecode)
2591           {
2592           case OP_CRSTAR:
2593           case OP_CRMINSTAR:
2594           case OP_CRPOSSTAR:
2595           ADD_ACTIVE(next_state_offset + 1, 0);
2596           if (isinclass)
2597             {
2598             if (*ecode == OP_CRPOSSTAR)
2599               {
2600               active_count--;           /* Remove non-match possibility */
2601               next_active_state--;
2602               }
2603             ADD_NEW(state_offset, 0);
2604             }
2605           break;
2606 
2607           case OP_CRPLUS:
2608           case OP_CRMINPLUS:
2609           case OP_CRPOSPLUS:
2610           count = current_state->count;  /* Already matched */
2611           if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2612           if (isinclass)
2613             {
2614             if (count > 0 && *ecode == OP_CRPOSPLUS)
2615               {
2616               active_count--;           /* Remove non-match possibility */
2617               next_active_state--;
2618               }
2619             count++;
2620             ADD_NEW(state_offset, count);
2621             }
2622           break;
2623 
2624           case OP_CRQUERY:
2625           case OP_CRMINQUERY:
2626           case OP_CRPOSQUERY:
2627           ADD_ACTIVE(next_state_offset + 1, 0);
2628           if (isinclass)
2629             {
2630             if (*ecode == OP_CRPOSQUERY)
2631               {
2632               active_count--;           /* Remove non-match possibility */
2633               next_active_state--;
2634               }
2635             ADD_NEW(next_state_offset + 1, 0);
2636             }
2637           break;
2638 
2639           case OP_CRRANGE:
2640           case OP_CRMINRANGE:
2641           case OP_CRPOSRANGE:
2642           count = current_state->count;  /* Already matched */
2643           if (count >= (int)GET2(ecode, 1))
2644             { ADD_ACTIVE(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2645           if (isinclass)
2646             {
2647             int max = (int)GET2(ecode, 1 + IMM2_SIZE);
2648 
2649             if (*ecode == OP_CRPOSRANGE && count >= (int)GET2(ecode, 1))
2650               {
2651               active_count--;           /* Remove non-match possibility */
2652               next_active_state--;
2653               }
2654 
2655             if (++count >= max && max != 0)   /* Max 0 => no limit */
2656               { ADD_NEW(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2657             else
2658               { ADD_NEW(state_offset, count); }
2659             }
2660           break;
2661 
2662           default:
2663           if (isinclass) { ADD_NEW(next_state_offset, 0); }
2664           break;
2665           }
2666         }
2667       break;
2668 
2669 /* ========================================================================== */
2670       /* These are the opcodes for fancy brackets of various kinds. We have
2671       to use recursion in order to handle them. The "always failing" assertion
2672       (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2673       though the other "backtracking verbs" are not supported. */
2674 
2675       case OP_FAIL:
2676       forced_fail++;    /* Count FAILs for multiple states */
2677       break;
2678 
2679       case OP_ASSERT:
2680       case OP_ASSERT_NOT:
2681       case OP_ASSERTBACK:
2682       case OP_ASSERTBACK_NOT:
2683         {
2684         int rc;
2685         int *local_workspace;
2686         PCRE2_SIZE *local_offsets;
2687         PCRE2_SPTR endasscode = code + GET(code, 1);
2688         RWS_anchor *rws = (RWS_anchor *)RWS;
2689 
2690         if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE)
2691           {
2692           rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb);
2693           if (rc != 0) return rc;
2694           RWS = (int *)rws;
2695           }
2696 
2697         local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
2698         local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE;
2699         rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE;
2700 
2701         while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2702 
2703         rc = internal_dfa_match(
2704           mb,                                   /* static match data */
2705           code,                                 /* this subexpression's code */
2706           ptr,                                  /* where we currently are */
2707           (PCRE2_SIZE)(ptr - start_subject),    /* start offset */
2708           local_offsets,                        /* offset vector */
2709           RWS_OVEC_OSIZE/OVEC_UNIT,             /* size of same */
2710           local_workspace,                      /* workspace vector */
2711           RWS_RSIZE,                            /* size of same */
2712           rlevel,                               /* function recursion level */
2713           RWS);                                 /* recursion workspace */
2714 
2715         rws->free += RWS_RSIZE + RWS_OVEC_OSIZE;
2716 
2717         if (rc < 0 && rc != PCRE2_ERROR_NOMATCH) return rc;
2718         if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2719             { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2720         }
2721       break;
2722 
2723       /*-----------------------------------------------------------------*/
2724       case OP_COND:
2725       case OP_SCOND:
2726         {
2727         int codelink = (int)GET(code, 1);
2728         PCRE2_UCHAR condcode;
2729 
2730         /* Because of the way auto-callout works during compile, a callout item
2731         is inserted between OP_COND and an assertion condition. This does not
2732         happen for the other conditions. */
2733 
2734         if (code[LINK_SIZE + 1] == OP_CALLOUT
2735             || code[LINK_SIZE + 1] == OP_CALLOUT_STR)
2736           {
2737           PCRE2_SIZE callout_length;
2738           rrc = do_callout(code, offsets, current_subject, ptr, mb,
2739             1 + LINK_SIZE, &callout_length);
2740           if (rrc < 0) return rrc;                 /* Abandon */
2741           if (rrc > 0) break;                      /* Fail this thread */
2742           code += callout_length;                  /* Skip callout data */
2743           }
2744 
2745         condcode = code[LINK_SIZE+1];
2746 
2747         /* Back reference conditions and duplicate named recursion conditions
2748         are not supported */
2749 
2750         if (condcode == OP_CREF || condcode == OP_DNCREF ||
2751             condcode == OP_DNRREF)
2752           return PCRE2_ERROR_DFA_UCOND;
2753 
2754         /* The DEFINE condition is always false, and the assertion (?!) is
2755         converted to OP_FAIL. */
2756 
2757         if (condcode == OP_FALSE || condcode == OP_FAIL)
2758           { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2759 
2760         /* There is also an always-true condition */
2761 
2762         else if (condcode == OP_TRUE)
2763           { ADD_ACTIVE(state_offset + LINK_SIZE + 2, 0); }
2764 
2765         /* The only supported version of OP_RREF is for the value RREF_ANY,
2766         which means "test if in any recursion". We can't test for specifically
2767         recursed groups. */
2768 
2769         else if (condcode == OP_RREF)
2770           {
2771           unsigned int value = GET2(code, LINK_SIZE + 2);
2772           if (value != RREF_ANY) return PCRE2_ERROR_DFA_UCOND;
2773           if (mb->recursive != NULL)
2774             { ADD_ACTIVE(state_offset + LINK_SIZE + 2 + IMM2_SIZE, 0); }
2775           else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2776           }
2777 
2778         /* Otherwise, the condition is an assertion */
2779 
2780         else
2781           {
2782           int rc;
2783           int *local_workspace;
2784           PCRE2_SIZE *local_offsets;
2785           PCRE2_SPTR asscode = code + LINK_SIZE + 1;
2786           PCRE2_SPTR endasscode = asscode + GET(asscode, 1);
2787           RWS_anchor *rws = (RWS_anchor *)RWS;
2788 
2789           if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE)
2790             {
2791             rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb);
2792             if (rc != 0) return rc;
2793             RWS = (int *)rws;
2794             }
2795 
2796           local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
2797           local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE;
2798           rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE;
2799 
2800           while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2801 
2802           rc = internal_dfa_match(
2803             mb,                                   /* fixed match data */
2804             asscode,                              /* this subexpression's code */
2805             ptr,                                  /* where we currently are */
2806             (PCRE2_SIZE)(ptr - start_subject),    /* start offset */
2807             local_offsets,                        /* offset vector */
2808             RWS_OVEC_OSIZE/OVEC_UNIT,             /* size of same */
2809             local_workspace,                      /* workspace vector */
2810             RWS_RSIZE,                            /* size of same */
2811             rlevel,                               /* function recursion level */
2812             RWS);                                 /* recursion workspace */
2813 
2814           rws->free += RWS_RSIZE + RWS_OVEC_OSIZE;
2815 
2816           if (rc < 0 && rc != PCRE2_ERROR_NOMATCH) return rc;
2817           if ((rc >= 0) ==
2818                 (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2819             { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2820           else
2821             { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2822           }
2823         }
2824       break;
2825 
2826       /*-----------------------------------------------------------------*/
2827       case OP_RECURSE:
2828         {
2829         int rc;
2830         int *local_workspace;
2831         PCRE2_SIZE *local_offsets;
2832         RWS_anchor *rws = (RWS_anchor *)RWS;
2833         dfa_recursion_info *ri;
2834         PCRE2_SPTR callpat = start_code + GET(code, 1);
2835         uint32_t recno = (callpat == mb->start_code)? 0 :
2836           GET2(callpat, 1 + LINK_SIZE);
2837 
2838         if (rws->free < RWS_RSIZE + RWS_OVEC_RSIZE)
2839           {
2840           rc = more_workspace(&rws, RWS_OVEC_RSIZE, mb);
2841           if (rc != 0) return rc;
2842           RWS = (int *)rws;
2843           }
2844 
2845         local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
2846         local_workspace = ((int *)local_offsets) + RWS_OVEC_RSIZE;
2847         rws->free -= RWS_RSIZE + RWS_OVEC_RSIZE;
2848 
2849         /* Check for repeating a recursion without advancing the subject
2850         pointer. This should catch convoluted mutual recursions. (Some simple
2851         cases are caught at compile time.) */
2852 
2853         for (ri = mb->recursive; ri != NULL; ri = ri->prevrec)
2854           if (recno == ri->group_num && ptr == ri->subject_position)
2855             return PCRE2_ERROR_RECURSELOOP;
2856 
2857         /* Remember this recursion and where we started it so as to
2858         catch infinite loops. */
2859 
2860         new_recursive.group_num = recno;
2861         new_recursive.subject_position = ptr;
2862         new_recursive.prevrec = mb->recursive;
2863         mb->recursive = &new_recursive;
2864 
2865         rc = internal_dfa_match(
2866           mb,                                   /* fixed match data */
2867           callpat,                              /* this subexpression's code */
2868           ptr,                                  /* where we currently are */
2869           (PCRE2_SIZE)(ptr - start_subject),    /* start offset */
2870           local_offsets,                        /* offset vector */
2871           RWS_OVEC_RSIZE/OVEC_UNIT,             /* size of same */
2872           local_workspace,                      /* workspace vector */
2873           RWS_RSIZE,                            /* size of same */
2874           rlevel,                               /* function recursion level */
2875           RWS);                                 /* recursion workspace */
2876 
2877         rws->free += RWS_RSIZE + RWS_OVEC_RSIZE;
2878         mb->recursive = new_recursive.prevrec;  /* Done this recursion */
2879 
2880         /* Ran out of internal offsets */
2881 
2882         if (rc == 0) return PCRE2_ERROR_DFA_RECURSE;
2883 
2884         /* For each successful matched substring, set up the next state with a
2885         count of characters to skip before trying it. Note that the count is in
2886         characters, not bytes. */
2887 
2888         if (rc > 0)
2889           {
2890           for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2891             {
2892             PCRE2_SIZE charcount = local_offsets[rc+1] - local_offsets[rc];
2893 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
2894             if (utf)
2895               {
2896               PCRE2_SPTR p = start_subject + local_offsets[rc];
2897               PCRE2_SPTR pp = start_subject + local_offsets[rc+1];
2898               while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--;
2899               }
2900 #endif
2901             if (charcount > 0)
2902               {
2903               ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0,
2904                 (int)(charcount - 1));
2905               }
2906             else
2907               {
2908               ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2909               }
2910             }
2911           }
2912         else if (rc != PCRE2_ERROR_NOMATCH) return rc;
2913         }
2914       break;
2915 
2916       /*-----------------------------------------------------------------*/
2917       case OP_BRAPOS:
2918       case OP_SBRAPOS:
2919       case OP_CBRAPOS:
2920       case OP_SCBRAPOS:
2921       case OP_BRAPOSZERO:
2922         {
2923         int rc;
2924         int *local_workspace;
2925         PCRE2_SIZE *local_offsets;
2926         PCRE2_SIZE charcount, matched_count;
2927         PCRE2_SPTR local_ptr = ptr;
2928         RWS_anchor *rws = (RWS_anchor *)RWS;
2929         BOOL allow_zero;
2930 
2931         if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE)
2932           {
2933           rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb);
2934           if (rc != 0) return rc;
2935           RWS = (int *)rws;
2936           }
2937 
2938         local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
2939         local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE;
2940         rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE;
2941 
2942         if (codevalue == OP_BRAPOSZERO)
2943           {
2944           allow_zero = TRUE;
2945           codevalue = *(++code);  /* Codevalue will be one of above BRAs */
2946           }
2947         else allow_zero = FALSE;
2948 
2949         /* Loop to match the subpattern as many times as possible as if it were
2950         a complete pattern. */
2951 
2952         for (matched_count = 0;; matched_count++)
2953           {
2954           rc = internal_dfa_match(
2955             mb,                                   /* fixed match data */
2956             code,                                 /* this subexpression's code */
2957             local_ptr,                            /* where we currently are */
2958             (PCRE2_SIZE)(ptr - start_subject),    /* start offset */
2959             local_offsets,                        /* offset vector */
2960             RWS_OVEC_OSIZE/OVEC_UNIT,             /* size of same */
2961             local_workspace,                      /* workspace vector */
2962             RWS_RSIZE,                            /* size of same */
2963             rlevel,                               /* function recursion level */
2964             RWS);                                 /* recursion workspace */
2965 
2966           /* Failed to match */
2967 
2968           if (rc < 0)
2969             {
2970             if (rc != PCRE2_ERROR_NOMATCH) return rc;
2971             break;
2972             }
2973 
2974           /* Matched: break the loop if zero characters matched. */
2975 
2976           charcount = local_offsets[1] - local_offsets[0];
2977           if (charcount == 0) break;
2978           local_ptr += charcount;    /* Advance temporary position ptr */
2979           }
2980 
2981         rws->free += RWS_RSIZE + RWS_OVEC_OSIZE;
2982 
2983         /* At this point we have matched the subpattern matched_count
2984         times, and local_ptr is pointing to the character after the end of the
2985         last match. */
2986 
2987         if (matched_count > 0 || allow_zero)
2988           {
2989           PCRE2_SPTR end_subpattern = code;
2990           int next_state_offset;
2991 
2992           do { end_subpattern += GET(end_subpattern, 1); }
2993             while (*end_subpattern == OP_ALT);
2994           next_state_offset =
2995             (int)(end_subpattern - start_code + LINK_SIZE + 1);
2996 
2997           /* Optimization: if there are no more active states, and there
2998           are no new states yet set up, then skip over the subject string
2999           right here, to save looping. Otherwise, set up the new state to swing
3000           into action when the end of the matched substring is reached. */
3001 
3002           if (i + 1 >= active_count && new_count == 0)
3003             {
3004             ptr = local_ptr;
3005             clen = 0;
3006             ADD_NEW(next_state_offset, 0);
3007             }
3008           else
3009             {
3010             PCRE2_SPTR p = ptr;
3011             PCRE2_SPTR pp = local_ptr;
3012             charcount = (PCRE2_SIZE)(pp - p);
3013 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
3014             if (utf) while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--;
3015 #endif
3016             ADD_NEW_DATA(-next_state_offset, 0, (int)(charcount - 1));
3017             }
3018           }
3019         }
3020       break;
3021 
3022       /*-----------------------------------------------------------------*/
3023       case OP_ONCE:
3024         {
3025         int rc;
3026         int *local_workspace;
3027         PCRE2_SIZE *local_offsets;
3028         RWS_anchor *rws = (RWS_anchor *)RWS;
3029 
3030         if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE)
3031           {
3032           rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb);
3033           if (rc != 0) return rc;
3034           RWS = (int *)rws;
3035           }
3036 
3037         local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
3038         local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE;
3039         rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE;
3040 
3041         rc = internal_dfa_match(
3042           mb,                                   /* fixed match data */
3043           code,                                 /* this subexpression's code */
3044           ptr,                                  /* where we currently are */
3045           (PCRE2_SIZE)(ptr - start_subject),    /* start offset */
3046           local_offsets,                        /* offset vector */
3047           RWS_OVEC_OSIZE/OVEC_UNIT,             /* size of same */
3048           local_workspace,                      /* workspace vector */
3049           RWS_RSIZE,                            /* size of same */
3050           rlevel,                               /* function recursion level */
3051           RWS);                                 /* recursion workspace */
3052 
3053         rws->free += RWS_RSIZE + RWS_OVEC_OSIZE;
3054 
3055         if (rc >= 0)
3056           {
3057           PCRE2_SPTR end_subpattern = code;
3058           PCRE2_SIZE charcount = local_offsets[1] - local_offsets[0];
3059           int next_state_offset, repeat_state_offset;
3060 
3061           do { end_subpattern += GET(end_subpattern, 1); }
3062             while (*end_subpattern == OP_ALT);
3063           next_state_offset =
3064             (int)(end_subpattern - start_code + LINK_SIZE + 1);
3065 
3066           /* If the end of this subpattern is KETRMAX or KETRMIN, we must
3067           arrange for the repeat state also to be added to the relevant list.
3068           Calculate the offset, or set -1 for no repeat. */
3069 
3070           repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
3071                                  *end_subpattern == OP_KETRMIN)?
3072             (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1;
3073 
3074           /* If we have matched an empty string, add the next state at the
3075           current character pointer. This is important so that the duplicate
3076           checking kicks in, which is what breaks infinite loops that match an
3077           empty string. */
3078 
3079           if (charcount == 0)
3080             {
3081             ADD_ACTIVE(next_state_offset, 0);
3082             }
3083 
3084           /* Optimization: if there are no more active states, and there
3085           are no new states yet set up, then skip over the subject string
3086           right here, to save looping. Otherwise, set up the new state to swing
3087           into action when the end of the matched substring is reached. */
3088 
3089           else if (i + 1 >= active_count && new_count == 0)
3090             {
3091             ptr += charcount;
3092             clen = 0;
3093             ADD_NEW(next_state_offset, 0);
3094 
3095             /* If we are adding a repeat state at the new character position,
3096             we must fudge things so that it is the only current state.
3097             Otherwise, it might be a duplicate of one we processed before, and
3098             that would cause it to be skipped. */
3099 
3100             if (repeat_state_offset >= 0)
3101               {
3102               next_active_state = active_states;
3103               active_count = 0;
3104               i = -1;
3105               ADD_ACTIVE(repeat_state_offset, 0);
3106               }
3107             }
3108           else
3109             {
3110 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
3111             if (utf)
3112               {
3113               PCRE2_SPTR p = start_subject + local_offsets[0];
3114               PCRE2_SPTR pp = start_subject + local_offsets[1];
3115               while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--;
3116               }
3117 #endif
3118             ADD_NEW_DATA(-next_state_offset, 0, (int)(charcount - 1));
3119             if (repeat_state_offset >= 0)
3120               { ADD_NEW_DATA(-repeat_state_offset, 0, (int)(charcount - 1)); }
3121             }
3122           }
3123         else if (rc != PCRE2_ERROR_NOMATCH) return rc;
3124         }
3125       break;
3126 
3127 
3128 /* ========================================================================== */
3129       /* Handle callouts */
3130 
3131       case OP_CALLOUT:
3132       case OP_CALLOUT_STR:
3133         {
3134         PCRE2_SIZE callout_length;
3135         rrc = do_callout(code, offsets, current_subject, ptr, mb, 0,
3136           &callout_length);
3137         if (rrc < 0) return rrc;   /* Abandon */
3138         if (rrc == 0)
3139           { ADD_ACTIVE(state_offset + (int)callout_length, 0); }
3140         }
3141       break;
3142 
3143 
3144 /* ========================================================================== */
3145       default:        /* Unsupported opcode */
3146       return PCRE2_ERROR_DFA_UITEM;
3147       }
3148 
3149     NEXT_ACTIVE_STATE: continue;
3150 
3151     }      /* End of loop scanning active states */
3152 
3153   /* We have finished the processing at the current subject character. If no
3154   new states have been set for the next character, we have found all the
3155   matches that we are going to find. If we are at the top level and partial
3156   matching has been requested, check for appropriate conditions.
3157 
3158   The "forced_fail" variable counts the number of (*F) encountered for the
3159   character. If it is equal to the original active_count (saved in
3160   workspace[1]) it means that (*F) was found on every active state. In this
3161   case we don't want to give a partial match.
3162 
3163   The "could_continue" variable is true if a state could have continued but
3164   for the fact that the end of the subject was reached. */
3165 
3166   if (new_count <= 0)
3167     {
3168     if (rlevel == 1 &&                               /* Top level, and */
3169         could_continue &&                            /* Some could go on, and */
3170         forced_fail != workspace[1] &&               /* Not all forced fail & */
3171         (                                            /* either... */
3172         (mb->moptions & PCRE2_PARTIAL_HARD) != 0      /* Hard partial */
3173         ||                                           /* or... */
3174         ((mb->moptions & PCRE2_PARTIAL_SOFT) != 0 &&  /* Soft partial and */
3175          match_count < 0)                            /* no matches */
3176         ) &&                                         /* And... */
3177         (
3178         partial_newline ||                           /* Either partial NL */
3179           (                                          /* or ... */
3180           ptr >= end_subject &&                /* End of subject and */
3181           ptr > mb->start_used_ptr)            /* Inspected non-empty string */
3182           )
3183         )
3184       match_count = PCRE2_ERROR_PARTIAL;
3185     break;  /* Exit from loop along the subject string */
3186     }
3187 
3188   /* One or more states are active for the next character. */
3189 
3190   ptr += clen;    /* Advance to next subject character */
3191   }               /* Loop to move along the subject string */
3192 
3193 /* Control gets here from "break" a few lines above. If we have a match and
3194 PCRE2_ENDANCHORED is set, the match fails. */
3195 
3196 if (match_count >= 0 &&
3197     ((mb->moptions | mb->poptions) & PCRE2_ENDANCHORED) != 0 &&
3198     ptr < end_subject)
3199   match_count = PCRE2_ERROR_NOMATCH;
3200 
3201 return match_count;
3202 }
3203 
3204 
3205 
3206 /*************************************************
3207 *     Match a pattern using the DFA algorithm    *
3208 *************************************************/
3209 
3210 /* This function matches a compiled pattern to a subject string, using the
3211 alternate matching algorithm that finds all matches at once.
3212 
3213 Arguments:
3214   code          points to the compiled pattern
3215   subject       subject string
3216   length        length of subject string
3217   start_offset  where to start matching in the subject
3218   options       option bits
3219   match_data    points to a match data structure
3220   mcontext      points to a match context
3221   workspace     pointer to workspace
3222   wscount       size of workspace
3223 
3224 Returns:        > 0 => number of match offset pairs placed in offsets
3225                 = 0 => offsets overflowed; longest matches are present
3226                  -1 => failed to match
3227                < -1 => some kind of unexpected problem
3228 */
3229 
3230 PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
pcre2_dfa_match(const pcre2_code * code,PCRE2_SPTR subject,PCRE2_SIZE length,PCRE2_SIZE start_offset,uint32_t options,pcre2_match_data * match_data,pcre2_match_context * mcontext,int * workspace,PCRE2_SIZE wscount)3231 pcre2_dfa_match(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length,
3232   PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data,
3233   pcre2_match_context *mcontext, int *workspace, PCRE2_SIZE wscount)
3234 {
3235 int rc;
3236 int was_zero_terminated = 0;
3237 
3238 const pcre2_real_code *re = (const pcre2_real_code *)code;
3239 
3240 PCRE2_SPTR start_match;
3241 PCRE2_SPTR end_subject;
3242 PCRE2_SPTR bumpalong_limit;
3243 PCRE2_SPTR req_cu_ptr;
3244 
3245 BOOL utf, anchored, startline, firstline;
3246 BOOL has_first_cu = FALSE;
3247 BOOL has_req_cu = FALSE;
3248 
3249 PCRE2_UCHAR first_cu = 0;
3250 PCRE2_UCHAR first_cu2 = 0;
3251 PCRE2_UCHAR req_cu = 0;
3252 PCRE2_UCHAR req_cu2 = 0;
3253 
3254 const uint8_t *start_bits = NULL;
3255 
3256 /* We need to have mb pointing to a match block, because the IS_NEWLINE macro
3257 is used below, and it expects NLBLOCK to be defined as a pointer. */
3258 
3259 pcre2_callout_block cb;
3260 dfa_match_block actual_match_block;
3261 dfa_match_block *mb = &actual_match_block;
3262 
3263 /* Set up a starting block of memory for use during recursive calls to
3264 internal_dfa_match(). By putting this on the stack, it minimizes resource use
3265 in the case when it is not needed. If this is too small, more memory is
3266 obtained from the heap. At the start of each block is an anchor structure.*/
3267 
3268 int base_recursion_workspace[RWS_BASE_SIZE];
3269 RWS_anchor *rws = (RWS_anchor *)base_recursion_workspace;
3270 rws->next = NULL;
3271 rws->size = RWS_BASE_SIZE;
3272 rws->free = RWS_BASE_SIZE - RWS_ANCHOR_SIZE;
3273 
3274 /* A length equal to PCRE2_ZERO_TERMINATED implies a zero-terminated
3275 subject string. */
3276 
3277 if (length == PCRE2_ZERO_TERMINATED)
3278   {
3279   length = PRIV(strlen)(subject);
3280   was_zero_terminated = 1;
3281   }
3282 
3283 /* Plausibility checks */
3284 
3285 if ((options & ~PUBLIC_DFA_MATCH_OPTIONS) != 0) return PCRE2_ERROR_BADOPTION;
3286 if (re == NULL || subject == NULL || workspace == NULL || match_data == NULL)
3287   return PCRE2_ERROR_NULL;
3288 if (wscount < 20) return PCRE2_ERROR_DFA_WSSIZE;
3289 if (start_offset > length) return PCRE2_ERROR_BADOFFSET;
3290 
3291 /* Partial matching and PCRE2_ENDANCHORED are currently not allowed at the same
3292 time. */
3293 
3294 if ((options & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0 &&
3295    ((re->overall_options | options) & PCRE2_ENDANCHORED) != 0)
3296   return PCRE2_ERROR_BADOPTION;
3297 
3298 /* Check that the first field in the block is the magic number. If it is not,
3299 return with PCRE2_ERROR_BADMAGIC. */
3300 
3301 if (re->magic_number != MAGIC_NUMBER) return PCRE2_ERROR_BADMAGIC;
3302 
3303 /* Check the code unit width. */
3304 
3305 if ((re->flags & PCRE2_MODE_MASK) != PCRE2_CODE_UNIT_WIDTH/8)
3306   return PCRE2_ERROR_BADMODE;
3307 
3308 /* PCRE2_NOTEMPTY and PCRE2_NOTEMPTY_ATSTART are match-time flags in the
3309 options variable for this function. Users of PCRE2 who are not calling the
3310 function directly would like to have a way of setting these flags, in the same
3311 way that they can set pcre2_compile() flags like PCRE2_NO_AUTOPOSSESS with
3312 constructions like (*NO_AUTOPOSSESS). To enable this, (*NOTEMPTY) and
3313 (*NOTEMPTY_ATSTART) set bits in the pattern's "flag" function which can now be
3314 transferred to the options for this function. The bits are guaranteed to be
3315 adjacent, but do not have the same values. This bit of Boolean trickery assumes
3316 that the match-time bits are not more significant than the flag bits. If by
3317 accident this is not the case, a compile-time division by zero error will
3318 occur. */
3319 
3320 #define FF (PCRE2_NOTEMPTY_SET|PCRE2_NE_ATST_SET)
3321 #define OO (PCRE2_NOTEMPTY|PCRE2_NOTEMPTY_ATSTART)
3322 options |= (re->flags & FF) / ((FF & (~FF+1)) / (OO & (~OO+1)));
3323 #undef FF
3324 #undef OO
3325 
3326 /* If restarting after a partial match, do some sanity checks on the contents
3327 of the workspace. */
3328 
3329 if ((options & PCRE2_DFA_RESTART) != 0)
3330   {
3331   if ((workspace[0] & (-2)) != 0 || workspace[1] < 1 ||
3332     workspace[1] > (int)((wscount - 2)/INTS_PER_STATEBLOCK))
3333       return PCRE2_ERROR_DFA_BADRESTART;
3334   }
3335 
3336 /* Set some local values */
3337 
3338 utf = (re->overall_options & PCRE2_UTF) != 0;
3339 start_match = subject + start_offset;
3340 end_subject = subject + length;
3341 req_cu_ptr = start_match - 1;
3342 anchored = (options & (PCRE2_ANCHORED|PCRE2_DFA_RESTART)) != 0 ||
3343   (re->overall_options & PCRE2_ANCHORED) != 0;
3344 
3345 /* The "must be at the start of a line" flags are used in a loop when finding
3346 where to start. */
3347 
3348 startline = (re->flags & PCRE2_STARTLINE) != 0;
3349 firstline = (re->overall_options & PCRE2_FIRSTLINE) != 0;
3350 bumpalong_limit = end_subject;
3351 
3352 /* Initialize and set up the fixed fields in the callout block, with a pointer
3353 in the match block. */
3354 
3355 mb->cb = &cb;
3356 cb.version = 2;
3357 cb.subject = subject;
3358 cb.subject_length = (PCRE2_SIZE)(end_subject - subject);
3359 cb.callout_flags = 0;
3360 cb.capture_top      = 1;      /* No capture support */
3361 cb.capture_last     = 0;
3362 cb.mark             = NULL;   /* No (*MARK) support */
3363 
3364 /* Get data from the match context, if present, and fill in the remaining
3365 fields in the match block. It is an error to set an offset limit without
3366 setting the flag at compile time. */
3367 
3368 if (mcontext == NULL)
3369   {
3370   mb->callout = NULL;
3371   mb->memctl = re->memctl;
3372   mb->match_limit = PRIV(default_match_context).match_limit;
3373   mb->match_limit_depth = PRIV(default_match_context).depth_limit;
3374   mb->heap_limit = PRIV(default_match_context).heap_limit;
3375   }
3376 else
3377   {
3378   if (mcontext->offset_limit != PCRE2_UNSET)
3379     {
3380     if ((re->overall_options & PCRE2_USE_OFFSET_LIMIT) == 0)
3381       return PCRE2_ERROR_BADOFFSETLIMIT;
3382     bumpalong_limit = subject + mcontext->offset_limit;
3383     }
3384   mb->callout = mcontext->callout;
3385   mb->callout_data = mcontext->callout_data;
3386   mb->memctl = mcontext->memctl;
3387   mb->match_limit = mcontext->match_limit;
3388   mb->match_limit_depth = mcontext->depth_limit;
3389   mb->heap_limit = mcontext->heap_limit;
3390   }
3391 
3392 if (mb->match_limit > re->limit_match)
3393   mb->match_limit = re->limit_match;
3394 
3395 if (mb->match_limit_depth > re->limit_depth)
3396   mb->match_limit_depth = re->limit_depth;
3397 
3398 if (mb->heap_limit > re->limit_heap)
3399   mb->heap_limit = re->limit_heap;
3400 
3401 mb->start_code = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)) +
3402   re->name_count * re->name_entry_size;
3403 mb->tables = re->tables;
3404 mb->start_subject = subject;
3405 mb->end_subject = end_subject;
3406 mb->start_offset = start_offset;
3407 mb->moptions = options;
3408 mb->poptions = re->overall_options;
3409 mb->match_call_count = 0;
3410 mb->heap_used = 0;
3411 
3412 /* Process the \R and newline settings. */
3413 
3414 mb->bsr_convention = re->bsr_convention;
3415 mb->nltype = NLTYPE_FIXED;
3416 switch(re->newline_convention)
3417   {
3418   case PCRE2_NEWLINE_CR:
3419   mb->nllen = 1;
3420   mb->nl[0] = CHAR_CR;
3421   break;
3422 
3423   case PCRE2_NEWLINE_LF:
3424   mb->nllen = 1;
3425   mb->nl[0] = CHAR_NL;
3426   break;
3427 
3428   case PCRE2_NEWLINE_NUL:
3429   mb->nllen = 1;
3430   mb->nl[0] = CHAR_NUL;
3431   break;
3432 
3433   case PCRE2_NEWLINE_CRLF:
3434   mb->nllen = 2;
3435   mb->nl[0] = CHAR_CR;
3436   mb->nl[1] = CHAR_NL;
3437   break;
3438 
3439   case PCRE2_NEWLINE_ANY:
3440   mb->nltype = NLTYPE_ANY;
3441   break;
3442 
3443   case PCRE2_NEWLINE_ANYCRLF:
3444   mb->nltype = NLTYPE_ANYCRLF;
3445   break;
3446 
3447   default: return PCRE2_ERROR_INTERNAL;
3448   }
3449 
3450 /* Check a UTF string for validity if required. For 8-bit and 16-bit strings,
3451 we must also check that a starting offset does not point into the middle of a
3452 multiunit character. We check only the portion of the subject that is going to
3453 be inspected during matching - from the offset minus the maximum back reference
3454 to the given length. This saves time when a small part of a large subject is
3455 being matched by the use of a starting offset. Note that the maximum lookbehind
3456 is a number of characters, not code units. */
3457 
3458 #ifdef SUPPORT_UNICODE
3459 if (utf && (options & PCRE2_NO_UTF_CHECK) == 0)
3460   {
3461   PCRE2_SPTR check_subject = start_match;  /* start_match includes offset */
3462 
3463   if (start_offset > 0)
3464     {
3465 #if PCRE2_CODE_UNIT_WIDTH != 32
3466     unsigned int i;
3467     if (start_match < end_subject && NOT_FIRSTCU(*start_match))
3468       return PCRE2_ERROR_BADUTFOFFSET;
3469     for (i = re->max_lookbehind; i > 0 && check_subject > subject; i--)
3470       {
3471       check_subject--;
3472       while (check_subject > subject &&
3473 #if PCRE2_CODE_UNIT_WIDTH == 8
3474       (*check_subject & 0xc0) == 0x80)
3475 #else  /* 16-bit */
3476       (*check_subject & 0xfc00) == 0xdc00)
3477 #endif /* PCRE2_CODE_UNIT_WIDTH == 8 */
3478         check_subject--;
3479       }
3480 #else   /* In the 32-bit library, one code unit equals one character. */
3481     check_subject -= re->max_lookbehind;
3482     if (check_subject < subject) check_subject = subject;
3483 #endif  /* PCRE2_CODE_UNIT_WIDTH != 32 */
3484     }
3485 
3486   /* Validate the relevant portion of the subject. After an error, adjust the
3487   offset to be an absolute offset in the whole string. */
3488 
3489   match_data->rc = PRIV(valid_utf)(check_subject,
3490     length - (PCRE2_SIZE)(check_subject - subject), &(match_data->startchar));
3491   if (match_data->rc != 0)
3492     {
3493     match_data->startchar += (PCRE2_SIZE)(check_subject - subject);
3494     return match_data->rc;
3495     }
3496   }
3497 #endif  /* SUPPORT_UNICODE */
3498 
3499 /* Set up the first code unit to match, if available. If there's no first code
3500 unit there may be a bitmap of possible first characters. */
3501 
3502 if ((re->flags & PCRE2_FIRSTSET) != 0)
3503   {
3504   has_first_cu = TRUE;
3505   first_cu = first_cu2 = (PCRE2_UCHAR)(re->first_codeunit);
3506   if ((re->flags & PCRE2_FIRSTCASELESS) != 0)
3507     {
3508     first_cu2 = TABLE_GET(first_cu, mb->tables + fcc_offset, first_cu);
3509 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
3510     if (utf && first_cu > 127)
3511       first_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(first_cu);
3512 #endif
3513     }
3514   }
3515 else
3516   if (!startline && (re->flags & PCRE2_FIRSTMAPSET) != 0)
3517     start_bits = re->start_bitmap;
3518 
3519 /* There may be a "last known required code unit" set. */
3520 
3521 if ((re->flags & PCRE2_LASTSET) != 0)
3522   {
3523   has_req_cu = TRUE;
3524   req_cu = req_cu2 = (PCRE2_UCHAR)(re->last_codeunit);
3525   if ((re->flags & PCRE2_LASTCASELESS) != 0)
3526     {
3527     req_cu2 = TABLE_GET(req_cu, mb->tables + fcc_offset, req_cu);
3528 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
3529     if (utf && req_cu > 127) req_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(req_cu);
3530 #endif
3531     }
3532   }
3533 
3534 /* If the match data block was previously used with PCRE2_COPY_MATCHED_SUBJECT,
3535 free the memory that was obtained. */
3536 
3537 if ((match_data->flags & PCRE2_MD_COPIED_SUBJECT) != 0)
3538   {
3539   match_data->memctl.free((void *)match_data->subject,
3540     match_data->memctl.memory_data);
3541   match_data->flags &= ~PCRE2_MD_COPIED_SUBJECT;
3542   }
3543 
3544 /* Fill in fields that are always returned in the match data. */
3545 
3546 match_data->code = re;
3547 match_data->subject = NULL;  /* Default for no match */
3548 match_data->mark = NULL;
3549 match_data->matchedby = PCRE2_MATCHEDBY_DFA_INTERPRETER;
3550 
3551 /* Call the main matching function, looping for a non-anchored regex after a
3552 failed match. If not restarting, perform certain optimizations at the start of
3553 a match. */
3554 
3555 for (;;)
3556   {
3557   /* ----------------- Start of match optimizations ---------------- */
3558 
3559   /* There are some optimizations that avoid running the match if a known
3560   starting point is not found, or if a known later code unit is not present.
3561   However, there is an option (settable at compile time) that disables
3562   these, for testing and for ensuring that all callouts do actually occur.
3563   The optimizations must also be avoided when restarting a DFA match. */
3564 
3565   if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0 &&
3566       (options & PCRE2_DFA_RESTART) == 0)
3567     {
3568     /* If firstline is TRUE, the start of the match is constrained to the first
3569     line of a multiline string. That is, the match must be before or at the
3570     first newline following the start of matching. Temporarily adjust
3571     end_subject so that we stop the optimization scans for a first code unit
3572     immediately after the first character of a newline (the first code unit can
3573     legitimately be a newline). If the match fails at the newline, later code
3574     breaks this loop. */
3575 
3576     if (firstline)
3577       {
3578       PCRE2_SPTR t = start_match;
3579 #ifdef SUPPORT_UNICODE
3580       if (utf)
3581         {
3582         while (t < end_subject && !IS_NEWLINE(t))
3583           {
3584           t++;
3585           ACROSSCHAR(t < end_subject, t, t++);
3586           }
3587         }
3588       else
3589 #endif
3590       while (t < end_subject && !IS_NEWLINE(t)) t++;
3591       end_subject = t;
3592       }
3593 
3594     /* Anchored: check the first code unit if one is recorded. This may seem
3595     pointless but it can help in detecting a no match case without scanning for
3596     the required code unit. */
3597 
3598     if (anchored)
3599       {
3600       if (has_first_cu || start_bits != NULL)
3601         {
3602         BOOL ok = start_match < end_subject;
3603         if (ok)
3604           {
3605           PCRE2_UCHAR c = UCHAR21TEST(start_match);
3606           ok = has_first_cu && (c == first_cu || c == first_cu2);
3607           if (!ok && start_bits != NULL)
3608             {
3609 #if PCRE2_CODE_UNIT_WIDTH != 8
3610             if (c > 255) c = 255;
3611 #endif
3612             ok = (start_bits[c/8] & (1u << (c&7))) != 0;
3613             }
3614           }
3615         if (!ok) break;
3616         }
3617       }
3618 
3619     /* Not anchored. Advance to a unique first code unit if there is one. In
3620     8-bit mode, the use of memchr() gives a big speed up, even though we have
3621     to call it twice in caseless mode, in order to find the earliest occurrence
3622     of the character in either of its cases. */
3623 
3624     else
3625       {
3626       if (has_first_cu)
3627         {
3628         if (first_cu != first_cu2)  /* Caseless */
3629           {
3630 #if PCRE2_CODE_UNIT_WIDTH != 8
3631           PCRE2_UCHAR smc;
3632           while (start_match < end_subject &&
3633                 (smc = UCHAR21TEST(start_match)) != first_cu &&
3634                   smc != first_cu2)
3635             start_match++;
3636 #else  /* 8-bit code units */
3637           PCRE2_SPTR pp1 =
3638             memchr(start_match, first_cu, end_subject-start_match);
3639           PCRE2_SPTR pp2 =
3640             memchr(start_match, first_cu2, end_subject-start_match);
3641           if (pp1 == NULL)
3642             start_match = (pp2 == NULL)? end_subject : pp2;
3643           else
3644             start_match = (pp2 == NULL || pp1 < pp2)? pp1 : pp2;
3645 #endif
3646           }
3647 
3648         /* The caseful case */
3649 
3650         else
3651           {
3652 #if PCRE2_CODE_UNIT_WIDTH != 8
3653           while (start_match < end_subject && UCHAR21TEST(start_match) !=
3654                  first_cu)
3655             start_match++;
3656 #else
3657           start_match = memchr(start_match, first_cu, end_subject - start_match);
3658           if (start_match == NULL) start_match = end_subject;
3659 #endif
3660           }
3661 
3662         /* If we can't find the required code unit, having reached the true end
3663         of the subject, break the bumpalong loop, to force a match failure,
3664         except when doing partial matching, when we let the next cycle run at
3665         the end of the subject. To see why, consider the pattern /(?<=abc)def/,
3666         which partially matches "abc", even though the string does not contain
3667         the starting character "d". If we have not reached the true end of the
3668         subject (PCRE2_FIRSTLINE caused end_subject to be temporarily modified)
3669         we also let the cycle run, because the matching string is legitimately
3670         allowed to start with the first code unit of a newline. */
3671 
3672         if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0 &&
3673             start_match >= mb->end_subject)
3674           break;
3675         }
3676 
3677       /* If there's no first code unit, advance to just after a linebreak for a
3678       multiline match if required. */
3679 
3680       else if (startline)
3681         {
3682         if (start_match > mb->start_subject + start_offset)
3683           {
3684 #ifdef SUPPORT_UNICODE
3685           if (utf)
3686             {
3687             while (start_match < end_subject && !WAS_NEWLINE(start_match))
3688               {
3689               start_match++;
3690               ACROSSCHAR(start_match < end_subject, start_match, start_match++);
3691               }
3692             }
3693           else
3694 #endif
3695           while (start_match < end_subject && !WAS_NEWLINE(start_match))
3696             start_match++;
3697 
3698           /* If we have just passed a CR and the newline option is ANY or
3699           ANYCRLF, and we are now at a LF, advance the match position by one
3700           more code unit. */
3701 
3702           if (start_match[-1] == CHAR_CR &&
3703                (mb->nltype == NLTYPE_ANY || mb->nltype == NLTYPE_ANYCRLF) &&
3704                start_match < end_subject &&
3705                UCHAR21TEST(start_match) == CHAR_NL)
3706             start_match++;
3707           }
3708         }
3709 
3710       /* If there's no first code unit or a requirement for a multiline line
3711       start, advance to a non-unique first code unit if any have been
3712       identified. The bitmap contains only 256 bits. When code units are 16 or
3713       32 bits wide, all code units greater than 254 set the 255 bit. */
3714 
3715       else if (start_bits != NULL)
3716         {
3717         while (start_match < end_subject)
3718           {
3719           uint32_t c = UCHAR21TEST(start_match);
3720 #if PCRE2_CODE_UNIT_WIDTH != 8
3721           if (c > 255) c = 255;
3722 #endif
3723           if ((start_bits[c/8] & (1u << (c&7))) != 0) break;
3724           start_match++;
3725           }
3726 
3727         /* See comment above in first_cu checking about the next line. */
3728 
3729         if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0 &&
3730             start_match >= mb->end_subject)
3731           break;
3732         }
3733       }  /* End of first code unit handling */
3734 
3735     /* Restore fudged end_subject */
3736 
3737     end_subject = mb->end_subject;
3738 
3739     /* The following two optimizations are disabled for partial matching. */
3740 
3741     if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0)
3742       {
3743       /* The minimum matching length is a lower bound; no actual string of that
3744       length may actually match the pattern. Although the value is, strictly,
3745       in characters, we treat it as code units to avoid spending too much time
3746       in this optimization. */
3747 
3748       if (end_subject - start_match < re->minlength) goto NOMATCH_EXIT;
3749 
3750       /* If req_cu is set, we know that that code unit must appear in the
3751       subject for the match to succeed. If the first code unit is set, req_cu
3752       must be later in the subject; otherwise the test starts at the match
3753       point. This optimization can save a huge amount of backtracking in
3754       patterns with nested unlimited repeats that aren't going to match.
3755       Writing separate code for cased/caseless versions makes it go faster, as
3756       does using an autoincrement and backing off on a match.
3757 
3758       HOWEVER: when the subject string is very, very long, searching to its end
3759       can take a long time, and give bad performance on quite ordinary
3760       patterns. This showed up when somebody was matching something like
3761       /^\d+C/ on a 32-megabyte string... so we don't do this when the string is
3762       sufficiently long. */
3763 
3764       if (has_req_cu && end_subject - start_match < REQ_CU_MAX)
3765         {
3766         PCRE2_SPTR p = start_match + (has_first_cu? 1:0);
3767 
3768         /* We don't need to repeat the search if we haven't yet reached the
3769         place we found it at last time. */
3770 
3771         if (p > req_cu_ptr)
3772           {
3773           if (req_cu != req_cu2)
3774             {
3775             while (p < end_subject)
3776               {
3777               uint32_t pp = UCHAR21INCTEST(p);
3778               if (pp == req_cu || pp == req_cu2) { p--; break; }
3779               }
3780             }
3781           else
3782             {
3783             while (p < end_subject)
3784               {
3785               if (UCHAR21INCTEST(p) == req_cu) { p--; break; }
3786               }
3787             }
3788 
3789           /* If we can't find the required code unit, break the matching loop,
3790           forcing a match failure. */
3791 
3792           if (p >= end_subject) break;
3793 
3794           /* If we have found the required code unit, save the point where we
3795           found it, so that we don't search again next time round the loop if
3796           the start hasn't passed this code unit yet. */
3797 
3798           req_cu_ptr = p;
3799           }
3800         }
3801       }
3802     }
3803 
3804   /* ------------ End of start of match optimizations ------------ */
3805 
3806   /* Give no match if we have passed the bumpalong limit. */
3807 
3808   if (start_match > bumpalong_limit) break;
3809 
3810   /* OK, now we can do the business */
3811 
3812   mb->start_used_ptr = start_match;
3813   mb->last_used_ptr = start_match;
3814   mb->recursive = NULL;
3815 
3816   rc = internal_dfa_match(
3817     mb,                           /* fixed match data */
3818     mb->start_code,               /* this subexpression's code */
3819     start_match,                  /* where we currently are */
3820     start_offset,                 /* start offset in subject */
3821     match_data->ovector,          /* offset vector */
3822     (uint32_t)match_data->oveccount * 2,  /* actual size of same */
3823     workspace,                    /* workspace vector */
3824     (int)wscount,                 /* size of same */
3825     0,                            /* function recurse level */
3826     base_recursion_workspace);    /* initial workspace for recursion */
3827 
3828   /* Anything other than "no match" means we are done, always; otherwise, carry
3829   on only if not anchored. */
3830 
3831   if (rc != PCRE2_ERROR_NOMATCH || anchored)
3832     {
3833     if (rc == PCRE2_ERROR_PARTIAL && match_data->oveccount > 0)
3834       {
3835       match_data->ovector[0] = (PCRE2_SIZE)(start_match - subject);
3836       match_data->ovector[1] = (PCRE2_SIZE)(end_subject - subject);
3837       }
3838     match_data->leftchar = (PCRE2_SIZE)(mb->start_used_ptr - subject);
3839     match_data->rightchar = (PCRE2_SIZE)( mb->last_used_ptr - subject);
3840     match_data->startchar = (PCRE2_SIZE)(start_match - subject);
3841     match_data->rc = rc;
3842 
3843     if (rc >= 0 &&(options & PCRE2_COPY_MATCHED_SUBJECT) != 0)
3844       {
3845       length = CU2BYTES(length + was_zero_terminated);
3846       match_data->subject = match_data->memctl.malloc(length,
3847         match_data->memctl.memory_data);
3848       if (match_data->subject == NULL) return PCRE2_ERROR_NOMEMORY;
3849       memcpy((void *)match_data->subject, subject, length);
3850       match_data->flags |= PCRE2_MD_COPIED_SUBJECT;
3851       }
3852     else
3853       {
3854       if (rc >= 0 || rc == PCRE2_ERROR_PARTIAL) match_data->subject = subject;
3855       }
3856     goto EXIT;
3857     }
3858 
3859   /* Advance to the next subject character unless we are at the end of a line
3860   and firstline is set. */
3861 
3862   if (firstline && IS_NEWLINE(start_match)) break;
3863   start_match++;
3864 #ifdef SUPPORT_UNICODE
3865   if (utf)
3866     {
3867     ACROSSCHAR(start_match < end_subject, start_match, start_match++);
3868     }
3869 #endif
3870   if (start_match > end_subject) break;
3871 
3872   /* If we have just passed a CR and we are now at a LF, and the pattern does
3873   not contain any explicit matches for \r or \n, and the newline option is CRLF
3874   or ANY or ANYCRLF, advance the match position by one more character. */
3875 
3876   if (UCHAR21TEST(start_match - 1) == CHAR_CR &&
3877       start_match < end_subject &&
3878       UCHAR21TEST(start_match) == CHAR_NL &&
3879       (re->flags & PCRE2_HASCRORLF) == 0 &&
3880         (mb->nltype == NLTYPE_ANY ||
3881          mb->nltype == NLTYPE_ANYCRLF ||
3882          mb->nllen == 2))
3883     start_match++;
3884 
3885   }   /* "Bumpalong" loop */
3886 
3887 NOMATCH_EXIT:
3888 rc = PCRE2_ERROR_NOMATCH;
3889 
3890 EXIT:
3891 while (rws->next != NULL)
3892   {
3893   RWS_anchor *next = rws->next;
3894   rws->next = next->next;
3895   mb->memctl.free(next, mb->memctl.memory_data);
3896   }
3897 
3898 return rc;
3899 }
3900 
3901 /* End of pcre2_dfa_match.c */
3902