• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*************************************************
2 *      Perl-Compatible Regular Expressions       *
3 *************************************************/
4 
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7 
8                        Written by Philip Hazel
9      Original API code Copyright (c) 1997-2012 University of Cambridge
10           New API code Copyright (c) 2016-2018 University of Cambridge
11 
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
15 
16     * Redistributions of source code must retain the above copyright notice,
17       this list of conditions and the following disclaimer.
18 
19     * Redistributions in binary form must reproduce the above copyright
20       notice, this list of conditions and the following disclaimer in the
21       documentation and/or other materials provided with the distribution.
22 
23     * Neither the name of the University of Cambridge nor the names of its
24       contributors may be used to endorse or promote products derived from
25       this software without specific prior written permission.
26 
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
39 */
40 
41 
42 /* This module contains the external function pcre2_dfa_match(), which is an
43 alternative matching function that uses a sort of DFA algorithm (not a true
44 FSM). This is NOT Perl-compatible, but it has advantages in certain
45 applications. */
46 
47 
48 /* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
49 the performance of his patterns greatly. I could not use it as it stood, as it
50 was not thread safe, and made assumptions about pattern sizes. Also, it caused
51 test 7 to loop, and test 9 to crash with a segfault.
52 
53 The issue is the check for duplicate states, which is done by a simple linear
54 search up the state list. (Grep for "duplicate" below to find the code.) For
55 many patterns, there will never be many states active at one time, so a simple
56 linear search is fine. In patterns that have many active states, it might be a
57 bottleneck. The suggested code used an indexing scheme to remember which states
58 had previously been used for each character, and avoided the linear search when
59 it knew there was no chance of a duplicate. This was implemented when adding
60 states to the state lists.
61 
62 I wrote some thread-safe, not-limited code to try something similar at the time
63 of checking for duplicates (instead of when adding states), using index vectors
64 on the stack. It did give a 13% improvement with one specially constructed
65 pattern for certain subject strings, but on other strings and on many of the
66 simpler patterns in the test suite it did worse. The major problem, I think,
67 was the extra time to initialize the index. This had to be done for each call
68 of internal_dfa_match(). (The supplied patch used a static vector, initialized
69 only once - I suspect this was the cause of the problems with the tests.)
70 
71 Overall, I concluded that the gains in some cases did not outweigh the losses
72 in others, so I abandoned this code. */
73 
74 
75 #ifdef HAVE_CONFIG_H
76 #include "config.h"
77 #endif
78 
79 #define NLBLOCK mb             /* Block containing newline information */
80 #define PSSTART start_subject  /* Field containing processed string start */
81 #define PSEND   end_subject    /* Field containing processed string end */
82 
83 #include "pcre2_internal.h"
84 
85 #define PUBLIC_DFA_MATCH_OPTIONS \
86   (PCRE2_ANCHORED|PCRE2_ENDANCHORED|PCRE2_NOTBOL|PCRE2_NOTEOL|PCRE2_NOTEMPTY| \
87    PCRE2_NOTEMPTY_ATSTART|PCRE2_NO_UTF_CHECK|PCRE2_PARTIAL_HARD| \
88    PCRE2_PARTIAL_SOFT|PCRE2_DFA_SHORTEST|PCRE2_DFA_RESTART)
89 
90 
91 /*************************************************
92 *      Code parameters and static tables         *
93 *************************************************/
94 
95 /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
96 into others, under special conditions. A gap of 20 between the blocks should be
97 enough. The resulting opcodes don't have to be less than 256 because they are
98 never stored, so we push them well clear of the normal opcodes. */
99 
100 #define OP_PROP_EXTRA       300
101 #define OP_EXTUNI_EXTRA     320
102 #define OP_ANYNL_EXTRA      340
103 #define OP_HSPACE_EXTRA     360
104 #define OP_VSPACE_EXTRA     380
105 
106 
107 /* This table identifies those opcodes that are followed immediately by a
108 character that is to be tested in some way. This makes it possible to
109 centralize the loading of these characters. In the case of Type * etc, the
110 "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
111 small value. Non-zero values in the table are the offsets from the opcode where
112 the character is to be found. ***NOTE*** If the start of this table is
113 modified, the three tables that follow must also be modified. */
114 
115 static const uint8_t coptable[] = {
116   0,                             /* End                                    */
117   0, 0, 0, 0, 0,                 /* \A, \G, \K, \B, \b                     */
118   0, 0, 0, 0, 0, 0,              /* \D, \d, \S, \s, \W, \w                 */
119   0, 0, 0,                       /* Any, AllAny, Anybyte                   */
120   0, 0,                          /* \P, \p                                 */
121   0, 0, 0, 0, 0,                 /* \R, \H, \h, \V, \v                     */
122   0,                             /* \X                                     */
123   0, 0, 0, 0, 0, 0,              /* \Z, \z, $, $M, ^, ^M                   */
124   1,                             /* Char                                   */
125   1,                             /* Chari                                  */
126   1,                             /* not                                    */
127   1,                             /* noti                                   */
128   /* Positive single-char repeats                                          */
129   1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
130   1+IMM2_SIZE, 1+IMM2_SIZE,      /* upto, minupto                          */
131   1+IMM2_SIZE,                   /* exact                                  */
132   1, 1, 1, 1+IMM2_SIZE,          /* *+, ++, ?+, upto+                      */
133   1, 1, 1, 1, 1, 1,              /* *I, *?I, +I, +?I, ?I, ??I              */
134   1+IMM2_SIZE, 1+IMM2_SIZE,      /* upto I, minupto I                      */
135   1+IMM2_SIZE,                   /* exact I                                */
136   1, 1, 1, 1+IMM2_SIZE,          /* *+I, ++I, ?+I, upto+I                  */
137   /* Negative single-char repeats - only for chars < 256                   */
138   1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
139   1+IMM2_SIZE, 1+IMM2_SIZE,      /* NOT upto, minupto                      */
140   1+IMM2_SIZE,                   /* NOT exact                              */
141   1, 1, 1, 1+IMM2_SIZE,          /* NOT *+, ++, ?+, upto+                  */
142   1, 1, 1, 1, 1, 1,              /* NOT *I, *?I, +I, +?I, ?I, ??I          */
143   1+IMM2_SIZE, 1+IMM2_SIZE,      /* NOT upto I, minupto I                  */
144   1+IMM2_SIZE,                   /* NOT exact I                            */
145   1, 1, 1, 1+IMM2_SIZE,          /* NOT *+I, ++I, ?+I, upto+I              */
146   /* Positive type repeats                                                 */
147   1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
148   1+IMM2_SIZE, 1+IMM2_SIZE,      /* Type upto, minupto                     */
149   1+IMM2_SIZE,                   /* Type exact                             */
150   1, 1, 1, 1+IMM2_SIZE,          /* Type *+, ++, ?+, upto+                 */
151   /* Character class & ref repeats                                         */
152   0, 0, 0, 0, 0, 0,              /* *, *?, +, +?, ?, ??                    */
153   0, 0,                          /* CRRANGE, CRMINRANGE                    */
154   0, 0, 0, 0,                    /* Possessive *+, ++, ?+, CRPOSRANGE      */
155   0,                             /* CLASS                                  */
156   0,                             /* NCLASS                                 */
157   0,                             /* XCLASS - variable length               */
158   0,                             /* REF                                    */
159   0,                             /* REFI                                   */
160   0,                             /* DNREF                                  */
161   0,                             /* DNREFI                                 */
162   0,                             /* RECURSE                                */
163   0,                             /* CALLOUT                                */
164   0,                             /* CALLOUT_STR                            */
165   0,                             /* Alt                                    */
166   0,                             /* Ket                                    */
167   0,                             /* KetRmax                                */
168   0,                             /* KetRmin                                */
169   0,                             /* KetRpos                                */
170   0,                             /* Reverse                                */
171   0,                             /* Assert                                 */
172   0,                             /* Assert not                             */
173   0,                             /* Assert behind                          */
174   0,                             /* Assert behind not                      */
175   0,                             /* ONCE                                   */
176   0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND       */
177   0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND  */
178   0, 0,                          /* CREF, DNCREF                           */
179   0, 0,                          /* RREF, DNRREF                           */
180   0, 0,                          /* FALSE, TRUE                            */
181   0, 0, 0,                       /* BRAZERO, BRAMINZERO, BRAPOSZERO        */
182   0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG                 */
183   0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG         */
184   0, 0,                          /* COMMIT, COMMIT_ARG                     */
185   0, 0, 0,                       /* FAIL, ACCEPT, ASSERT_ACCEPT            */
186   0, 0, 0                        /* CLOSE, SKIPZERO, DEFINE                */
187 };
188 
189 /* This table identifies those opcodes that inspect a character. It is used to
190 remember the fact that a character could have been inspected when the end of
191 the subject is reached. ***NOTE*** If the start of this table is modified, the
192 two tables that follow must also be modified. */
193 
static const uint8_t poptable[] = {
  /* Anchors, assertions and single-item escapes */
  0, 0, 0, 0,                    /* End, \A, \G, \K                        */
  1, 1,                          /* \B, \b                                 */
  1, 1, 1, 1, 1, 1,              /* \D, \d, \S, \s, \W, \w                 */
  1, 1, 1, 1, 1,                 /* Any, AllAny, Anybyte, \P, \p           */
  1, 1, 1, 1, 1, 1,              /* \R, \H, \h, \V, \v, \X                 */
  0, 0, 0, 0, 0, 0,              /* \Z, \z, $, $M, ^, ^M                   */
  1, 1, 1, 1,                    /* Char, Chari, not, noti                 */

  /* Single-character and type repeats. Each row holds the 13 opcodes of one
  family: *, *?, +, +?, ?, ??, upto, minupto, exact, *+, ++, ?+, upto+ */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* Positive single-char repeats */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* The same, I forms            */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* Negative (chars < 256 only)  */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* The same, I forms            */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* Positive type repeats        */

  /* Character class & ref repeats, followed by the class opcodes */
  1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
  1, 1, 1, 1, 1, 1,              /* CRRANGE, CRMINRANGE, *+, ++, ?+,
                                    CRPOSRANGE                             */
  1, 1, 1,                       /* CLASS, NCLASS, XCLASS (variable len)   */

  /* None of the remaining opcodes is marked as inspecting a character */
  0, 0, 0, 0,                    /* REF, REFI, DNREF, DNREFI               */
  0, 0, 0,                       /* RECURSE, CALLOUT, CALLOUT_STR          */
  0, 0, 0, 0, 0,                 /* Alt, Ket, KetRmax, KetRmin, KetRpos    */
  0, 0, 0, 0, 0, 0,              /* Reverse, Assert, Assert not,
                                    Assert behind, Assert behind not, ONCE */
  0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND       */
  0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND  */
  0, 0, 0, 0,                    /* CREF, DNCREF, RREF, DNRREF             */
  0, 0,                          /* FALSE, TRUE                            */
  0, 0, 0,                       /* BRAZERO, BRAMINZERO, BRAPOSZERO        */
  0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG                 */
  0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG         */
  0, 0,                          /* COMMIT, COMMIT_ARG                     */
  0, 0, 0,                       /* FAIL, ACCEPT, ASSERT_ACCEPT            */
  0, 0, 0                        /* CLOSE, SKIPZERO, DEFINE                */
};
262 
263 /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
264 and \w */
265 
266 static const uint8_t toptable1[] = {
267   0, 0, 0, 0, 0, 0,
268   ctype_digit, ctype_digit,
269   ctype_space, ctype_space,
270   ctype_word,  ctype_word,
271   0, 0                            /* OP_ANY, OP_ALLANY */
272 };
273 
274 static const uint8_t toptable2[] = {
275   0, 0, 0, 0, 0, 0,
276   ctype_digit, 0,
277   ctype_space, 0,
278   ctype_word,  0,
279   1, 1                            /* OP_ANY, OP_ALLANY */
280 };
281 
282 
283 /* Structure for holding data about a particular state, which is in effect the
284 current data for an active path through the match tree. It must consist
285 entirely of ints because the working vector we are passed, and which we put
286 these structures in, is a vector of ints. */
287 
typedef struct stateblock {
  int offset;     /* Where the state's opcode is, as an offset; negative
                     values carry a special meaning */
  int count;      /* Repeat counter */
  int data;       /* Extra data, needed only by some states */
} stateblock;

/* Number of ints taken up by one state block. */

#define INTS_PER_STATEBLOCK  ((int)(sizeof(stateblock)/sizeof(int)))
295 
296 
297 /* Before version 10.32 the recursive calls of internal_dfa_match() were passed
298 local working space and output vectors that were created on the stack. This has
299 caused issues for some patterns, especially in small-stack environments such as
300 Windows. A new scheme is now in use which sets up a vector on the stack, but if
301 this is too small, heap memory is used, up to the heap_limit. The main
302 parameters are all numbers of ints because the workspace is a vector of ints.
303 
304 The size of the starting stack vector, DFA_START_RWS_SIZE, is in bytes, and is
305 defined in pcre2_internal.h so as to be available to pcre2test when it is
306 finding the minimum heap requirement for a match. */
307 
308 #define OVEC_UNIT  (sizeof(PCRE2_SIZE)/sizeof(int))
309 
310 #define RWS_BASE_SIZE   (DFA_START_RWS_SIZE/sizeof(int))  /* Stack vector */
311 #define RWS_RSIZE       1000                    /* Work size for recursion */
312 #define RWS_OVEC_RSIZE  (1000*OVEC_UNIT)        /* Ovector for recursion */
313 #define RWS_OVEC_OSIZE  (2*OVEC_UNIT)           /* Ovector in other cases */
314 
315 /* This structure is at the start of each workspace block. */
316 
typedef struct RWS_anchor {
  struct RWS_anchor *next;   /* Next workspace block in the chain, or NULL */
  unsigned int size;         /* Size of this block, as a number of ints */
  unsigned int free;         /* Ints currently available, as a number of ints */
} RWS_anchor;

/* Number of ints taken up by the anchor itself. */

#define RWS_ANCHOR_SIZE (sizeof(RWS_anchor)/sizeof(int))
324 
325 
326 
327 /*************************************************
328 *               Process a callout                *
329 *************************************************/
330 
331 /* This function is called to perform a callout.
332 
333 Arguments:
334   code              current code pointer
335   offsets           points to current capture offsets
336   current_subject   start of current subject match
337   ptr               current position in subject
338   mb                the match block
339   extracode         extra code offset when called from condition
340   lengthptr         where to return the callout length
341 
342 Returns:            the return from the callout
343 */
344 
345 static int
do_callout(PCRE2_SPTR code,PCRE2_SIZE * offsets,PCRE2_SPTR current_subject,PCRE2_SPTR ptr,dfa_match_block * mb,PCRE2_SIZE extracode,PCRE2_SIZE * lengthptr)346 do_callout(PCRE2_SPTR code, PCRE2_SIZE *offsets, PCRE2_SPTR current_subject,
347   PCRE2_SPTR ptr, dfa_match_block *mb, PCRE2_SIZE extracode,
348   PCRE2_SIZE *lengthptr)
349 {
350 pcre2_callout_block *cb = mb->cb;
351 
352 *lengthptr = (code[extracode] == OP_CALLOUT)?
353   (PCRE2_SIZE)PRIV(OP_lengths)[OP_CALLOUT] :
354   (PCRE2_SIZE)GET(code, 1 + 2*LINK_SIZE + extracode);
355 
356 if (mb->callout == NULL) return 0;    /* No callout provided */
357 
358 /* Fixed fields in the callout block are set once and for all at the start of
359 matching. */
360 
361 cb->offset_vector    = offsets;
362 cb->start_match      = (PCRE2_SIZE)(current_subject - mb->start_subject);
363 cb->current_position = (PCRE2_SIZE)(ptr - mb->start_subject);
364 cb->pattern_position = GET(code, 1 + extracode);
365 cb->next_item_length = GET(code, 1 + LINK_SIZE + extracode);
366 
367 if (code[extracode] == OP_CALLOUT)
368   {
369   cb->callout_number = code[1 + 2*LINK_SIZE + extracode];
370   cb->callout_string_offset = 0;
371   cb->callout_string = NULL;
372   cb->callout_string_length = 0;
373   }
374 else
375   {
376   cb->callout_number = 0;
377   cb->callout_string_offset = GET(code, 1 + 3*LINK_SIZE + extracode);
378   cb->callout_string = code + (1 + 4*LINK_SIZE + extracode) + 1;
379   cb->callout_string_length = *lengthptr - (1 + 4*LINK_SIZE) - 2;
380   }
381 
382 return (mb->callout)(cb, mb->callout_data);
383 }
384 
385 
386 
387 /*************************************************
388 *         Expand local workspace memory          *
389 *************************************************/
390 
391 /* This function is called when internal_dfa_match() is about to be called
392 recursively and there is insufficient working space left in the current
393 workspace block. If there's an existing next block, use it; otherwise get a new
394 block unless the heap limit is reached.
395 
396 Arguments:
397   rwsptr     pointer to block pointer (updated)
398   ovecsize   space needed for an ovector
399   mb         the match block
400 
401 Returns:     0 rwsptr has been updated
402             !0 an error code
403 */
404 
405 static int
more_workspace(RWS_anchor ** rwsptr,unsigned int ovecsize,dfa_match_block * mb)406 more_workspace(RWS_anchor **rwsptr, unsigned int ovecsize, dfa_match_block *mb)
407 {
408 RWS_anchor *rws = *rwsptr;
409 RWS_anchor *new;
410 
411 if (rws->next != NULL)
412   {
413   new = rws->next;
414   }
415 
416 /* All sizes are in units of sizeof(int), except for mb->heaplimit, which is in
417 kibibytes. */
418 
419 else
420   {
421   unsigned int newsize = rws->size * 2;
422   unsigned int heapleft = (unsigned int)
423     (((1024/sizeof(int))*mb->heap_limit - mb->heap_used));
424   if (newsize > heapleft) newsize = heapleft;
425   if (newsize < RWS_RSIZE + ovecsize + RWS_ANCHOR_SIZE)
426     return PCRE2_ERROR_HEAPLIMIT;
427   new = mb->memctl.malloc(newsize*sizeof(int), mb->memctl.memory_data);
428   if (new == NULL) return PCRE2_ERROR_NOMEMORY;
429   mb->heap_used += newsize;
430   new->next = NULL;
431   new->size = newsize;
432   rws->next = new;
433   }
434 
435 new->free = new->size - RWS_ANCHOR_SIZE;
436 *rwsptr = new;
437 return 0;
438 }
439 
440 
441 
442 /*************************************************
443 *     Match a Regular Expression - DFA engine    *
444 *************************************************/
445 
446 /* This internal function applies a compiled pattern to a subject string,
447 starting at a given point, using a DFA engine. This function is called from the
448 external one, possibly multiple times if the pattern is not anchored. The
449 function calls itself recursively for some kinds of subpattern.
450 
451 Arguments:
452   mb                the match_data block with fixed information
453   this_start_code   the opening bracket of this subexpression's code
454   current_subject   where we currently are in the subject string
455   start_offset      start offset in the subject string
456   offsets           vector to contain the matching string offsets
457   offsetcount       size of same
458   workspace         vector of workspace
459   wscount           size of same
460   rlevel            function call recursion level
461 
462 Returns:            > 0 => number of match offset pairs placed in offsets
463                     = 0 => offsets overflowed; longest matches are present
464                      -1 => failed to match
465                    < -1 => some kind of unexpected problem
466 
467 The following macros are used for adding states to the two state vectors (one
468 for the current character, one for the following character). */
469 
/* Each of these appends one state to the active list (ADD_ACTIVE*) or to the
new list (ADD_NEW*). The list's counter is incremented in every case; if the
list was already full, the enclosing function returns PCRE2_ERROR_DFA_WSSIZE
without the arguments being evaluated. The _DATA forms also set the data
field. The macros use the local variables of internal_dfa_match() by name. */

#define ADD_ACTIVE(x,y) \
  do \
    { \
    if (active_count++ >= wscount) return PCRE2_ERROR_DFA_WSSIZE; \
    next_active_state->offset = (x); \
    next_active_state->count  = (y); \
    next_active_state++; \
    } \
  while (0)

#define ADD_ACTIVE_DATA(x,y,z) \
  do \
    { \
    if (active_count++ >= wscount) return PCRE2_ERROR_DFA_WSSIZE; \
    next_active_state->offset = (x); \
    next_active_state->count  = (y); \
    next_active_state->data   = (z); \
    next_active_state++; \
    } \
  while (0)

#define ADD_NEW(x,y) \
  do \
    { \
    if (new_count++ >= wscount) return PCRE2_ERROR_DFA_WSSIZE; \
    next_new_state->offset = (x); \
    next_new_state->count  = (y); \
    next_new_state++; \
    } \
  while (0)

#define ADD_NEW_DATA(x,y,z) \
  do \
    { \
    if (new_count++ >= wscount) return PCRE2_ERROR_DFA_WSSIZE; \
    next_new_state->offset = (x); \
    next_new_state->count  = (y); \
    next_new_state->data   = (z); \
    next_new_state++; \
    } \
  while (0)
507 
508 /* And now, here is the code */
509 
510 static int
internal_dfa_match(dfa_match_block * mb,PCRE2_SPTR this_start_code,PCRE2_SPTR current_subject,PCRE2_SIZE start_offset,PCRE2_SIZE * offsets,uint32_t offsetcount,int * workspace,int wscount,uint32_t rlevel,int * RWS)511 internal_dfa_match(
512   dfa_match_block *mb,
513   PCRE2_SPTR this_start_code,
514   PCRE2_SPTR current_subject,
515   PCRE2_SIZE start_offset,
516   PCRE2_SIZE *offsets,
517   uint32_t offsetcount,
518   int *workspace,
519   int wscount,
520   uint32_t rlevel,
521   int *RWS)
522 {
523 stateblock *active_states, *new_states, *temp_states;
524 stateblock *next_active_state, *next_new_state;
525 const uint8_t *ctypes, *lcc, *fcc;
526 PCRE2_SPTR ptr;
527 PCRE2_SPTR end_code;
528 dfa_recursion_info new_recursive;
529 int active_count, new_count, match_count;
530 
531 /* Some fields in the mb block are frequently referenced, so we load them into
532 independent variables in the hope that this will perform better. */
533 
534 PCRE2_SPTR start_subject = mb->start_subject;
535 PCRE2_SPTR end_subject = mb->end_subject;
536 PCRE2_SPTR start_code = mb->start_code;
537 
538 #ifdef SUPPORT_UNICODE
539 BOOL utf = (mb->poptions & PCRE2_UTF) != 0;
540 #else
541 BOOL utf = FALSE;
542 #endif
543 
544 BOOL reset_could_continue = FALSE;
545 
546 if (mb->match_call_count++ >= mb->match_limit) return PCRE2_ERROR_MATCHLIMIT;
547 if (rlevel++ > mb->match_limit_depth) return PCRE2_ERROR_DEPTHLIMIT;
548 offsetcount &= (uint32_t)(-2);  /* Round down */
549 
550 wscount -= 2;
551 wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
552           (2 * INTS_PER_STATEBLOCK);
553 
554 ctypes = mb->tables + ctypes_offset;
555 lcc = mb->tables + lcc_offset;
556 fcc = mb->tables + fcc_offset;
557 
558 match_count = PCRE2_ERROR_NOMATCH;   /* A negative number */
559 
560 active_states = (stateblock *)(workspace + 2);
561 next_new_state = new_states = active_states + wscount;
562 new_count = 0;
563 
564 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
565 the alternative states onto the list, and find out where the end is. This
566 makes it possible to use this function recursively, when we want to stop at a
567 matching internal ket rather than at the end.
568 
569 If we are dealing with a backward assertion we have to find out the maximum
570 amount to move back, and set up each alternative appropriately. */
571 
572 if (*this_start_code == OP_ASSERTBACK || *this_start_code == OP_ASSERTBACK_NOT)
573   {
574   size_t max_back = 0;
575   size_t gone_back;
576 
577   end_code = this_start_code;
578   do
579     {
580     size_t back = (size_t)GET(end_code, 2+LINK_SIZE);
581     if (back > max_back) max_back = back;
582     end_code += GET(end_code, 1);
583     }
584   while (*end_code == OP_ALT);
585 
586   /* If we can't go back the amount required for the longest lookbehind
587   pattern, go back as far as we can; some alternatives may still be viable. */
588 
589 #ifdef SUPPORT_UNICODE
590   /* In character mode we have to step back character by character */
591 
592   if (utf)
593     {
594     for (gone_back = 0; gone_back < max_back; gone_back++)
595       {
596       if (current_subject <= start_subject) break;
597       current_subject--;
598       ACROSSCHAR(current_subject > start_subject, current_subject,
599         current_subject--);
600       }
601     }
602   else
603 #endif
604 
605   /* In byte-mode we can do this quickly. */
606 
607     {
608     size_t current_offset = (size_t)(current_subject - start_subject);
609     gone_back = (current_offset < max_back)? current_offset : max_back;
610     current_subject -= gone_back;
611     }
612 
613   /* Save the earliest consulted character */
614 
615   if (current_subject < mb->start_used_ptr)
616     mb->start_used_ptr = current_subject;
617 
618   /* Now we can process the individual branches. There will be an OP_REVERSE at
619   the start of each branch, except when the length of the branch is zero. */
620 
621   end_code = this_start_code;
622   do
623     {
624     uint32_t revlen = (end_code[1+LINK_SIZE] == OP_REVERSE)? 1 + LINK_SIZE : 0;
625     size_t back = (revlen == 0)? 0 : (size_t)GET(end_code, 2+LINK_SIZE);
626     if (back <= gone_back)
627       {
628       int bstate = (int)(end_code - start_code + 1 + LINK_SIZE + revlen);
629       ADD_NEW_DATA(-bstate, 0, (int)(gone_back - back));
630       }
631     end_code += GET(end_code, 1);
632     }
633   while (*end_code == OP_ALT);
634  }
635 
636 /* This is the code for a "normal" subpattern (not a backward assertion). The
637 start of a whole pattern is always one of these. If we are at the top level,
638 we may be asked to restart matching from the same point that we reached for a
639 previous partial match. We still have to scan through the top-level branches to
640 find the end state. */
641 
642 else
643   {
644   end_code = this_start_code;
645 
646   /* Restarting */
647 
648   if (rlevel == 1 && (mb->moptions & PCRE2_DFA_RESTART) != 0)
649     {
650     do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
651     new_count = workspace[1];
652     if (!workspace[0])
653       memcpy(new_states, active_states, (size_t)new_count * sizeof(stateblock));
654     }
655 
656   /* Not restarting */
657 
658   else
659     {
660     int length = 1 + LINK_SIZE +
661       ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
662         *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
663         ? IMM2_SIZE:0);
664     do
665       {
666       ADD_NEW((int)(end_code - start_code + length), 0);
667       end_code += GET(end_code, 1);
668       length = 1 + LINK_SIZE;
669       }
670     while (*end_code == OP_ALT);
671     }
672   }
673 
674 workspace[0] = 0;    /* Bit indicating which vector is current */
675 
676 /* Loop for scanning the subject */
677 
678 ptr = current_subject;
679 for (;;)
680   {
681   int i, j;
682   int clen, dlen;
683   uint32_t c, d;
684   int forced_fail = 0;
685   BOOL partial_newline = FALSE;
686   BOOL could_continue = reset_could_continue;
687   reset_could_continue = FALSE;
688 
689   if (ptr > mb->last_used_ptr) mb->last_used_ptr = ptr;
690 
691   /* Make the new state list into the active state list and empty the
692   new state list. */
693 
694   temp_states = active_states;
695   active_states = new_states;
696   new_states = temp_states;
697   active_count = new_count;
698   new_count = 0;
699 
700   workspace[0] ^= 1;              /* Remember for the restarting feature */
701   workspace[1] = active_count;
702 
703   /* Set the pointers for adding new states */
704 
705   next_active_state = active_states + active_count;
706   next_new_state = new_states;
707 
708   /* Load the current character from the subject outside the loop, as many
709   different states may want to look at it, and we assume that at least one
710   will. */
711 
712   if (ptr < end_subject)
713     {
714     clen = 1;        /* Number of data items in the character */
715 #ifdef SUPPORT_UNICODE
716     GETCHARLENTEST(c, ptr, clen);
717 #else
718     c = *ptr;
719 #endif  /* SUPPORT_UNICODE */
720     }
721   else
722     {
723     clen = 0;        /* This indicates the end of the subject */
724     c = NOTACHAR;    /* This value should never actually be used */
725     }
726 
727   /* Scan up the active states and act on each one. The result of an action
728   may be to add more states to the currently active list (e.g. on hitting a
729   parenthesis) or it may be to put states on the new list, for considering
730   when we move the character pointer on. */
731 
732   for (i = 0; i < active_count; i++)
733     {
734     stateblock *current_state = active_states + i;
735     BOOL caseless = FALSE;
736     PCRE2_SPTR code;
737     uint32_t codevalue;
738     int state_offset = current_state->offset;
739     int rrc;
740     int count;
741 
742     /* A negative offset is a special case meaning "hold off going to this
743     (negated) state until the number of characters in the data field have
744     been skipped". If the could_continue flag was passed over from a previous
745     state, arrange for it to be passed on. */
746 
747     if (state_offset < 0)
748       {
749       if (current_state->data > 0)
750         {
751         ADD_NEW_DATA(state_offset, current_state->count,
752           current_state->data - 1);
753         if (could_continue) reset_could_continue = TRUE;
754         continue;
755         }
756       else
757         {
758         current_state->offset = state_offset = -state_offset;
759         }
760       }
761 
762     /* Check for a duplicate state with the same count, and skip if found.
763     See the note at the head of this module about the possibility of improving
764     performance here. */
765 
766     for (j = 0; j < i; j++)
767       {
768       if (active_states[j].offset == state_offset &&
769           active_states[j].count == current_state->count)
770         goto NEXT_ACTIVE_STATE;
771       }
772 
773     /* The state offset is the offset to the opcode */
774 
775     code = start_code + state_offset;
776     codevalue = *code;
777 
778     /* If this opcode inspects a character, but we are at the end of the
779     subject, remember the fact for use when testing for a partial match. */
780 
781     if (clen == 0 && poptable[codevalue] != 0)
782       could_continue = TRUE;
783 
784     /* If this opcode is followed by an inline character, load it. It is
785     tempting to test for the presence of a subject character here, but that
786     is wrong, because sometimes zero repetitions of the subject are
787     permitted.
788 
789     We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
790     argument that is not a data character - but is always one byte long because
791     the values are small. We have to take special action to deal with  \P, \p,
792     \H, \h, \V, \v and \X in this case. To keep the other cases fast, convert
793     these ones to new opcodes. */
794 
795     if (coptable[codevalue] > 0)
796       {
797       dlen = 1;
798 #ifdef SUPPORT_UNICODE
799       if (utf) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
800 #endif  /* SUPPORT_UNICODE */
801       d = code[coptable[codevalue]];
802       if (codevalue >= OP_TYPESTAR)
803         {
804         switch(d)
805           {
806           case OP_ANYBYTE: return PCRE2_ERROR_DFA_UITEM;
807           case OP_NOTPROP:
808           case OP_PROP: codevalue += OP_PROP_EXTRA; break;
809           case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
810           case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
811           case OP_NOT_HSPACE:
812           case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
813           case OP_NOT_VSPACE:
814           case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
815           default: break;
816           }
817         }
818       }
819     else
820       {
821       dlen = 0;         /* Not strictly necessary, but compilers moan */
822       d = NOTACHAR;     /* if these variables are not set. */
823       }
824 
825 
826     /* Now process the individual opcodes */
827 
828     switch (codevalue)
829       {
830 /* ========================================================================== */
831       /* These cases are never obeyed. This is a fudge that causes a compile-
832       time error if the vectors coptable or poptable, which are indexed by
833       opcode, are not the correct length. It seems to be the only way to do
834       such a check at compile time, as the sizeof() operator does not work
835       in the C preprocessor. */
836 
837       case OP_TABLE_LENGTH:
838       case OP_TABLE_LENGTH +
839         ((sizeof(coptable) == OP_TABLE_LENGTH) &&
840          (sizeof(poptable) == OP_TABLE_LENGTH)):
841       return 0;
842 
843 /* ========================================================================== */
844       /* Reached a closing bracket. If not at the end of the pattern, carry
845       on with the next opcode. For repeating opcodes, also add the repeat
846       state. Note that KETRPOS will always be encountered at the end of the
847       subpattern, because the possessive subpattern repeats are always handled
848       using recursive calls. Thus, it never adds any new states.
849 
850       At the end of the (sub)pattern, unless we have an empty string and
851       PCRE2_NOTEMPTY is set, or PCRE2_NOTEMPTY_ATSTART is set and we are at the
852       start of the subject, save the match data, shifting up all previous
853       matches so we always have the longest first. */
854 
855       case OP_KET:
856       case OP_KETRMIN:
857       case OP_KETRMAX:
858       case OP_KETRPOS:
859       if (code != end_code)
860         {
861         ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
862         if (codevalue != OP_KET)
863           {
864           ADD_ACTIVE(state_offset - (int)GET(code, 1), 0);
865           }
866         }
867       else
868         {
869         if (ptr > current_subject ||
870             ((mb->moptions & PCRE2_NOTEMPTY) == 0 &&
871               ((mb->moptions & PCRE2_NOTEMPTY_ATSTART) == 0 ||
872                 current_subject > start_subject + mb->start_offset)))
873           {
874           if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
875             else if (match_count > 0 && ++match_count * 2 > (int)offsetcount)
876               match_count = 0;
877           count = ((match_count == 0)? (int)offsetcount : match_count * 2) - 2;
878           if (count > 0) (void)memmove(offsets + 2, offsets,
879             (size_t)count * sizeof(PCRE2_SIZE));
880           if (offsetcount >= 2)
881             {
882             offsets[0] = (PCRE2_SIZE)(current_subject - start_subject);
883             offsets[1] = (PCRE2_SIZE)(ptr - start_subject);
884             }
885           if ((mb->moptions & PCRE2_DFA_SHORTEST) != 0) return match_count;
886           }
887         }
888       break;
889 
890 /* ========================================================================== */
891       /* These opcodes add to the current list of states without looking
892       at the current character. */
893 
894       /*-----------------------------------------------------------------*/
895       case OP_ALT:
896       do { code += GET(code, 1); } while (*code == OP_ALT);
897       ADD_ACTIVE((int)(code - start_code), 0);
898       break;
899 
900       /*-----------------------------------------------------------------*/
901       case OP_BRA:
902       case OP_SBRA:
903       do
904         {
905         ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
906         code += GET(code, 1);
907         }
908       while (*code == OP_ALT);
909       break;
910 
911       /*-----------------------------------------------------------------*/
912       case OP_CBRA:
913       case OP_SCBRA:
914       ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE + IMM2_SIZE),  0);
915       code += GET(code, 1);
916       while (*code == OP_ALT)
917         {
918         ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE),  0);
919         code += GET(code, 1);
920         }
921       break;
922 
923       /*-----------------------------------------------------------------*/
924       case OP_BRAZERO:
925       case OP_BRAMINZERO:
926       ADD_ACTIVE(state_offset + 1, 0);
927       code += 1 + GET(code, 2);
928       while (*code == OP_ALT) code += GET(code, 1);
929       ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
930       break;
931 
932       /*-----------------------------------------------------------------*/
933       case OP_SKIPZERO:
934       code += 1 + GET(code, 2);
935       while (*code == OP_ALT) code += GET(code, 1);
936       ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
937       break;
938 
939       /*-----------------------------------------------------------------*/
940       case OP_CIRC:
941       if (ptr == start_subject && (mb->moptions & PCRE2_NOTBOL) == 0)
942         { ADD_ACTIVE(state_offset + 1, 0); }
943       break;
944 
945       /*-----------------------------------------------------------------*/
946       case OP_CIRCM:
947       if ((ptr == start_subject && (mb->moptions & PCRE2_NOTBOL) == 0) ||
948           ((ptr != end_subject || (mb->poptions & PCRE2_ALT_CIRCUMFLEX) != 0 )
949             && WAS_NEWLINE(ptr)))
950         { ADD_ACTIVE(state_offset + 1, 0); }
951       break;
952 
953       /*-----------------------------------------------------------------*/
954       case OP_EOD:
955       if (ptr >= end_subject)
956         {
957         if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
958           could_continue = TRUE;
959         else { ADD_ACTIVE(state_offset + 1, 0); }
960         }
961       break;
962 
963       /*-----------------------------------------------------------------*/
964       case OP_SOD:
965       if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
966       break;
967 
968       /*-----------------------------------------------------------------*/
969       case OP_SOM:
970       if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
971       break;
972 
973 
974 /* ========================================================================== */
975       /* These opcodes inspect the next subject character, and sometimes
976       the previous one as well, but do not have an argument. The variable
977       clen contains the length of the current character and is zero if we are
978       at the end of the subject. */
979 
980       /*-----------------------------------------------------------------*/
981       case OP_ANY:
982       if (clen > 0 && !IS_NEWLINE(ptr))
983         {
984         if (ptr + 1 >= mb->end_subject &&
985             (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
986             NLBLOCK->nltype == NLTYPE_FIXED &&
987             NLBLOCK->nllen == 2 &&
988             c == NLBLOCK->nl[0])
989           {
990           could_continue = partial_newline = TRUE;
991           }
992         else
993           {
994           ADD_NEW(state_offset + 1, 0);
995           }
996         }
997       break;
998 
999       /*-----------------------------------------------------------------*/
1000       case OP_ALLANY:
1001       if (clen > 0)
1002         { ADD_NEW(state_offset + 1, 0); }
1003       break;
1004 
1005       /*-----------------------------------------------------------------*/
1006       case OP_EODN:
1007       if (clen == 0 && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1008         could_continue = TRUE;
1009       else if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - mb->nllen))
1010         { ADD_ACTIVE(state_offset + 1, 0); }
1011       break;
1012 
1013       /*-----------------------------------------------------------------*/
1014       case OP_DOLL:
1015       if ((mb->moptions & PCRE2_NOTEOL) == 0)
1016         {
1017         if (clen == 0 && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1018           could_continue = TRUE;
1019         else if (clen == 0 ||
1020             ((mb->poptions & PCRE2_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
1021                (ptr == end_subject - mb->nllen)
1022             ))
1023           { ADD_ACTIVE(state_offset + 1, 0); }
1024         else if (ptr + 1 >= mb->end_subject &&
1025                  (mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0 &&
1026                  NLBLOCK->nltype == NLTYPE_FIXED &&
1027                  NLBLOCK->nllen == 2 &&
1028                  c == NLBLOCK->nl[0])
1029           {
1030           if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1031             {
1032             reset_could_continue = TRUE;
1033             ADD_NEW_DATA(-(state_offset + 1), 0, 1);
1034             }
1035           else could_continue = partial_newline = TRUE;
1036           }
1037         }
1038       break;
1039 
1040       /*-----------------------------------------------------------------*/
1041       case OP_DOLLM:
1042       if ((mb->moptions & PCRE2_NOTEOL) == 0)
1043         {
1044         if (clen == 0 && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1045           could_continue = TRUE;
1046         else if (clen == 0 ||
1047             ((mb->poptions & PCRE2_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr)))
1048           { ADD_ACTIVE(state_offset + 1, 0); }
1049         else if (ptr + 1 >= mb->end_subject &&
1050                  (mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0 &&
1051                  NLBLOCK->nltype == NLTYPE_FIXED &&
1052                  NLBLOCK->nllen == 2 &&
1053                  c == NLBLOCK->nl[0])
1054           {
1055           if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1056             {
1057             reset_could_continue = TRUE;
1058             ADD_NEW_DATA(-(state_offset + 1), 0, 1);
1059             }
1060           else could_continue = partial_newline = TRUE;
1061           }
1062         }
1063       else if (IS_NEWLINE(ptr))
1064         { ADD_ACTIVE(state_offset + 1, 0); }
1065       break;
1066 
1067       /*-----------------------------------------------------------------*/
1068 
1069       case OP_DIGIT:
1070       case OP_WHITESPACE:
1071       case OP_WORDCHAR:
1072       if (clen > 0 && c < 256 &&
1073             ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
1074         { ADD_NEW(state_offset + 1, 0); }
1075       break;
1076 
1077       /*-----------------------------------------------------------------*/
1078       case OP_NOT_DIGIT:
1079       case OP_NOT_WHITESPACE:
1080       case OP_NOT_WORDCHAR:
1081       if (clen > 0 && (c >= 256 ||
1082             ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
1083         { ADD_NEW(state_offset + 1, 0); }
1084       break;
1085 
1086       /*-----------------------------------------------------------------*/
1087       case OP_WORD_BOUNDARY:
1088       case OP_NOT_WORD_BOUNDARY:
1089         {
1090         int left_word, right_word;
1091 
1092         if (ptr > start_subject)
1093           {
1094           PCRE2_SPTR temp = ptr - 1;
1095           if (temp < mb->start_used_ptr) mb->start_used_ptr = temp;
1096 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1097           if (utf) { BACKCHAR(temp); }
1098 #endif
1099           GETCHARTEST(d, temp);
1100 #ifdef SUPPORT_UNICODE
1101           if ((mb->poptions & PCRE2_UCP) != 0)
1102             {
1103             if (d == '_') left_word = TRUE; else
1104               {
1105               uint32_t cat = UCD_CATEGORY(d);
1106               left_word = (cat == ucp_L || cat == ucp_N);
1107               }
1108             }
1109           else
1110 #endif
1111           left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
1112           }
1113         else left_word = FALSE;
1114 
1115         if (clen > 0)
1116           {
1117           if (ptr >= mb->last_used_ptr)
1118             {
1119             PCRE2_SPTR temp = ptr + 1;
1120 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1121             if (utf) { FORWARDCHARTEST(temp, mb->end_subject); }
1122 #endif
1123             mb->last_used_ptr = temp;
1124             }
1125 #ifdef SUPPORT_UNICODE
1126           if ((mb->poptions & PCRE2_UCP) != 0)
1127             {
1128             if (c == '_') right_word = TRUE; else
1129               {
1130               uint32_t cat = UCD_CATEGORY(c);
1131               right_word = (cat == ucp_L || cat == ucp_N);
1132               }
1133             }
1134           else
1135 #endif
1136           right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
1137           }
1138         else right_word = FALSE;
1139 
1140         if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
1141           { ADD_ACTIVE(state_offset + 1, 0); }
1142         }
1143       break;
1144 
1145 
1146       /*-----------------------------------------------------------------*/
1147       /* Check the next character by Unicode property. We will get here only
1148       if the support is in the binary; otherwise a compile-time error occurs.
1149       */
1150 
1151 #ifdef SUPPORT_UNICODE
1152       case OP_PROP:
1153       case OP_NOTPROP:
1154       if (clen > 0)
1155         {
1156         BOOL OK;
1157         const uint32_t *cp;
1158         const ucd_record * prop = GET_UCD(c);
1159         switch(code[1])
1160           {
1161           case PT_ANY:
1162           OK = TRUE;
1163           break;
1164 
1165           case PT_LAMP:
1166           OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1167                prop->chartype == ucp_Lt;
1168           break;
1169 
1170           case PT_GC:
1171           OK = PRIV(ucp_gentype)[prop->chartype] == code[2];
1172           break;
1173 
1174           case PT_PC:
1175           OK = prop->chartype == code[2];
1176           break;
1177 
1178           case PT_SC:
1179           OK = prop->script == code[2];
1180           break;
1181 
1182           /* These are specials for combination cases. */
1183 
1184           case PT_ALNUM:
1185           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1186                PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1187           break;
1188 
1189           /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1190           which means that Perl space and POSIX space are now identical. PCRE
1191           was changed at release 8.34. */
1192 
1193           case PT_SPACE:    /* Perl space */
1194           case PT_PXSPACE:  /* POSIX space */
1195           switch(c)
1196             {
1197             HSPACE_CASES:
1198             VSPACE_CASES:
1199             OK = TRUE;
1200             break;
1201 
1202             default:
1203             OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1204             break;
1205             }
1206           break;
1207 
1208           case PT_WORD:
1209           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1210                PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1211                c == CHAR_UNDERSCORE;
1212           break;
1213 
1214           case PT_CLIST:
1215           cp = PRIV(ucd_caseless_sets) + code[2];
1216           for (;;)
1217             {
1218             if (c < *cp) { OK = FALSE; break; }
1219             if (c == *cp++) { OK = TRUE; break; }
1220             }
1221           break;
1222 
1223           case PT_UCNC:
1224           OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1225                c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1226                c >= 0xe000;
1227           break;
1228 
1229           /* Should never occur, but keep compilers from grumbling. */
1230 
1231           default:
1232           OK = codevalue != OP_PROP;
1233           break;
1234           }
1235 
1236         if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
1237         }
1238       break;
1239 #endif
1240 
1241 
1242 
1243 /* ========================================================================== */
1244       /* These opcodes likewise inspect the subject character, but have an
1245       argument that is not a data character. It is one of these opcodes:
1246       OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE,
1247       OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
1248 
1249       case OP_TYPEPLUS:
1250       case OP_TYPEMINPLUS:
1251       case OP_TYPEPOSPLUS:
1252       count = current_state->count;  /* Already matched */
1253       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1254       if (clen > 0)
1255         {
1256         if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1257             (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1258             NLBLOCK->nltype == NLTYPE_FIXED &&
1259             NLBLOCK->nllen == 2 &&
1260             c == NLBLOCK->nl[0])
1261           {
1262           could_continue = partial_newline = TRUE;
1263           }
1264         else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1265             (c < 256 &&
1266               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1267               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1268           {
1269           if (count > 0 && codevalue == OP_TYPEPOSPLUS)
1270             {
1271             active_count--;            /* Remove non-match possibility */
1272             next_active_state--;
1273             }
1274           count++;
1275           ADD_NEW(state_offset, count);
1276           }
1277         }
1278       break;
1279 
1280       /*-----------------------------------------------------------------*/
1281       case OP_TYPEQUERY:
1282       case OP_TYPEMINQUERY:
1283       case OP_TYPEPOSQUERY:
1284       ADD_ACTIVE(state_offset + 2, 0);
1285       if (clen > 0)
1286         {
1287         if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1288             (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1289             NLBLOCK->nltype == NLTYPE_FIXED &&
1290             NLBLOCK->nllen == 2 &&
1291             c == NLBLOCK->nl[0])
1292           {
1293           could_continue = partial_newline = TRUE;
1294           }
1295         else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1296             (c < 256 &&
1297               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1298               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1299           {
1300           if (codevalue == OP_TYPEPOSQUERY)
1301             {
1302             active_count--;            /* Remove non-match possibility */
1303             next_active_state--;
1304             }
1305           ADD_NEW(state_offset + 2, 0);
1306           }
1307         }
1308       break;
1309 
1310       /*-----------------------------------------------------------------*/
1311       case OP_TYPESTAR:
1312       case OP_TYPEMINSTAR:
1313       case OP_TYPEPOSSTAR:
1314       ADD_ACTIVE(state_offset + 2, 0);
1315       if (clen > 0)
1316         {
1317         if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1318             (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1319             NLBLOCK->nltype == NLTYPE_FIXED &&
1320             NLBLOCK->nllen == 2 &&
1321             c == NLBLOCK->nl[0])
1322           {
1323           could_continue = partial_newline = TRUE;
1324           }
1325         else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1326             (c < 256 &&
1327               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1328               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1329           {
1330           if (codevalue == OP_TYPEPOSSTAR)
1331             {
1332             active_count--;            /* Remove non-match possibility */
1333             next_active_state--;
1334             }
1335           ADD_NEW(state_offset, 0);
1336           }
1337         }
1338       break;
1339 
1340       /*-----------------------------------------------------------------*/
1341       case OP_TYPEEXACT:
1342       count = current_state->count;  /* Number already matched */
1343       if (clen > 0)
1344         {
1345         if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1346             (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1347             NLBLOCK->nltype == NLTYPE_FIXED &&
1348             NLBLOCK->nllen == 2 &&
1349             c == NLBLOCK->nl[0])
1350           {
1351           could_continue = partial_newline = TRUE;
1352           }
1353         else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1354             (c < 256 &&
1355               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1356               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1357           {
1358           if (++count >= (int)GET2(code, 1))
1359             { ADD_NEW(state_offset + 1 + IMM2_SIZE + 1, 0); }
1360           else
1361             { ADD_NEW(state_offset, count); }
1362           }
1363         }
1364       break;
1365 
1366       /*-----------------------------------------------------------------*/
1367       case OP_TYPEUPTO:
1368       case OP_TYPEMINUPTO:
1369       case OP_TYPEPOSUPTO:
1370       ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0);
1371       count = current_state->count;  /* Number already matched */
1372       if (clen > 0)
1373         {
1374         if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1375             (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1376             NLBLOCK->nltype == NLTYPE_FIXED &&
1377             NLBLOCK->nllen == 2 &&
1378             c == NLBLOCK->nl[0])
1379           {
1380           could_continue = partial_newline = TRUE;
1381           }
1382         else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1383             (c < 256 &&
1384               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1385               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1386           {
1387           if (codevalue == OP_TYPEPOSUPTO)
1388             {
1389             active_count--;           /* Remove non-match possibility */
1390             next_active_state--;
1391             }
1392           if (++count >= (int)GET2(code, 1))
1393             { ADD_NEW(state_offset + 2 + IMM2_SIZE, 0); }
1394           else
1395             { ADD_NEW(state_offset, count); }
1396           }
1397         }
1398       break;
1399 
1400 /* ========================================================================== */
1401       /* These are virtual opcodes that are used when something like
1402       OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, or OP_EXTUNI as its
1403       argument. It keeps the code above fast for the other cases. The argument
1404       is in the d variable. */
1405 
1406 #ifdef SUPPORT_UNICODE
1407       case OP_PROP_EXTRA + OP_TYPEPLUS:
1408       case OP_PROP_EXTRA + OP_TYPEMINPLUS:
1409       case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
1410       count = current_state->count;           /* Already matched */
1411       if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
1412       if (clen > 0)
1413         {
1414         BOOL OK;
1415         const uint32_t *cp;
1416         const ucd_record * prop = GET_UCD(c);
1417         switch(code[2])
1418           {
1419           case PT_ANY:
1420           OK = TRUE;
1421           break;
1422 
1423           case PT_LAMP:
1424           OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1425             prop->chartype == ucp_Lt;
1426           break;
1427 
1428           case PT_GC:
1429           OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1430           break;
1431 
1432           case PT_PC:
1433           OK = prop->chartype == code[3];
1434           break;
1435 
1436           case PT_SC:
1437           OK = prop->script == code[3];
1438           break;
1439 
1440           /* These are specials for combination cases. */
1441 
1442           case PT_ALNUM:
1443           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1444                PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1445           break;
1446 
1447           /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1448           which means that Perl space and POSIX space are now identical. PCRE
1449           was changed at release 8.34. */
1450 
1451           case PT_SPACE:    /* Perl space */
1452           case PT_PXSPACE:  /* POSIX space */
1453           switch(c)
1454             {
1455             HSPACE_CASES:
1456             VSPACE_CASES:
1457             OK = TRUE;
1458             break;
1459 
1460             default:
1461             OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1462             break;
1463             }
1464           break;
1465 
1466           case PT_WORD:
1467           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1468                PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1469                c == CHAR_UNDERSCORE;
1470           break;
1471 
1472           case PT_CLIST:
1473           cp = PRIV(ucd_caseless_sets) + code[3];
1474           for (;;)
1475             {
1476             if (c < *cp) { OK = FALSE; break; }
1477             if (c == *cp++) { OK = TRUE; break; }
1478             }
1479           break;
1480 
1481           case PT_UCNC:
1482           OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1483                c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1484                c >= 0xe000;
1485           break;
1486 
1487           /* Should never occur, but keep compilers from grumbling. */
1488 
1489           default:
1490           OK = codevalue != OP_PROP;
1491           break;
1492           }
1493 
1494         if (OK == (d == OP_PROP))
1495           {
1496           if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1497             {
1498             active_count--;           /* Remove non-match possibility */
1499             next_active_state--;
1500             }
1501           count++;
1502           ADD_NEW(state_offset, count);
1503           }
1504         }
1505       break;
1506 
1507       /*-----------------------------------------------------------------*/
1508       case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1509       case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1510       case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1511       count = current_state->count;  /* Already matched */
1512       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1513       if (clen > 0)
1514         {
1515         int ncount = 0;
1516         if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1517           {
1518           active_count--;           /* Remove non-match possibility */
1519           next_active_state--;
1520           }
1521         (void)PRIV(extuni)(c, ptr + clen, mb->start_subject, end_subject, utf,
1522           &ncount);
1523         count++;
1524         ADD_NEW_DATA(-state_offset, count, ncount);
1525         }
1526       break;
1527 #endif
1528 
1529       /*-----------------------------------------------------------------*/
1530       case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1531       case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1532       case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1533       count = current_state->count;  /* Already matched */
1534       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1535       if (clen > 0)
1536         {
1537         int ncount = 0;
1538         switch (c)
1539           {
1540           case CHAR_VT:
1541           case CHAR_FF:
1542           case CHAR_NEL:
1543 #ifndef EBCDIC
1544           case 0x2028:
1545           case 0x2029:
1546 #endif  /* Not EBCDIC */
1547           if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
1548           goto ANYNL01;
1549 
1550           case CHAR_CR:
1551           if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
1552           /* Fall through */
1553 
1554           ANYNL01:
1555           case CHAR_LF:
1556           if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1557             {
1558             active_count--;           /* Remove non-match possibility */
1559             next_active_state--;
1560             }
1561           count++;
1562           ADD_NEW_DATA(-state_offset, count, ncount);
1563           break;
1564 
1565           default:
1566           break;
1567           }
1568         }
1569       break;
1570 
1571       /*-----------------------------------------------------------------*/
1572       case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1573       case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1574       case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1575       count = current_state->count;  /* Already matched */
1576       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1577       if (clen > 0)
1578         {
1579         BOOL OK;
1580         switch (c)
1581           {
1582           VSPACE_CASES:
1583           OK = TRUE;
1584           break;
1585 
1586           default:
1587           OK = FALSE;
1588           break;
1589           }
1590 
1591         if (OK == (d == OP_VSPACE))
1592           {
1593           if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1594             {
1595             active_count--;           /* Remove non-match possibility */
1596             next_active_state--;
1597             }
1598           count++;
1599           ADD_NEW_DATA(-state_offset, count, 0);
1600           }
1601         }
1602       break;
1603 
1604       /*-----------------------------------------------------------------*/
1605       case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1606       case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1607       case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1608       count = current_state->count;  /* Already matched */
1609       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1610       if (clen > 0)
1611         {
1612         BOOL OK;
1613         switch (c)
1614           {
1615           HSPACE_CASES:
1616           OK = TRUE;
1617           break;
1618 
1619           default:
1620           OK = FALSE;
1621           break;
1622           }
1623 
1624         if (OK == (d == OP_HSPACE))
1625           {
1626           if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1627             {
1628             active_count--;           /* Remove non-match possibility */
1629             next_active_state--;
1630             }
1631           count++;
1632           ADD_NEW_DATA(-state_offset, count, 0);
1633           }
1634         }
1635       break;
1636 
1637       /*-----------------------------------------------------------------*/
1638 #ifdef SUPPORT_UNICODE
1639       case OP_PROP_EXTRA + OP_TYPEQUERY:
1640       case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1641       case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1642       count = 4;
1643       goto QS1;
1644 
1645       case OP_PROP_EXTRA + OP_TYPESTAR:
1646       case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1647       case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1648       count = 0;
1649 
1650       QS1:
1651 
1652       ADD_ACTIVE(state_offset + 4, 0);
1653       if (clen > 0)
1654         {
1655         BOOL OK;
1656         const uint32_t *cp;
1657         const ucd_record * prop = GET_UCD(c);
1658         switch(code[2])
1659           {
1660           case PT_ANY:
1661           OK = TRUE;
1662           break;
1663 
1664           case PT_LAMP:
1665           OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1666             prop->chartype == ucp_Lt;
1667           break;
1668 
1669           case PT_GC:
1670           OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1671           break;
1672 
1673           case PT_PC:
1674           OK = prop->chartype == code[3];
1675           break;
1676 
1677           case PT_SC:
1678           OK = prop->script == code[3];
1679           break;
1680 
1681           /* These are specials for combination cases. */
1682 
1683           case PT_ALNUM:
1684           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1685                PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1686           break;
1687 
1688           /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1689           which means that Perl space and POSIX space are now identical. PCRE
1690           was changed at release 8.34. */
1691 
1692           case PT_SPACE:    /* Perl space */
1693           case PT_PXSPACE:  /* POSIX space */
1694           switch(c)
1695             {
1696             HSPACE_CASES:
1697             VSPACE_CASES:
1698             OK = TRUE;
1699             break;
1700 
1701             default:
1702             OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1703             break;
1704             }
1705           break;
1706 
1707           case PT_WORD:
1708           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1709                PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1710                c == CHAR_UNDERSCORE;
1711           break;
1712 
1713           case PT_CLIST:
1714           cp = PRIV(ucd_caseless_sets) + code[3];
1715           for (;;)
1716             {
1717             if (c < *cp) { OK = FALSE; break; }
1718             if (c == *cp++) { OK = TRUE; break; }
1719             }
1720           break;
1721 
1722           case PT_UCNC:
1723           OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1724                c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1725                c >= 0xe000;
1726           break;
1727 
1728           /* Should never occur, but keep compilers from grumbling. */
1729 
1730           default:
1731           OK = codevalue != OP_PROP;
1732           break;
1733           }
1734 
1735         if (OK == (d == OP_PROP))
1736           {
1737           if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1738               codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1739             {
1740             active_count--;           /* Remove non-match possibility */
1741             next_active_state--;
1742             }
1743           ADD_NEW(state_offset + count, 0);
1744           }
1745         }
1746       break;
1747 
1748       /*-----------------------------------------------------------------*/
1749       case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1750       case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1751       case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1752       count = 2;
1753       goto QS2;
1754 
1755       case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1756       case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1757       case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1758       count = 0;
1759 
1760       QS2:
1761 
1762       ADD_ACTIVE(state_offset + 2, 0);
1763       if (clen > 0)
1764         {
1765         int ncount = 0;
1766         if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1767             codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1768           {
1769           active_count--;           /* Remove non-match possibility */
1770           next_active_state--;
1771           }
1772         (void)PRIV(extuni)(c, ptr + clen, mb->start_subject, end_subject, utf,
1773           &ncount);
1774         ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1775         }
1776       break;
1777 #endif
1778 
1779       /*-----------------------------------------------------------------*/
1780       case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1781       case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1782       case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1783       count = 2;
1784       goto QS3;
1785 
1786       case OP_ANYNL_EXTRA + OP_TYPESTAR:
1787       case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1788       case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1789       count = 0;
1790 
1791       QS3:
1792       ADD_ACTIVE(state_offset + 2, 0);
1793       if (clen > 0)
1794         {
1795         int ncount = 0;
1796         switch (c)
1797           {
1798           case CHAR_VT:
1799           case CHAR_FF:
1800           case CHAR_NEL:
1801 #ifndef EBCDIC
1802           case 0x2028:
1803           case 0x2029:
1804 #endif  /* Not EBCDIC */
1805           if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
1806           goto ANYNL02;
1807 
1808           case CHAR_CR:
1809           if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
1810           /* Fall through */
1811 
1812           ANYNL02:
1813           case CHAR_LF:
1814           if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1815               codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1816             {
1817             active_count--;           /* Remove non-match possibility */
1818             next_active_state--;
1819             }
1820           ADD_NEW_DATA(-(state_offset + (int)count), 0, ncount);
1821           break;
1822 
1823           default:
1824           break;
1825           }
1826         }
1827       break;
1828 
1829       /*-----------------------------------------------------------------*/
1830       case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1831       case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1832       case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1833       count = 2;
1834       goto QS4;
1835 
1836       case OP_VSPACE_EXTRA + OP_TYPESTAR:
1837       case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1838       case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1839       count = 0;
1840 
1841       QS4:
1842       ADD_ACTIVE(state_offset + 2, 0);
1843       if (clen > 0)
1844         {
1845         BOOL OK;
1846         switch (c)
1847           {
1848           VSPACE_CASES:
1849           OK = TRUE;
1850           break;
1851 
1852           default:
1853           OK = FALSE;
1854           break;
1855           }
1856         if (OK == (d == OP_VSPACE))
1857           {
1858           if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1859               codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1860             {
1861             active_count--;           /* Remove non-match possibility */
1862             next_active_state--;
1863             }
1864           ADD_NEW_DATA(-(state_offset + (int)count), 0, 0);
1865           }
1866         }
1867       break;
1868 
1869       /*-----------------------------------------------------------------*/
1870       case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1871       case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1872       case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1873       count = 2;
1874       goto QS5;
1875 
1876       case OP_HSPACE_EXTRA + OP_TYPESTAR:
1877       case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1878       case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1879       count = 0;
1880 
1881       QS5:
1882       ADD_ACTIVE(state_offset + 2, 0);
1883       if (clen > 0)
1884         {
1885         BOOL OK;
1886         switch (c)
1887           {
1888           HSPACE_CASES:
1889           OK = TRUE;
1890           break;
1891 
1892           default:
1893           OK = FALSE;
1894           break;
1895           }
1896 
1897         if (OK == (d == OP_HSPACE))
1898           {
1899           if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1900               codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1901             {
1902             active_count--;           /* Remove non-match possibility */
1903             next_active_state--;
1904             }
1905           ADD_NEW_DATA(-(state_offset + (int)count), 0, 0);
1906           }
1907         }
1908       break;
1909 
1910       /*-----------------------------------------------------------------*/
1911 #ifdef SUPPORT_UNICODE
1912       case OP_PROP_EXTRA + OP_TYPEEXACT:
1913       case OP_PROP_EXTRA + OP_TYPEUPTO:
1914       case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1915       case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1916       if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1917         { ADD_ACTIVE(state_offset + 1 + IMM2_SIZE + 3, 0); }
1918       count = current_state->count;  /* Number already matched */
1919       if (clen > 0)
1920         {
1921         BOOL OK;
1922         const uint32_t *cp;
1923         const ucd_record * prop = GET_UCD(c);
1924         switch(code[1 + IMM2_SIZE + 1])
1925           {
1926           case PT_ANY:
1927           OK = TRUE;
1928           break;
1929 
1930           case PT_LAMP:
1931           OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1932             prop->chartype == ucp_Lt;
1933           break;
1934 
1935           case PT_GC:
1936           OK = PRIV(ucp_gentype)[prop->chartype] == code[1 + IMM2_SIZE + 2];
1937           break;
1938 
1939           case PT_PC:
1940           OK = prop->chartype == code[1 + IMM2_SIZE + 2];
1941           break;
1942 
1943           case PT_SC:
1944           OK = prop->script == code[1 + IMM2_SIZE + 2];
1945           break;
1946 
1947           /* These are specials for combination cases. */
1948 
1949           case PT_ALNUM:
1950           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1951                PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1952           break;
1953 
1954           /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1955           which means that Perl space and POSIX space are now identical. PCRE
1956           was changed at release 8.34. */
1957 
1958           case PT_SPACE:    /* Perl space */
1959           case PT_PXSPACE:  /* POSIX space */
1960           switch(c)
1961             {
1962             HSPACE_CASES:
1963             VSPACE_CASES:
1964             OK = TRUE;
1965             break;
1966 
1967             default:
1968             OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1969             break;
1970             }
1971           break;
1972 
1973           case PT_WORD:
1974           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1975                PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1976                c == CHAR_UNDERSCORE;
1977           break;
1978 
1979           case PT_CLIST:
1980           cp = PRIV(ucd_caseless_sets) + code[1 + IMM2_SIZE + 2];
1981           for (;;)
1982             {
1983             if (c < *cp) { OK = FALSE; break; }
1984             if (c == *cp++) { OK = TRUE; break; }
1985             }
1986           break;
1987 
1988           case PT_UCNC:
1989           OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1990                c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1991                c >= 0xe000;
1992           break;
1993 
1994           /* Should never occur, but keep compilers from grumbling. */
1995 
1996           default:
1997           OK = codevalue != OP_PROP;
1998           break;
1999           }
2000 
2001         if (OK == (d == OP_PROP))
2002           {
2003           if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
2004             {
2005             active_count--;           /* Remove non-match possibility */
2006             next_active_state--;
2007             }
2008           if (++count >= (int)GET2(code, 1))
2009             { ADD_NEW(state_offset + 1 + IMM2_SIZE + 3, 0); }
2010           else
2011             { ADD_NEW(state_offset, count); }
2012           }
2013         }
2014       break;
2015 
2016       /*-----------------------------------------------------------------*/
2017       case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
2018       case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
2019       case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
2020       case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
2021       if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
2022         { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2023       count = current_state->count;  /* Number already matched */
2024       if (clen > 0)
2025         {
2026         PCRE2_SPTR nptr;
2027         int ncount = 0;
2028         if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
2029           {
2030           active_count--;           /* Remove non-match possibility */
2031           next_active_state--;
2032           }
2033         nptr = PRIV(extuni)(c, ptr + clen, mb->start_subject, end_subject, utf,
2034           &ncount);
2035         if (nptr >= end_subject && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
2036             reset_could_continue = TRUE;
2037         if (++count >= (int)GET2(code, 1))
2038           { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
2039         else
2040           { ADD_NEW_DATA(-state_offset, count, ncount); }
2041         }
2042       break;
2043 #endif
2044 
2045       /*-----------------------------------------------------------------*/
2046       case OP_ANYNL_EXTRA + OP_TYPEEXACT:
2047       case OP_ANYNL_EXTRA + OP_TYPEUPTO:
2048       case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
2049       case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
2050       if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
2051         { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2052       count = current_state->count;  /* Number already matched */
2053       if (clen > 0)
2054         {
2055         int ncount = 0;
2056         switch (c)
2057           {
2058           case CHAR_VT:
2059           case CHAR_FF:
2060           case CHAR_NEL:
2061 #ifndef EBCDIC
2062           case 0x2028:
2063           case 0x2029:
2064 #endif  /* Not EBCDIC */
2065           if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
2066           goto ANYNL03;
2067 
2068           case CHAR_CR:
2069           if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
2070           /* Fall through */
2071 
2072           ANYNL03:
2073           case CHAR_LF:
2074           if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
2075             {
2076             active_count--;           /* Remove non-match possibility */
2077             next_active_state--;
2078             }
2079           if (++count >= (int)GET2(code, 1))
2080             { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
2081           else
2082             { ADD_NEW_DATA(-state_offset, count, ncount); }
2083           break;
2084 
2085           default:
2086           break;
2087           }
2088         }
2089       break;
2090 
2091       /*-----------------------------------------------------------------*/
2092       case OP_VSPACE_EXTRA + OP_TYPEEXACT:
2093       case OP_VSPACE_EXTRA + OP_TYPEUPTO:
2094       case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
2095       case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
2096       if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
2097         { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2098       count = current_state->count;  /* Number already matched */
2099       if (clen > 0)
2100         {
2101         BOOL OK;
2102         switch (c)
2103           {
2104           VSPACE_CASES:
2105           OK = TRUE;
2106           break;
2107 
2108           default:
2109           OK = FALSE;
2110           }
2111 
2112         if (OK == (d == OP_VSPACE))
2113           {
2114           if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
2115             {
2116             active_count--;           /* Remove non-match possibility */
2117             next_active_state--;
2118             }
2119           if (++count >= (int)GET2(code, 1))
2120             { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2121           else
2122             { ADD_NEW_DATA(-state_offset, count, 0); }
2123           }
2124         }
2125       break;
2126 
2127       /*-----------------------------------------------------------------*/
2128       case OP_HSPACE_EXTRA + OP_TYPEEXACT:
2129       case OP_HSPACE_EXTRA + OP_TYPEUPTO:
2130       case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
2131       case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
2132       if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
2133         { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2134       count = current_state->count;  /* Number already matched */
2135       if (clen > 0)
2136         {
2137         BOOL OK;
2138         switch (c)
2139           {
2140           HSPACE_CASES:
2141           OK = TRUE;
2142           break;
2143 
2144           default:
2145           OK = FALSE;
2146           break;
2147           }
2148 
2149         if (OK == (d == OP_HSPACE))
2150           {
2151           if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
2152             {
2153             active_count--;           /* Remove non-match possibility */
2154             next_active_state--;
2155             }
2156           if (++count >= (int)GET2(code, 1))
2157             { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2158           else
2159             { ADD_NEW_DATA(-state_offset, count, 0); }
2160           }
2161         }
2162       break;
2163 
2164 /* ========================================================================== */
2165       /* These opcodes are followed by a character that is usually compared
2166       to the current subject character; it is loaded into d. We still get
2167       here even if there is no subject character, because in some cases zero
2168       repetitions are permitted. */
2169 
2170       /*-----------------------------------------------------------------*/
2171       case OP_CHAR:
2172       if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
2173       break;
2174 
2175       /*-----------------------------------------------------------------*/
2176       case OP_CHARI:
2177       if (clen == 0) break;
2178 
2179 #ifdef SUPPORT_UNICODE
2180       if (utf)
2181         {
2182         if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
2183           {
2184           unsigned int othercase;
2185           if (c < 128)
2186             othercase = fcc[c];
2187           else
2188             othercase = UCD_OTHERCASE(c);
2189           if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
2190           }
2191         }
2192       else
2193 #endif  /* SUPPORT_UNICODE */
2194       /* Not UTF mode */
2195         {
2196         if (TABLE_GET(c, lcc, c) == TABLE_GET(d, lcc, d))
2197           { ADD_NEW(state_offset + 2, 0); }
2198         }
2199       break;
2200 
2201 
2202 #ifdef SUPPORT_UNICODE
2203       /*-----------------------------------------------------------------*/
2204       /* This is a tricky one because it can match more than one character.
2205       Find out how many characters to skip, and then set up a negative state
2206       to wait for them to pass before continuing. */
2207 
2208       case OP_EXTUNI:
2209       if (clen > 0)
2210         {
2211         int ncount = 0;
2212         PCRE2_SPTR nptr = PRIV(extuni)(c, ptr + clen, mb->start_subject,
2213           end_subject, utf, &ncount);
2214         if (nptr >= end_subject && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
2215             reset_could_continue = TRUE;
2216         ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
2217         }
2218       break;
2219 #endif
2220 
2221       /*-----------------------------------------------------------------*/
2222       /* This is tricky, like EXTUNI, because it too can match more than one
2223       character (when CR is followed by LF). In this case, set up a negative
2224       state to wait for one character to pass before continuing. */
2225 
2226       case OP_ANYNL:
2227       if (clen > 0) switch(c)
2228         {
2229         case CHAR_VT:
2230         case CHAR_FF:
2231         case CHAR_NEL:
2232 #ifndef EBCDIC
2233         case 0x2028:
2234         case 0x2029:
2235 #endif  /* Not EBCDIC */
2236         if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
2237         /* Fall through */
2238 
2239         case CHAR_LF:
2240         ADD_NEW(state_offset + 1, 0);
2241         break;
2242 
2243         case CHAR_CR:
2244         if (ptr + 1 >= end_subject)
2245           {
2246           ADD_NEW(state_offset + 1, 0);
2247           if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
2248             reset_could_continue = TRUE;
2249           }
2250         else if (UCHAR21TEST(ptr + 1) == CHAR_LF)
2251           {
2252           ADD_NEW_DATA(-(state_offset + 1), 0, 1);
2253           }
2254         else
2255           {
2256           ADD_NEW(state_offset + 1, 0);
2257           }
2258         break;
2259         }
2260       break;
2261 
2262       /*-----------------------------------------------------------------*/
2263       case OP_NOT_VSPACE:
2264       if (clen > 0) switch(c)
2265         {
2266         VSPACE_CASES:
2267         break;
2268 
2269         default:
2270         ADD_NEW(state_offset + 1, 0);
2271         break;
2272         }
2273       break;
2274 
2275       /*-----------------------------------------------------------------*/
2276       case OP_VSPACE:
2277       if (clen > 0) switch(c)
2278         {
2279         VSPACE_CASES:
2280         ADD_NEW(state_offset + 1, 0);
2281         break;
2282 
2283         default:
2284         break;
2285         }
2286       break;
2287 
2288       /*-----------------------------------------------------------------*/
2289       case OP_NOT_HSPACE:
2290       if (clen > 0) switch(c)
2291         {
2292         HSPACE_CASES:
2293         break;
2294 
2295         default:
2296         ADD_NEW(state_offset + 1, 0);
2297         break;
2298         }
2299       break;
2300 
2301       /*-----------------------------------------------------------------*/
2302       case OP_HSPACE:
2303       if (clen > 0) switch(c)
2304         {
2305         HSPACE_CASES:
2306         ADD_NEW(state_offset + 1, 0);
2307         break;
2308 
2309         default:
2310         break;
2311         }
2312       break;
2313 
2314       /*-----------------------------------------------------------------*/
2315       /* Match a negated single character casefully. */
2316 
2317       case OP_NOT:
2318       if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); }
2319       break;
2320 
2321       /*-----------------------------------------------------------------*/
2322       /* Match a negated single character caselessly. */
2323 
2324       case OP_NOTI:
2325       if (clen > 0)
2326         {
2327         uint32_t otherd;
2328 #ifdef SUPPORT_UNICODE
2329         if (utf && d >= 128)
2330           otherd = UCD_OTHERCASE(d);
2331         else
2332 #endif  /* SUPPORT_UNICODE */
2333         otherd = TABLE_GET(d, fcc, d);
2334         if (c != d && c != otherd)
2335           { ADD_NEW(state_offset + dlen + 1, 0); }
2336         }
2337       break;
2338 
2339       /*-----------------------------------------------------------------*/
2340       case OP_PLUSI:
2341       case OP_MINPLUSI:
2342       case OP_POSPLUSI:
2343       case OP_NOTPLUSI:
2344       case OP_NOTMINPLUSI:
2345       case OP_NOTPOSPLUSI:
2346       caseless = TRUE;
2347       codevalue -= OP_STARI - OP_STAR;
2348 
2349       /* Fall through */
2350       case OP_PLUS:
2351       case OP_MINPLUS:
2352       case OP_POSPLUS:
2353       case OP_NOTPLUS:
2354       case OP_NOTMINPLUS:
2355       case OP_NOTPOSPLUS:
2356       count = current_state->count;  /* Already matched */
2357       if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
2358       if (clen > 0)
2359         {
2360         uint32_t otherd = NOTACHAR;
2361         if (caseless)
2362           {
2363 #ifdef SUPPORT_UNICODE
2364           if (utf && d >= 128)
2365             otherd = UCD_OTHERCASE(d);
2366           else
2367 #endif  /* SUPPORT_UNICODE */
2368           otherd = TABLE_GET(d, fcc, d);
2369           }
2370         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2371           {
2372           if (count > 0 &&
2373               (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
2374             {
2375             active_count--;             /* Remove non-match possibility */
2376             next_active_state--;
2377             }
2378           count++;
2379           ADD_NEW(state_offset, count);
2380           }
2381         }
2382       break;
2383 
2384       /*-----------------------------------------------------------------*/
2385       case OP_QUERYI:
2386       case OP_MINQUERYI:
2387       case OP_POSQUERYI:
2388       case OP_NOTQUERYI:
2389       case OP_NOTMINQUERYI:
2390       case OP_NOTPOSQUERYI:
2391       caseless = TRUE;
2392       codevalue -= OP_STARI - OP_STAR;
2393       /* Fall through */
2394       case OP_QUERY:
2395       case OP_MINQUERY:
2396       case OP_POSQUERY:
2397       case OP_NOTQUERY:
2398       case OP_NOTMINQUERY:
2399       case OP_NOTPOSQUERY:
2400       ADD_ACTIVE(state_offset + dlen + 1, 0);
2401       if (clen > 0)
2402         {
2403         uint32_t otherd = NOTACHAR;
2404         if (caseless)
2405           {
2406 #ifdef SUPPORT_UNICODE
2407           if (utf && d >= 128)
2408             otherd = UCD_OTHERCASE(d);
2409           else
2410 #endif  /* SUPPORT_UNICODE */
2411           otherd = TABLE_GET(d, fcc, d);
2412           }
2413         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2414           {
2415           if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
2416             {
2417             active_count--;            /* Remove non-match possibility */
2418             next_active_state--;
2419             }
2420           ADD_NEW(state_offset + dlen + 1, 0);
2421           }
2422         }
2423       break;
2424 
2425       /*-----------------------------------------------------------------*/
2426       case OP_STARI:
2427       case OP_MINSTARI:
2428       case OP_POSSTARI:
2429       case OP_NOTSTARI:
2430       case OP_NOTMINSTARI:
2431       case OP_NOTPOSSTARI:
2432       caseless = TRUE;
2433       codevalue -= OP_STARI - OP_STAR;
2434       /* Fall through */
2435       case OP_STAR:
2436       case OP_MINSTAR:
2437       case OP_POSSTAR:
2438       case OP_NOTSTAR:
2439       case OP_NOTMINSTAR:
2440       case OP_NOTPOSSTAR:
2441       ADD_ACTIVE(state_offset + dlen + 1, 0);
2442       if (clen > 0)
2443         {
2444         uint32_t otherd = NOTACHAR;
2445         if (caseless)
2446           {
2447 #ifdef SUPPORT_UNICODE
2448           if (utf && d >= 128)
2449             otherd = UCD_OTHERCASE(d);
2450           else
2451 #endif  /* SUPPORT_UNICODE */
2452           otherd = TABLE_GET(d, fcc, d);
2453           }
2454         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2455           {
2456           if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
2457             {
2458             active_count--;            /* Remove non-match possibility */
2459             next_active_state--;
2460             }
2461           ADD_NEW(state_offset, 0);
2462           }
2463         }
2464       break;
2465 
2466       /*-----------------------------------------------------------------*/
2467       case OP_EXACTI:
2468       case OP_NOTEXACTI:
2469       caseless = TRUE;
2470       codevalue -= OP_STARI - OP_STAR;
2471       /* Fall through */
2472       case OP_EXACT:
2473       case OP_NOTEXACT:
2474       count = current_state->count;  /* Number already matched */
2475       if (clen > 0)
2476         {
2477         uint32_t otherd = NOTACHAR;
2478         if (caseless)
2479           {
2480 #ifdef SUPPORT_UNICODE
2481           if (utf && d >= 128)
2482             otherd = UCD_OTHERCASE(d);
2483           else
2484 #endif  /* SUPPORT_UNICODE */
2485           otherd = TABLE_GET(d, fcc, d);
2486           }
2487         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2488           {
2489           if (++count >= (int)GET2(code, 1))
2490             { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2491           else
2492             { ADD_NEW(state_offset, count); }
2493           }
2494         }
2495       break;
2496 
2497       /*-----------------------------------------------------------------*/
2498       case OP_UPTOI:
2499       case OP_MINUPTOI:
2500       case OP_POSUPTOI:
2501       case OP_NOTUPTOI:
2502       case OP_NOTMINUPTOI:
2503       case OP_NOTPOSUPTOI:
2504       caseless = TRUE;
2505       codevalue -= OP_STARI - OP_STAR;
2506       /* Fall through */
2507       case OP_UPTO:
2508       case OP_MINUPTO:
2509       case OP_POSUPTO:
2510       case OP_NOTUPTO:
2511       case OP_NOTMINUPTO:
2512       case OP_NOTPOSUPTO:
2513       ADD_ACTIVE(state_offset + dlen + 1 + IMM2_SIZE, 0);
2514       count = current_state->count;  /* Number already matched */
2515       if (clen > 0)
2516         {
2517         uint32_t otherd = NOTACHAR;
2518         if (caseless)
2519           {
2520 #ifdef SUPPORT_UNICODE
2521           if (utf && d >= 128)
2522             otherd = UCD_OTHERCASE(d);
2523           else
2524 #endif  /* SUPPORT_UNICODE */
2525           otherd = TABLE_GET(d, fcc, d);
2526           }
2527         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2528           {
2529           if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2530             {
2531             active_count--;             /* Remove non-match possibility */
2532             next_active_state--;
2533             }
2534           if (++count >= (int)GET2(code, 1))
2535             { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2536           else
2537             { ADD_NEW(state_offset, count); }
2538           }
2539         }
2540       break;
2541 
2542 
2543 /* ========================================================================== */
2544       /* These are the class-handling opcodes */
2545 
2546       case OP_CLASS:
2547       case OP_NCLASS:
2548       case OP_XCLASS:
2549         {
2550         BOOL isinclass = FALSE;
2551         int next_state_offset;
2552         PCRE2_SPTR ecode;
2553 
2554         /* For a simple class, there is always just a 32-byte table, and we
2555         can set isinclass from it. */
2556 
2557         if (codevalue != OP_XCLASS)
2558           {
2559           ecode = code + 1 + (32 / sizeof(PCRE2_UCHAR));
2560           if (clen > 0)
2561             {
2562             isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2563               ((((uint8_t *)(code + 1))[c/8] & (1 << (c&7))) != 0);
2564             }
2565           }
2566 
2567         /* An extended class may have a table or a list of single characters,
2568         ranges, or both, and it may be positive or negative. There's a
2569         function that sorts all this out. */
2570 
2571         else
2572          {
2573          ecode = code + GET(code, 1);
2574          if (clen > 0) isinclass = PRIV(xclass)(c, code + 1 + LINK_SIZE, utf);
2575          }
2576 
2577         /* At this point, isinclass is set for all kinds of class, and ecode
2578         points to the byte after the end of the class. If there is a
2579         quantifier, this is where it will be. */
2580 
2581         next_state_offset = (int)(ecode - start_code);
2582 
2583         switch (*ecode)
2584           {
2585           case OP_CRSTAR:
2586           case OP_CRMINSTAR:
2587           case OP_CRPOSSTAR:
2588           ADD_ACTIVE(next_state_offset + 1, 0);
2589           if (isinclass)
2590             {
2591             if (*ecode == OP_CRPOSSTAR)
2592               {
2593               active_count--;           /* Remove non-match possibility */
2594               next_active_state--;
2595               }
2596             ADD_NEW(state_offset, 0);
2597             }
2598           break;
2599 
2600           case OP_CRPLUS:
2601           case OP_CRMINPLUS:
2602           case OP_CRPOSPLUS:
2603           count = current_state->count;  /* Already matched */
2604           if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2605           if (isinclass)
2606             {
2607             if (count > 0 && *ecode == OP_CRPOSPLUS)
2608               {
2609               active_count--;           /* Remove non-match possibility */
2610               next_active_state--;
2611               }
2612             count++;
2613             ADD_NEW(state_offset, count);
2614             }
2615           break;
2616 
2617           case OP_CRQUERY:
2618           case OP_CRMINQUERY:
2619           case OP_CRPOSQUERY:
2620           ADD_ACTIVE(next_state_offset + 1, 0);
2621           if (isinclass)
2622             {
2623             if (*ecode == OP_CRPOSQUERY)
2624               {
2625               active_count--;           /* Remove non-match possibility */
2626               next_active_state--;
2627               }
2628             ADD_NEW(next_state_offset + 1, 0);
2629             }
2630           break;
2631 
2632           case OP_CRRANGE:
2633           case OP_CRMINRANGE:
2634           case OP_CRPOSRANGE:
2635           count = current_state->count;  /* Already matched */
2636           if (count >= (int)GET2(ecode, 1))
2637             { ADD_ACTIVE(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2638           if (isinclass)
2639             {
2640             int max = (int)GET2(ecode, 1 + IMM2_SIZE);
2641 
2642             if (*ecode == OP_CRPOSRANGE && count >= (int)GET2(ecode, 1))
2643               {
2644               active_count--;           /* Remove non-match possibility */
2645               next_active_state--;
2646               }
2647 
2648             if (++count >= max && max != 0)   /* Max 0 => no limit */
2649               { ADD_NEW(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2650             else
2651               { ADD_NEW(state_offset, count); }
2652             }
2653           break;
2654 
2655           default:
2656           if (isinclass) { ADD_NEW(next_state_offset, 0); }
2657           break;
2658           }
2659         }
2660       break;
2661 
2662 /* ========================================================================== */
2663       /* These are the opcodes for fancy brackets of various kinds. We have
2664       to use recursion in order to handle them. The "always failing" assertion
2665       (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2666       though the other "backtracking verbs" are not supported. */
2667 
2668       case OP_FAIL:
2669       forced_fail++;    /* Count FAILs for multiple states */
2670       break;
2671 
2672       case OP_ASSERT:
2673       case OP_ASSERT_NOT:
2674       case OP_ASSERTBACK:
2675       case OP_ASSERTBACK_NOT:
2676         {
2677         int rc;
2678         int *local_workspace;
2679         PCRE2_SIZE *local_offsets;
2680         PCRE2_SPTR endasscode = code + GET(code, 1);
2681         RWS_anchor *rws = (RWS_anchor *)RWS;
2682 
2683         if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE)
2684           {
2685           rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb);
2686           if (rc != 0) return rc;
2687           RWS = (int *)rws;
2688           }
2689 
2690         local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
2691         local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE;
2692         rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE;
2693 
2694         while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2695 
2696         rc = internal_dfa_match(
2697           mb,                                   /* static match data */
2698           code,                                 /* this subexpression's code */
2699           ptr,                                  /* where we currently are */
2700           (PCRE2_SIZE)(ptr - start_subject),    /* start offset */
2701           local_offsets,                        /* offset vector */
2702           RWS_OVEC_OSIZE/OVEC_UNIT,             /* size of same */
2703           local_workspace,                      /* workspace vector */
2704           RWS_RSIZE,                            /* size of same */
2705           rlevel,                               /* function recursion level */
2706           RWS);                                 /* recursion workspace */
2707 
2708         rws->free += RWS_RSIZE + RWS_OVEC_OSIZE;
2709 
2710         if (rc < 0 && rc != PCRE2_ERROR_NOMATCH) return rc;
2711         if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2712             { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2713         }
2714       break;
2715 
2716       /*-----------------------------------------------------------------*/
2717       case OP_COND:
2718       case OP_SCOND:
2719         {
2720         int codelink = (int)GET(code, 1);
2721         PCRE2_UCHAR condcode;
2722 
2723         /* Because of the way auto-callout works during compile, a callout item
2724         is inserted between OP_COND and an assertion condition. This does not
2725         happen for the other conditions. */
2726 
2727         if (code[LINK_SIZE + 1] == OP_CALLOUT
2728             || code[LINK_SIZE + 1] == OP_CALLOUT_STR)
2729           {
2730           PCRE2_SIZE callout_length;
2731           rrc = do_callout(code, offsets, current_subject, ptr, mb,
2732             1 + LINK_SIZE, &callout_length);
2733           if (rrc < 0) return rrc;                 /* Abandon */
2734           if (rrc > 0) break;                      /* Fail this thread */
2735           code += callout_length;                  /* Skip callout data */
2736           }
2737 
2738         condcode = code[LINK_SIZE+1];
2739 
2740         /* Back reference conditions and duplicate named recursion conditions
2741         are not supported */
2742 
2743         if (condcode == OP_CREF || condcode == OP_DNCREF ||
2744             condcode == OP_DNRREF)
2745           return PCRE2_ERROR_DFA_UCOND;
2746 
2747         /* The DEFINE condition is always false, and the assertion (?!) is
2748         converted to OP_FAIL. */
2749 
2750         if (condcode == OP_FALSE || condcode == OP_FAIL)
2751           { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2752 
2753         /* There is also an always-true condition */
2754 
2755         else if (condcode == OP_TRUE)
2756           { ADD_ACTIVE(state_offset + LINK_SIZE + 2 + IMM2_SIZE, 0); }
2757 
2758         /* The only supported version of OP_RREF is for the value RREF_ANY,
2759         which means "test if in any recursion". We can't test for specifically
2760         recursed groups. */
2761 
2762         else if (condcode == OP_RREF)
2763           {
2764           unsigned int value = GET2(code, LINK_SIZE + 2);
2765           if (value != RREF_ANY) return PCRE2_ERROR_DFA_UCOND;
2766           if (mb->recursive != NULL)
2767             { ADD_ACTIVE(state_offset + LINK_SIZE + 2 + IMM2_SIZE, 0); }
2768           else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2769           }
2770 
2771         /* Otherwise, the condition is an assertion */
2772 
2773         else
2774           {
2775           int rc;
2776           int *local_workspace;
2777           PCRE2_SIZE *local_offsets;
2778           PCRE2_SPTR asscode = code + LINK_SIZE + 1;
2779           PCRE2_SPTR endasscode = asscode + GET(asscode, 1);
2780           RWS_anchor *rws = (RWS_anchor *)RWS;
2781 
2782           if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE)
2783             {
2784             rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb);
2785             if (rc != 0) return rc;
2786             RWS = (int *)rws;
2787             }
2788 
2789           local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
2790           local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE;
2791           rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE;
2792 
2793           while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2794 
2795           rc = internal_dfa_match(
2796             mb,                                   /* fixed match data */
2797             asscode,                              /* this subexpression's code */
2798             ptr,                                  /* where we currently are */
2799             (PCRE2_SIZE)(ptr - start_subject),    /* start offset */
2800             local_offsets,                        /* offset vector */
2801             RWS_OVEC_OSIZE/OVEC_UNIT,             /* size of same */
2802             local_workspace,                      /* workspace vector */
2803             RWS_RSIZE,                            /* size of same */
2804             rlevel,                               /* function recursion level */
2805             RWS);                                 /* recursion workspace */
2806 
2807           rws->free += RWS_RSIZE + RWS_OVEC_OSIZE;
2808 
2809           if (rc < 0 && rc != PCRE2_ERROR_NOMATCH) return rc;
2810           if ((rc >= 0) ==
2811                 (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2812             { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2813           else
2814             { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2815           }
2816         }
2817       break;
2818 
2819       /*-----------------------------------------------------------------*/
2820       case OP_RECURSE:
2821         {
2822         int rc;
2823         int *local_workspace;
2824         PCRE2_SIZE *local_offsets;
2825         RWS_anchor *rws = (RWS_anchor *)RWS;
2826         dfa_recursion_info *ri;
2827         PCRE2_SPTR callpat = start_code + GET(code, 1);
2828         uint32_t recno = (callpat == mb->start_code)? 0 :
2829           GET2(callpat, 1 + LINK_SIZE);
2830 
2831         if (rws->free < RWS_RSIZE + RWS_OVEC_RSIZE)
2832           {
2833           rc = more_workspace(&rws, RWS_OVEC_RSIZE, mb);
2834           if (rc != 0) return rc;
2835           RWS = (int *)rws;
2836           }
2837 
2838         local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
2839         local_workspace = ((int *)local_offsets) + RWS_OVEC_RSIZE;
2840         rws->free -= RWS_RSIZE + RWS_OVEC_RSIZE;
2841 
2842         /* Check for repeating a recursion without advancing the subject
2843         pointer. This should catch convoluted mutual recursions. (Some simple
2844         cases are caught at compile time.) */
2845 
2846         for (ri = mb->recursive; ri != NULL; ri = ri->prevrec)
2847           if (recno == ri->group_num && ptr == ri->subject_position)
2848             return PCRE2_ERROR_RECURSELOOP;
2849 
2850         /* Remember this recursion and where we started it so as to
2851         catch infinite loops. */
2852 
2853         new_recursive.group_num = recno;
2854         new_recursive.subject_position = ptr;
2855         new_recursive.prevrec = mb->recursive;
2856         mb->recursive = &new_recursive;
2857 
2858         rc = internal_dfa_match(
2859           mb,                                   /* fixed match data */
2860           callpat,                              /* this subexpression's code */
2861           ptr,                                  /* where we currently are */
2862           (PCRE2_SIZE)(ptr - start_subject),    /* start offset */
2863           local_offsets,                        /* offset vector */
2864           RWS_OVEC_RSIZE/OVEC_UNIT,             /* size of same */
2865           local_workspace,                      /* workspace vector */
2866           RWS_RSIZE,                            /* size of same */
2867           rlevel,                               /* function recursion level */
2868           RWS);                                 /* recursion workspace */
2869 
2870         rws->free += RWS_RSIZE + RWS_OVEC_RSIZE;
2871         mb->recursive = new_recursive.prevrec;  /* Done this recursion */
2872 
2873         /* Ran out of internal offsets */
2874 
2875         if (rc == 0) return PCRE2_ERROR_DFA_RECURSE;
2876 
2877         /* For each successful matched substring, set up the next state with a
2878         count of characters to skip before trying it. Note that the count is in
2879         characters, not bytes. */
2880 
2881         if (rc > 0)
2882           {
2883           for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2884             {
2885             PCRE2_SIZE charcount = local_offsets[rc+1] - local_offsets[rc];
2886 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
2887             if (utf)
2888               {
2889               PCRE2_SPTR p = start_subject + local_offsets[rc];
2890               PCRE2_SPTR pp = start_subject + local_offsets[rc+1];
2891               while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--;
2892               }
2893 #endif
2894             if (charcount > 0)
2895               {
2896               ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0,
2897                 (int)(charcount - 1));
2898               }
2899             else
2900               {
2901               ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2902               }
2903             }
2904           }
2905         else if (rc != PCRE2_ERROR_NOMATCH) return rc;
2906         }
2907       break;
2908 
2909       /*-----------------------------------------------------------------*/
2910       case OP_BRAPOS:
2911       case OP_SBRAPOS:
2912       case OP_CBRAPOS:
2913       case OP_SCBRAPOS:
2914       case OP_BRAPOSZERO:
2915         {
2916         int rc;
2917         int *local_workspace;
2918         PCRE2_SIZE *local_offsets;
2919         PCRE2_SIZE charcount, matched_count;
2920         PCRE2_SPTR local_ptr = ptr;
2921         RWS_anchor *rws = (RWS_anchor *)RWS;
2922         BOOL allow_zero;
2923 
2924         if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE)
2925           {
2926           rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb);
2927           if (rc != 0) return rc;
2928           RWS = (int *)rws;
2929           }
2930 
2931         local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
2932         local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE;
2933         rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE;
2934 
2935         if (codevalue == OP_BRAPOSZERO)
2936           {
2937           allow_zero = TRUE;
2938           codevalue = *(++code);  /* Codevalue will be one of above BRAs */
2939           }
2940         else allow_zero = FALSE;
2941 
2942         /* Loop to match the subpattern as many times as possible as if it were
2943         a complete pattern. */
2944 
2945         for (matched_count = 0;; matched_count++)
2946           {
2947           rc = internal_dfa_match(
2948             mb,                                   /* fixed match data */
2949             code,                                 /* this subexpression's code */
2950             local_ptr,                            /* where we currently are */
2951             (PCRE2_SIZE)(ptr - start_subject),    /* start offset */
2952             local_offsets,                        /* offset vector */
2953             RWS_OVEC_OSIZE/OVEC_UNIT,             /* size of same */
2954             local_workspace,                      /* workspace vector */
2955             RWS_RSIZE,                            /* size of same */
2956             rlevel,                               /* function recursion level */
2957             RWS);                                 /* recursion workspace */
2958 
2959           /* Failed to match */
2960 
2961           if (rc < 0)
2962             {
2963             if (rc != PCRE2_ERROR_NOMATCH) return rc;
2964             break;
2965             }
2966 
2967           /* Matched: break the loop if zero characters matched. */
2968 
2969           charcount = local_offsets[1] - local_offsets[0];
2970           if (charcount == 0) break;
2971           local_ptr += charcount;    /* Advance temporary position ptr */
2972           }
2973 
2974         rws->free += RWS_RSIZE + RWS_OVEC_OSIZE;
2975 
2976         /* At this point we have matched the subpattern matched_count
2977         times, and local_ptr is pointing to the character after the end of the
2978         last match. */
2979 
2980         if (matched_count > 0 || allow_zero)
2981           {
2982           PCRE2_SPTR end_subpattern = code;
2983           int next_state_offset;
2984 
2985           do { end_subpattern += GET(end_subpattern, 1); }
2986             while (*end_subpattern == OP_ALT);
2987           next_state_offset =
2988             (int)(end_subpattern - start_code + LINK_SIZE + 1);
2989 
2990           /* Optimization: if there are no more active states, and there
2991           are no new states yet set up, then skip over the subject string
2992           right here, to save looping. Otherwise, set up the new state to swing
2993           into action when the end of the matched substring is reached. */
2994 
2995           if (i + 1 >= active_count && new_count == 0)
2996             {
2997             ptr = local_ptr;
2998             clen = 0;
2999             ADD_NEW(next_state_offset, 0);
3000             }
3001           else
3002             {
3003             PCRE2_SPTR p = ptr;
3004             PCRE2_SPTR pp = local_ptr;
3005             charcount = (PCRE2_SIZE)(pp - p);
3006 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
3007             if (utf) while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--;
3008 #endif
3009             ADD_NEW_DATA(-next_state_offset, 0, (int)(charcount - 1));
3010             }
3011           }
3012         }
3013       break;
3014 
3015       /*-----------------------------------------------------------------*/
3016       case OP_ONCE:
3017         {
3018         int rc;
3019         int *local_workspace;
3020         PCRE2_SIZE *local_offsets;
3021         RWS_anchor *rws = (RWS_anchor *)RWS;
3022 
3023         if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE)
3024           {
3025           rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb);
3026           if (rc != 0) return rc;
3027           RWS = (int *)rws;
3028           }
3029 
3030         local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
3031         local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE;
3032         rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE;
3033 
3034         rc = internal_dfa_match(
3035           mb,                                   /* fixed match data */
3036           code,                                 /* this subexpression's code */
3037           ptr,                                  /* where we currently are */
3038           (PCRE2_SIZE)(ptr - start_subject),    /* start offset */
3039           local_offsets,                        /* offset vector */
3040           RWS_OVEC_OSIZE/OVEC_UNIT,             /* size of same */
3041           local_workspace,                      /* workspace vector */
3042           RWS_RSIZE,                            /* size of same */
3043           rlevel,                               /* function recursion level */
3044           RWS);                                 /* recursion workspace */
3045 
3046         rws->free += RWS_RSIZE + RWS_OVEC_OSIZE;
3047 
3048         if (rc >= 0)
3049           {
3050           PCRE2_SPTR end_subpattern = code;
3051           PCRE2_SIZE charcount = local_offsets[1] - local_offsets[0];
3052           int next_state_offset, repeat_state_offset;
3053 
3054           do { end_subpattern += GET(end_subpattern, 1); }
3055             while (*end_subpattern == OP_ALT);
3056           next_state_offset =
3057             (int)(end_subpattern - start_code + LINK_SIZE + 1);
3058 
3059           /* If the end of this subpattern is KETRMAX or KETRMIN, we must
3060           arrange for the repeat state also to be added to the relevant list.
3061           Calculate the offset, or set -1 for no repeat. */
3062 
3063           repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
3064                                  *end_subpattern == OP_KETRMIN)?
3065             (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1;
3066 
3067           /* If we have matched an empty string, add the next state at the
3068           current character pointer. This is important so that the duplicate
3069           checking kicks in, which is what breaks infinite loops that match an
3070           empty string. */
3071 
3072           if (charcount == 0)
3073             {
3074             ADD_ACTIVE(next_state_offset, 0);
3075             }
3076 
3077           /* Optimization: if there are no more active states, and there
3078           are no new states yet set up, then skip over the subject string
3079           right here, to save looping. Otherwise, set up the new state to swing
3080           into action when the end of the matched substring is reached. */
3081 
3082           else if (i + 1 >= active_count && new_count == 0)
3083             {
3084             ptr += charcount;
3085             clen = 0;
3086             ADD_NEW(next_state_offset, 0);
3087 
3088             /* If we are adding a repeat state at the new character position,
3089             we must fudge things so that it is the only current state.
3090             Otherwise, it might be a duplicate of one we processed before, and
3091             that would cause it to be skipped. */
3092 
3093             if (repeat_state_offset >= 0)
3094               {
3095               next_active_state = active_states;
3096               active_count = 0;
3097               i = -1;
3098               ADD_ACTIVE(repeat_state_offset, 0);
3099               }
3100             }
3101           else
3102             {
3103 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
3104             if (utf)
3105               {
3106               PCRE2_SPTR p = start_subject + local_offsets[0];
3107               PCRE2_SPTR pp = start_subject + local_offsets[1];
3108               while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--;
3109               }
3110 #endif
3111             ADD_NEW_DATA(-next_state_offset, 0, (int)(charcount - 1));
3112             if (repeat_state_offset >= 0)
3113               { ADD_NEW_DATA(-repeat_state_offset, 0, (int)(charcount - 1)); }
3114             }
3115           }
3116         else if (rc != PCRE2_ERROR_NOMATCH) return rc;
3117         }
3118       break;
3119 
3120 
3121 /* ========================================================================== */
3122       /* Handle callouts */
3123 
3124       case OP_CALLOUT:
3125       case OP_CALLOUT_STR:
3126         {
3127         PCRE2_SIZE callout_length;
3128         rrc = do_callout(code, offsets, current_subject, ptr, mb, 0,
3129           &callout_length);
3130         if (rrc < 0) return rrc;   /* Abandon */
3131         if (rrc == 0)
3132           { ADD_ACTIVE(state_offset + (int)callout_length, 0); }
3133         }
3134       break;
3135 
3136 
3137 /* ========================================================================== */
3138       default:        /* Unsupported opcode */
3139       return PCRE2_ERROR_DFA_UITEM;
3140       }
3141 
3142     NEXT_ACTIVE_STATE: continue;
3143 
3144     }      /* End of loop scanning active states */
3145 
3146   /* We have finished the processing at the current subject character. If no
3147   new states have been set for the next character, we have found all the
3148   matches that we are going to find. If we are at the top level and partial
3149   matching has been requested, check for appropriate conditions.
3150 
3151   The "forced_fail" variable counts the number of (*F) encountered for the
3152   character. If it is equal to the original active_count (saved in
3153   workspace[1]) it means that (*F) was found on every active state. In this
3154   case we don't want to give a partial match.
3155 
3156   The "could_continue" variable is true if a state could have continued but
3157   for the fact that the end of the subject was reached. */
3158 
3159   if (new_count <= 0)
3160     {
3161     if (rlevel == 1 &&                               /* Top level, and */
3162         could_continue &&                            /* Some could go on, and */
3163         forced_fail != workspace[1] &&               /* Not all forced fail & */
3164         (                                            /* either... */
3165         (mb->moptions & PCRE2_PARTIAL_HARD) != 0      /* Hard partial */
3166         ||                                           /* or... */
3167         ((mb->moptions & PCRE2_PARTIAL_SOFT) != 0 &&  /* Soft partial and */
3168          match_count < 0)                            /* no matches */
3169         ) &&                                         /* And... */
3170         (
3171         partial_newline ||                           /* Either partial NL */
3172           (                                          /* or ... */
3173           ptr >= end_subject &&                /* End of subject and */
3174           ptr > mb->start_used_ptr)            /* Inspected non-empty string */
3175           )
3176         )
3177       match_count = PCRE2_ERROR_PARTIAL;
3178     break;  /* Exit from loop along the subject string */
3179     }
3180 
3181   /* One or more states are active for the next character. */
3182 
3183   ptr += clen;    /* Advance to next subject character */
3184   }               /* Loop to move along the subject string */
3185 
3186 /* Control gets here from "break" a few lines above. If we have a match,
3187 PCRE2_ENDANCHORED is set, and ptr is before end_subject, the match fails. */
3188 
3189 if (match_count >= 0 &&
3190     ((mb->moptions | mb->poptions) & PCRE2_ENDANCHORED) != 0 &&
3191     ptr < end_subject)
3192   match_count = PCRE2_ERROR_NOMATCH;
3193 
3194 return match_count;
3195 }
3196 
3197 
3198 
3199 /*************************************************
3200 *     Match a pattern using the DFA algorithm    *
3201 *************************************************/
3202 
3203 /* This function matches a compiled pattern to a subject string, using the
3204 alternate matching algorithm that finds all matches at once.
3205 
3206 Arguments:
3207   code          points to the compiled pattern
3208   subject       subject string
3209   length        length of subject string
3210   start_offset  where to start matching in the subject
3211   options       option bits
3212   match_data    points to a match data structure
3213   mcontext      points to a match context
3214   workspace     pointer to workspace
3215   wscount       size of workspace
3216 
3217 Returns:        > 0 => number of match offset pairs placed in offsets
3218                 = 0 => offsets overflowed; longest matches are present
3219                  -1 => failed to match
3220                < -1 => some kind of unexpected problem
3221 */
3222 
3223 PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
pcre2_dfa_match(const pcre2_code * code,PCRE2_SPTR subject,PCRE2_SIZE length,PCRE2_SIZE start_offset,uint32_t options,pcre2_match_data * match_data,pcre2_match_context * mcontext,int * workspace,PCRE2_SIZE wscount)3224 pcre2_dfa_match(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length,
3225   PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data,
3226   pcre2_match_context *mcontext, int *workspace, PCRE2_SIZE wscount)
3227 {
3228 int rc;
3229 const pcre2_real_code *re = (const pcre2_real_code *)code;
3230 
3231 PCRE2_SPTR start_match;
3232 PCRE2_SPTR end_subject;
3233 PCRE2_SPTR bumpalong_limit;
3234 PCRE2_SPTR req_cu_ptr;
3235 
3236 BOOL utf, anchored, startline, firstline;
3237 BOOL has_first_cu = FALSE;
3238 BOOL has_req_cu = FALSE;
3239 
3240 PCRE2_UCHAR first_cu = 0;
3241 PCRE2_UCHAR first_cu2 = 0;
3242 PCRE2_UCHAR req_cu = 0;
3243 PCRE2_UCHAR req_cu2 = 0;
3244 
3245 const uint8_t *start_bits = NULL;
3246 
3247 /* We need to have mb pointing to a match block, because the IS_NEWLINE macro
3248 is used below, and it expects NLBLOCK to be defined as a pointer. */
3249 
3250 pcre2_callout_block cb;
3251 dfa_match_block actual_match_block;
3252 dfa_match_block *mb = &actual_match_block;
3253 
3254 /* Set up a starting block of memory for use during recursive calls to
3255 internal_dfa_match(). By putting this on the stack, it minimizes resource use
3256 in the case when it is not needed. If this is too small, more memory is
3257 obtained from the heap. At the start of each block is an anchor structure.*/
3258 
3259 int base_recursion_workspace[RWS_BASE_SIZE];
3260 RWS_anchor *rws = (RWS_anchor *)base_recursion_workspace;
3261 rws->next = NULL;
3262 rws->size = RWS_BASE_SIZE;
3263 rws->free = RWS_BASE_SIZE - RWS_ANCHOR_SIZE;
3264 
3265 /* A length equal to PCRE2_ZERO_TERMINATED implies a zero-terminated
3266 subject string. */
3267 
3268 if (length == PCRE2_ZERO_TERMINATED) length = PRIV(strlen)(subject);
3269 
3270 /* Plausibility checks */
3271 
3272 if ((options & ~PUBLIC_DFA_MATCH_OPTIONS) != 0) return PCRE2_ERROR_BADOPTION;
3273 if (re == NULL || subject == NULL || workspace == NULL || match_data == NULL)
3274   return PCRE2_ERROR_NULL;
3275 if (wscount < 20) return PCRE2_ERROR_DFA_WSSIZE;
3276 if (start_offset > length) return PCRE2_ERROR_BADOFFSET;
3277 
3278 /* Partial matching and PCRE2_ENDANCHORED are currently not allowed at the same
3279 time. */
3280 
3281 if ((options & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0 &&
3282    ((re->overall_options | options) & PCRE2_ENDANCHORED) != 0)
3283   return PCRE2_ERROR_BADOPTION;
3284 
3285 /* Check that the first field in the block is the magic number. If it is not,
3286 return with PCRE2_ERROR_BADMAGIC. */
3287 
3288 if (re->magic_number != MAGIC_NUMBER) return PCRE2_ERROR_BADMAGIC;
3289 
3290 /* Check the code unit width. */
3291 
3292 if ((re->flags & PCRE2_MODE_MASK) != PCRE2_CODE_UNIT_WIDTH/8)
3293   return PCRE2_ERROR_BADMODE;
3294 
3295 /* PCRE2_NOTEMPTY and PCRE2_NOTEMPTY_ATSTART are match-time flags in the
3296 options variable for this function. Users of PCRE2 who are not calling the
3297 function directly would like to have a way of setting these flags, in the same
3298 way that they can set pcre2_compile() flags like PCRE2_NO_AUTOPOSSESS with
3299 constructions like (*NO_AUTOPOSSESS). To enable this, (*NOTEMPTY) and
3300 (*NOTEMPTY_ATSTART) set bits in the pattern's "flag" function which can now be
3301 transferred to the options for this function. The bits are guaranteed to be
3302 adjacent, but do not have the same values. This bit of Boolean trickery assumes
3303 that the match-time bits are not more significant than the flag bits. If by
3304 accident this is not the case, a compile-time division by zero error will
3305 occur. */
3306 
3307 #define FF (PCRE2_NOTEMPTY_SET|PCRE2_NE_ATST_SET)
3308 #define OO (PCRE2_NOTEMPTY|PCRE2_NOTEMPTY_ATSTART)
3309 options |= (re->flags & FF) / ((FF & (~FF+1)) / (OO & (~OO+1)));
3310 #undef FF
3311 #undef OO
3312 
3313 /* If restarting after a partial match, do some sanity checks on the contents
3314 of the workspace. */
3315 
3316 if ((options & PCRE2_DFA_RESTART) != 0)
3317   {
3318   if ((workspace[0] & (-2)) != 0 || workspace[1] < 1 ||
3319     workspace[1] > (int)((wscount - 2)/INTS_PER_STATEBLOCK))
3320       return PCRE2_ERROR_DFA_BADRESTART;
3321   }
3322 
3323 /* Set some local values */
3324 
3325 utf = (re->overall_options & PCRE2_UTF) != 0;
3326 start_match = subject + start_offset;
3327 end_subject = subject + length;
3328 req_cu_ptr = start_match - 1;
3329 anchored = (options & (PCRE2_ANCHORED|PCRE2_DFA_RESTART)) != 0 ||
3330   (re->overall_options & PCRE2_ANCHORED) != 0;
3331 
3332 /* The "must be at the start of a line" flags are used in a loop when finding
3333 where to start. */
3334 
3335 startline = (re->flags & PCRE2_STARTLINE) != 0;
3336 firstline = (re->overall_options & PCRE2_FIRSTLINE) != 0;
3337 bumpalong_limit = end_subject;
3338 
3339 /* Initialize and set up the fixed fields in the callout block, with a pointer
3340 in the match block. */
3341 
3342 mb->cb = &cb;
3343 cb.version = 2;
3344 cb.subject = subject;
3345 cb.subject_length = (PCRE2_SIZE)(end_subject - subject);
3346 cb.callout_flags = 0;
3347 cb.capture_top      = 1;      /* No capture support */
3348 cb.capture_last     = 0;
3349 cb.mark             = NULL;   /* No (*MARK) support */
3350 
3351 /* Get data from the match context, if present, and fill in the remaining
3352 fields in the match block. It is an error to set an offset limit without
3353 setting the flag at compile time. */
3354 
3355 if (mcontext == NULL)
3356   {
3357   mb->callout = NULL;
3358   mb->memctl = re->memctl;
3359   mb->match_limit = PRIV(default_match_context).match_limit;
3360   mb->match_limit_depth = PRIV(default_match_context).depth_limit;
3361   mb->heap_limit = PRIV(default_match_context).heap_limit;
3362   }
3363 else
3364   {
3365   if (mcontext->offset_limit != PCRE2_UNSET)
3366     {
3367     if ((re->overall_options & PCRE2_USE_OFFSET_LIMIT) == 0)
3368       return PCRE2_ERROR_BADOFFSETLIMIT;
3369     bumpalong_limit = subject + mcontext->offset_limit;
3370     }
3371   mb->callout = mcontext->callout;
3372   mb->callout_data = mcontext->callout_data;
3373   mb->memctl = mcontext->memctl;
3374   mb->match_limit = mcontext->match_limit;
3375   mb->match_limit_depth = mcontext->depth_limit;
3376   mb->heap_limit = mcontext->heap_limit;
3377   }
3378 
3379 if (mb->match_limit > re->limit_match)
3380   mb->match_limit = re->limit_match;
3381 
3382 if (mb->match_limit_depth > re->limit_depth)
3383   mb->match_limit_depth = re->limit_depth;
3384 
3385 if (mb->heap_limit > re->limit_heap)
3386   mb->heap_limit = re->limit_heap;
3387 
3388 mb->start_code = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)) +
3389   re->name_count * re->name_entry_size;
3390 mb->tables = re->tables;
3391 mb->start_subject = subject;
3392 mb->end_subject = end_subject;
3393 mb->start_offset = start_offset;
3394 mb->moptions = options;
3395 mb->poptions = re->overall_options;
3396 mb->match_call_count = 0;
3397 mb->heap_used = 0;
3398 
3399 /* Process the \R and newline settings. */
3400 
3401 mb->bsr_convention = re->bsr_convention;
3402 mb->nltype = NLTYPE_FIXED;
3403 switch(re->newline_convention)
3404   {
3405   case PCRE2_NEWLINE_CR:
3406   mb->nllen = 1;
3407   mb->nl[0] = CHAR_CR;
3408   break;
3409 
3410   case PCRE2_NEWLINE_LF:
3411   mb->nllen = 1;
3412   mb->nl[0] = CHAR_NL;
3413   break;
3414 
3415   case PCRE2_NEWLINE_NUL:
3416   mb->nllen = 1;
3417   mb->nl[0] = CHAR_NUL;
3418   break;
3419 
3420   case PCRE2_NEWLINE_CRLF:
3421   mb->nllen = 2;
3422   mb->nl[0] = CHAR_CR;
3423   mb->nl[1] = CHAR_NL;
3424   break;
3425 
3426   case PCRE2_NEWLINE_ANY:
3427   mb->nltype = NLTYPE_ANY;
3428   break;
3429 
3430   case PCRE2_NEWLINE_ANYCRLF:
3431   mb->nltype = NLTYPE_ANYCRLF;
3432   break;
3433 
3434   default: return PCRE2_ERROR_INTERNAL;
3435   }
3436 
3437 /* Check a UTF string for validity if required. For 8-bit and 16-bit strings,
3438 we must also check that a starting offset does not point into the middle of a
3439 multiunit character. We check only the portion of the subject that is going to
3440 be inspected during matching - from the offset minus the maximum back reference
3441 to the given length. This saves time when a small part of a large subject is
3442 being matched by the use of a starting offset. Note that the maximum lookbehind
3443 is a number of characters, not code units. */
3444 
3445 #ifdef SUPPORT_UNICODE
3446 if (utf && (options & PCRE2_NO_UTF_CHECK) == 0)
3447   {
3448   PCRE2_SPTR check_subject = start_match;  /* start_match includes offset */
3449 
3450   if (start_offset > 0)
3451     {
3452 #if PCRE2_CODE_UNIT_WIDTH != 32
3453     unsigned int i;
3454     if (start_match < end_subject && NOT_FIRSTCU(*start_match))
3455       return PCRE2_ERROR_BADUTFOFFSET;
3456     for (i = re->max_lookbehind; i > 0 && check_subject > subject; i--)
3457       {
3458       check_subject--;
3459       while (check_subject > subject &&
3460 #if PCRE2_CODE_UNIT_WIDTH == 8
3461       (*check_subject & 0xc0) == 0x80)
3462 #else  /* 16-bit */
3463       (*check_subject & 0xfc00) == 0xdc00)
3464 #endif /* PCRE2_CODE_UNIT_WIDTH == 8 */
3465         check_subject--;
3466       }
3467 #else   /* In the 32-bit library, one code unit equals one character. */
3468     check_subject -= re->max_lookbehind;
3469     if (check_subject < subject) check_subject = subject;
3470 #endif  /* PCRE2_CODE_UNIT_WIDTH != 32 */
3471     }
3472 
3473   /* Validate the relevant portion of the subject. After an error, adjust the
3474   offset to be an absolute offset in the whole string. */
3475 
3476   match_data->rc = PRIV(valid_utf)(check_subject,
3477     length - (PCRE2_SIZE)(check_subject - subject), &(match_data->startchar));
3478   if (match_data->rc != 0)
3479     {
3480     match_data->startchar += (PCRE2_SIZE)(check_subject - subject);
3481     return match_data->rc;
3482     }
3483   }
3484 #endif  /* SUPPORT_UNICODE */
3485 
3486 /* Set up the first code unit to match, if available. If there's no first code
3487 unit there may be a bitmap of possible first characters. */
3488 
3489 if ((re->flags & PCRE2_FIRSTSET) != 0)
3490   {
3491   has_first_cu = TRUE;
3492   first_cu = first_cu2 = (PCRE2_UCHAR)(re->first_codeunit);
3493   if ((re->flags & PCRE2_FIRSTCASELESS) != 0)
3494     {
3495     first_cu2 = TABLE_GET(first_cu, mb->tables + fcc_offset, first_cu);
3496 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
3497     if (utf && first_cu > 127)
3498       first_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(first_cu);
3499 #endif
3500     }
3501   }
3502 else
3503   if (!startline && (re->flags & PCRE2_FIRSTMAPSET) != 0)
3504     start_bits = re->start_bitmap;
3505 
3506 /* There may be a "last known required code unit" set. */
3507 
3508 if ((re->flags & PCRE2_LASTSET) != 0)
3509   {
3510   has_req_cu = TRUE;
3511   req_cu = req_cu2 = (PCRE2_UCHAR)(re->last_codeunit);
3512   if ((re->flags & PCRE2_LASTCASELESS) != 0)
3513     {
3514     req_cu2 = TABLE_GET(req_cu, mb->tables + fcc_offset, req_cu);
3515 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
3516     if (utf && req_cu > 127) req_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(req_cu);
3517 #endif
3518     }
3519   }
3520 
3521 /* Fill in fields that are always returned in the match data. */
3522 
3523 match_data->code = re;
3524 match_data->subject = subject;
3525 match_data->mark = NULL;
3526 match_data->matchedby = PCRE2_MATCHEDBY_DFA_INTERPRETER;
3527 
3528 /* Call the main matching function, looping for a non-anchored regex after a
3529 failed match. If not restarting, perform certain optimizations at the start of
3530 a match. */
3531 
3532 for (;;)
3533   {
3534   /* ----------------- Start of match optimizations ---------------- */
3535 
3536   /* There are some optimizations that avoid running the match if a known
3537   starting point is not found, or if a known later code unit is not present.
3538   However, there is an option (settable at compile time) that disables
3539   these, for testing and for ensuring that all callouts do actually occur.
3540   The optimizations must also be avoided when restarting a DFA match. */
3541 
3542   if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0 &&
3543       (options & PCRE2_DFA_RESTART) == 0)
3544     {
3545     /* If firstline is TRUE, the start of the match is constrained to the first
3546     line of a multiline string. That is, the match must be before or at the
3547     first newline following the start of matching. Temporarily adjust
3548     end_subject so that we stop the optimization scans for a first code unit
3549     immediately after the first character of a newline (the first code unit can
3550     legitimately be a newline). If the match fails at the newline, later code
3551     breaks this loop. */
3552 
3553     if (firstline)
3554       {
3555       PCRE2_SPTR t = start_match;
3556 #ifdef SUPPORT_UNICODE
3557       if (utf)
3558         {
3559         while (t < end_subject && !IS_NEWLINE(t))
3560           {
3561           t++;
3562           ACROSSCHAR(t < end_subject, t, t++);
3563           }
3564         }
3565       else
3566 #endif
3567       while (t < end_subject && !IS_NEWLINE(t)) t++;
3568       end_subject = t;
3569       }
3570 
3571     /* Anchored: check the first code unit if one is recorded. This may seem
3572     pointless but it can help in detecting a no match case without scanning for
3573     the required code unit. */
3574 
3575     if (anchored)
3576       {
3577       if (has_first_cu || start_bits != NULL)
3578         {
3579         BOOL ok = start_match < end_subject;
3580         if (ok)
3581           {
3582           PCRE2_UCHAR c = UCHAR21TEST(start_match);
3583           ok = has_first_cu && (c == first_cu || c == first_cu2);
3584           if (!ok && start_bits != NULL)
3585             {
3586 #if PCRE2_CODE_UNIT_WIDTH != 8
3587             if (c > 255) c = 255;
3588 #endif
3589             ok = (start_bits[c/8] & (1 << (c&7))) != 0;
3590             }
3591           }
3592         if (!ok) break;
3593         }
3594       }
3595 
3596     /* Not anchored. Advance to a unique first code unit if there is one. In
3597     8-bit mode, the use of memchr() gives a big speed up, even though we have
3598     to call it twice in caseless mode, in order to find the earliest occurrence
3599     of the character in either of its cases. */
3600 
3601     else
3602       {
3603       if (has_first_cu)
3604         {
3605         if (first_cu != first_cu2)  /* Caseless */
3606           {
3607 #if PCRE2_CODE_UNIT_WIDTH != 8
3608           PCRE2_UCHAR smc;
3609           while (start_match < end_subject &&
3610                 (smc = UCHAR21TEST(start_match)) != first_cu &&
3611                   smc != first_cu2)
3612             start_match++;
3613 #else  /* 8-bit code units */
3614           PCRE2_SPTR pp1 =
3615             memchr(start_match, first_cu, end_subject-start_match);
3616           PCRE2_SPTR pp2 =
3617             memchr(start_match, first_cu2, end_subject-start_match);
3618           if (pp1 == NULL)
3619             start_match = (pp2 == NULL)? end_subject : pp2;
3620           else
3621             start_match = (pp2 == NULL || pp1 < pp2)? pp1 : pp2;
3622 #endif
3623           }
3624 
3625         /* The caseful case */
3626 
3627         else
3628           {
3629 #if PCRE2_CODE_UNIT_WIDTH != 8
3630           while (start_match < end_subject && UCHAR21TEST(start_match) !=
3631                  first_cu)
3632             start_match++;
3633 #else
3634           start_match = memchr(start_match, first_cu, end_subject - start_match);
3635           if (start_match == NULL) start_match = end_subject;
3636 #endif
3637           }
3638 
3639         /* If we can't find the required code unit, having reached the true end
3640         of the subject, break the bumpalong loop, to force a match failure,
3641         except when doing partial matching, when we let the next cycle run at
3642         the end of the subject. To see why, consider the pattern /(?<=abc)def/,
3643         which partially matches "abc", even though the string does not contain
3644         the starting character "d". If we have not reached the true end of the
3645         subject (PCRE2_FIRSTLINE caused end_subject to be temporarily modified)
3646         we also let the cycle run, because the matching string is legitimately
3647         allowed to start with the first code unit of a newline. */
3648 
3649         if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0 &&
3650             start_match >= mb->end_subject)
3651           break;
3652         }
3653 
3654       /* If there's no first code unit, advance to just after a linebreak for a
3655       multiline match if required. */
3656 
3657       else if (startline)
3658         {
3659         if (start_match > mb->start_subject + start_offset)
3660           {
3661 #ifdef SUPPORT_UNICODE
3662           if (utf)
3663             {
3664             while (start_match < end_subject && !WAS_NEWLINE(start_match))
3665               {
3666               start_match++;
3667               ACROSSCHAR(start_match < end_subject, start_match, start_match++);
3668               }
3669             }
3670           else
3671 #endif
3672           while (start_match < end_subject && !WAS_NEWLINE(start_match))
3673             start_match++;
3674 
3675           /* If we have just passed a CR and the newline option is ANY or
3676           ANYCRLF, and we are now at a LF, advance the match position by one
3677           more code unit. */
3678 
3679           if (start_match[-1] == CHAR_CR &&
3680                (mb->nltype == NLTYPE_ANY || mb->nltype == NLTYPE_ANYCRLF) &&
3681                start_match < end_subject &&
3682                UCHAR21TEST(start_match) == CHAR_NL)
3683             start_match++;
3684           }
3685         }
3686 
3687       /* If there's no first code unit or a requirement for a multiline line
3688       start, advance to a non-unique first code unit if any have been
3689       identified. The bitmap contains only 256 bits. When code units are 16 or
3690       32 bits wide, all code units greater than 254 set the 255 bit. */
3691 
3692       else if (start_bits != NULL)
3693         {
3694         while (start_match < end_subject)
3695           {
3696           uint32_t c = UCHAR21TEST(start_match);
3697 #if PCRE2_CODE_UNIT_WIDTH != 8
3698           if (c > 255) c = 255;
3699 #endif
3700           if ((start_bits[c/8] & (1 << (c&7))) != 0) break;
3701           start_match++;
3702           }
3703 
3704         /* See comment above in first_cu checking about the next line. */
3705 
3706         if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0 &&
3707             start_match >= mb->end_subject)
3708           break;
3709         }
3710       }  /* End of first code unit handling */
3711 
3712     /* Restore fudged end_subject */
3713 
3714     end_subject = mb->end_subject;
3715 
3716     /* The following two optimizations are disabled for partial matching. */
3717 
3718     if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0)
3719       {
3720       /* The minimum matching length is a lower bound; no actual string of that
3721       length may actually match the pattern. Although the value is, strictly,
3722       in characters, we treat it as code units to avoid spending too much time
3723       in this optimization. */
3724 
3725       if (end_subject - start_match < re->minlength) goto NOMATCH_EXIT;
3726 
3727       /* If req_cu is set, we know that that code unit must appear in the
3728       subject for the match to succeed. If the first code unit is set, req_cu
3729       must be later in the subject; otherwise the test starts at the match
3730       point. This optimization can save a huge amount of backtracking in
3731       patterns with nested unlimited repeats that aren't going to match.
3732       Writing separate code for cased/caseless versions makes it go faster, as
3733       does using an autoincrement and backing off on a match.
3734 
3735       HOWEVER: when the subject string is very, very long, searching to its end
3736       can take a long time, and give bad performance on quite ordinary
3737       patterns. This showed up when somebody was matching something like
3738       /^\d+C/ on a 32-megabyte string... so we don't do this when the string is
3739       sufficiently long. */
3740 
3741       if (has_req_cu && end_subject - start_match < REQ_CU_MAX)
3742         {
3743         PCRE2_SPTR p = start_match + (has_first_cu? 1:0);
3744 
3745         /* We don't need to repeat the search if we haven't yet reached the
3746         place we found it at last time. */
3747 
3748         if (p > req_cu_ptr)
3749           {
3750           if (req_cu != req_cu2)
3751             {
3752             while (p < end_subject)
3753               {
3754               uint32_t pp = UCHAR21INCTEST(p);
3755               if (pp == req_cu || pp == req_cu2) { p--; break; }
3756               }
3757             }
3758           else
3759             {
3760             while (p < end_subject)
3761               {
3762               if (UCHAR21INCTEST(p) == req_cu) { p--; break; }
3763               }
3764             }
3765 
3766           /* If we can't find the required code unit, break the matching loop,
3767           forcing a match failure. */
3768 
3769           if (p >= end_subject) break;
3770 
3771           /* If we have found the required code unit, save the point where we
3772           found it, so that we don't search again next time round the loop if
3773           the start hasn't passed this code unit yet. */
3774 
3775           req_cu_ptr = p;
3776           }
3777         }
3778       }
3779     }
3780 
3781   /* ------------ End of start of match optimizations ------------ */
3782 
3783   /* Give no match if we have passed the bumpalong limit. */
3784 
3785   if (start_match > bumpalong_limit) break;
3786 
3787   /* OK, now we can do the business */
3788 
3789   mb->start_used_ptr = start_match;
3790   mb->last_used_ptr = start_match;
3791   mb->recursive = NULL;
3792 
3793   rc = internal_dfa_match(
3794     mb,                           /* fixed match data */
3795     mb->start_code,               /* this subexpression's code */
3796     start_match,                  /* where we currently are */
3797     start_offset,                 /* start offset in subject */
3798     match_data->ovector,          /* offset vector */
3799     (uint32_t)match_data->oveccount * 2,  /* actual size of same */
3800     workspace,                    /* workspace vector */
3801     (int)wscount,                 /* size of same */
3802     0,                            /* function recurse level */
3803     base_recursion_workspace);    /* initial workspace for recursion */
3804 
3805   /* Anything other than "no match" means we are done, always; otherwise, carry
3806   on only if not anchored. */
3807 
3808   if (rc != PCRE2_ERROR_NOMATCH || anchored)
3809     {
3810     if (rc == PCRE2_ERROR_PARTIAL && match_data->oveccount > 0)
3811       {
3812       match_data->ovector[0] = (PCRE2_SIZE)(start_match - subject);
3813       match_data->ovector[1] = (PCRE2_SIZE)(end_subject - subject);
3814       }
3815     match_data->leftchar = (PCRE2_SIZE)(mb->start_used_ptr - subject);
3816     match_data->rightchar = (PCRE2_SIZE)( mb->last_used_ptr - subject);
3817     match_data->startchar = (PCRE2_SIZE)(start_match - subject);
3818     match_data->rc = rc;
3819     goto EXIT;
3820     }
3821 
3822   /* Advance to the next subject character unless we are at the end of a line
3823   and firstline is set. */
3824 
3825   if (firstline && IS_NEWLINE(start_match)) break;
3826   start_match++;
3827 #ifdef SUPPORT_UNICODE
3828   if (utf)
3829     {
3830     ACROSSCHAR(start_match < end_subject, start_match, start_match++);
3831     }
3832 #endif
3833   if (start_match > end_subject) break;
3834 
3835   /* If we have just passed a CR and we are now at a LF, and the pattern does
3836   not contain any explicit matches for \r or \n, and the newline option is CRLF
3837   or ANY or ANYCRLF, advance the match position by one more character. */
3838 
3839   if (UCHAR21TEST(start_match - 1) == CHAR_CR &&
3840       start_match < end_subject &&
3841       UCHAR21TEST(start_match) == CHAR_NL &&
3842       (re->flags & PCRE2_HASCRORLF) == 0 &&
3843         (mb->nltype == NLTYPE_ANY ||
3844          mb->nltype == NLTYPE_ANYCRLF ||
3845          mb->nllen == 2))
3846     start_match++;
3847 
3848   }   /* "Bumpalong" loop */
3849 
3850 NOMATCH_EXIT:
3851 rc = PCRE2_ERROR_NOMATCH;
3852 
3853 EXIT:
3854 while (rws->next != NULL)
3855   {
3856   RWS_anchor *next = rws->next;
3857   rws->next = next->next;
3858   mb->memctl.free(next, mb->memctl.memory_data);
3859   }
3860 
3861 return rc;
3862 }
3863 
3864 /* End of pcre2_dfa_match.c */
3865