1 /*************************************************
2 *      Perl-Compatible Regular Expressions       *
3 *************************************************/
4 
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7 
8                        Written by Philip Hazel
9      Original API code Copyright (c) 1997-2012 University of Cambridge
10           New API code Copyright (c) 2016-2023 University of Cambridge
11 
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
15 
16     * Redistributions of source code must retain the above copyright notice,
17       this list of conditions and the following disclaimer.
18 
19     * Redistributions in binary form must reproduce the above copyright
20       notice, this list of conditions and the following disclaimer in the
21       documentation and/or other materials provided with the distribution.
22 
23     * Neither the name of the University of Cambridge nor the names of its
24       contributors may be used to endorse or promote products derived from
25       this software without specific prior written permission.
26 
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
39 */
40 
41 
42 /* This module contains the external function pcre2_dfa_match(), which is an
43 alternative matching function that uses a sort of DFA algorithm (not a true
44 FSM). This is NOT Perl-compatible, but it has advantages in certain
45 applications. */
46 
47 
48 /* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
49 the performance of his patterns greatly. I could not use it as it stood, as it
50 was not thread safe, and made assumptions about pattern sizes. Also, it caused
51 test 7 to loop, and test 9 to crash with a segfault.
52 
53 The issue is the check for duplicate states, which is done by a simple linear
54 search up the state list. (Grep for "duplicate" below to find the code.) For
55 many patterns, there will never be many states active at one time, so a simple
56 linear search is fine. In patterns that have many active states, it might be a
57 bottleneck. The suggested code used an indexing scheme to remember which states
58 had previously been used for each character, and avoided the linear search when
59 it knew there was no chance of a duplicate. This was implemented when adding
60 states to the state lists.
61 
62 I wrote some thread-safe, not-limited code to try something similar at the time
63 of checking for duplicates (instead of when adding states), using index vectors
64 on the stack. It did give a 13% improvement with one specially constructed
65 pattern for certain subject strings, but on other strings and on many of the
66 simpler patterns in the test suite it did worse. The major problem, I think,
67 was the extra time to initialize the index. This had to be done for each call
68 of internal_dfa_match(). (The supplied patch used a static vector, initialized
69 only once - I suspect this was the cause of the problems with the tests.)
70 
71 Overall, I concluded that the gains in some cases did not outweigh the losses
72 in others, so I abandoned this code. */
73 
74 
75 #ifdef HAVE_CONFIG_H
76 #include "config.h"
77 #endif
78 
79 #define NLBLOCK mb             /* Block containing newline information */
80 #define PSSTART start_subject  /* Field containing processed string start */
81 #define PSEND   end_subject    /* Field containing processed string end */
82 
83 #include "pcre2_internal.h"
84 
/* The set of option bits that this module treats as valid for a DFA match
call. */

#define PUBLIC_DFA_MATCH_OPTIONS \
  (PCRE2_ANCHORED|PCRE2_ENDANCHORED|PCRE2_NOTBOL|PCRE2_NOTEOL|PCRE2_NOTEMPTY| \
   PCRE2_NOTEMPTY_ATSTART|PCRE2_NO_UTF_CHECK|PCRE2_PARTIAL_HARD| \
   PCRE2_PARTIAL_SOFT|PCRE2_DFA_SHORTEST|PCRE2_DFA_RESTART| \
   PCRE2_COPY_MATCHED_SUBJECT)
90 
91 
92 /*************************************************
93 *      Code parameters and static tables         *
94 *************************************************/
95 
/* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
into others, under special conditions. A gap of 20 between the blocks should be
enough. The resulting opcodes don't have to be less than 256 because they are
never stored, so we push them well clear of the normal opcodes. */

#define OP_PROP_EXTRA       300
#define OP_EXTUNI_EXTRA     320
#define OP_ANYNL_EXTRA      340
#define OP_HSPACE_EXTRA     360
#define OP_VSPACE_EXTRA     380
106 
107 
108 /* This table identifies those opcodes that are followed immediately by a
109 character that is to be tested in some way. This makes it possible to
110 centralize the loading of these characters. In the case of Type * etc, the
111 "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
112 small value. Non-zero values in the table are the offsets from the opcode where
113 the character is to be found. ***NOTE*** If the start of this table is
114 modified, the three tables that follow must also be modified. */
115 
116 static const uint8_t coptable[] = {
117   0,                             /* End                                    */
118   0, 0, 0, 0, 0,                 /* \A, \G, \K, \B, \b                     */
119   0, 0, 0, 0, 0, 0,              /* \D, \d, \S, \s, \W, \w                 */
120   0, 0, 0,                       /* Any, AllAny, Anybyte                   */
121   0, 0,                          /* \P, \p                                 */
122   0, 0, 0, 0, 0,                 /* \R, \H, \h, \V, \v                     */
123   0,                             /* \X                                     */
124   0, 0, 0, 0, 0, 0,              /* \Z, \z, $, $M, ^, ^M                   */
125   1,                             /* Char                                   */
126   1,                             /* Chari                                  */
127   1,                             /* not                                    */
128   1,                             /* noti                                   */
129   /* Positive single-char repeats                                          */
130   1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
131   1+IMM2_SIZE, 1+IMM2_SIZE,      /* upto, minupto                          */
132   1+IMM2_SIZE,                   /* exact                                  */
133   1, 1, 1, 1+IMM2_SIZE,          /* *+, ++, ?+, upto+                      */
134   1, 1, 1, 1, 1, 1,              /* *I, *?I, +I, +?I, ?I, ??I              */
135   1+IMM2_SIZE, 1+IMM2_SIZE,      /* upto I, minupto I                      */
136   1+IMM2_SIZE,                   /* exact I                                */
137   1, 1, 1, 1+IMM2_SIZE,          /* *+I, ++I, ?+I, upto+I                  */
138   /* Negative single-char repeats - only for chars < 256                   */
139   1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
140   1+IMM2_SIZE, 1+IMM2_SIZE,      /* NOT upto, minupto                      */
141   1+IMM2_SIZE,                   /* NOT exact                              */
142   1, 1, 1, 1+IMM2_SIZE,          /* NOT *+, ++, ?+, upto+                  */
143   1, 1, 1, 1, 1, 1,              /* NOT *I, *?I, +I, +?I, ?I, ??I          */
144   1+IMM2_SIZE, 1+IMM2_SIZE,      /* NOT upto I, minupto I                  */
145   1+IMM2_SIZE,                   /* NOT exact I                            */
146   1, 1, 1, 1+IMM2_SIZE,          /* NOT *+I, ++I, ?+I, upto+I              */
147   /* Positive type repeats                                                 */
148   1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
149   1+IMM2_SIZE, 1+IMM2_SIZE,      /* Type upto, minupto                     */
150   1+IMM2_SIZE,                   /* Type exact                             */
151   1, 1, 1, 1+IMM2_SIZE,          /* Type *+, ++, ?+, upto+                 */
152   /* Character class & ref repeats                                         */
153   0, 0, 0, 0, 0, 0,              /* *, *?, +, +?, ?, ??                    */
154   0, 0,                          /* CRRANGE, CRMINRANGE                    */
155   0, 0, 0, 0,                    /* Possessive *+, ++, ?+, CRPOSRANGE      */
156   0,                             /* CLASS                                  */
157   0,                             /* NCLASS                                 */
158   0,                             /* XCLASS - variable length               */
159   0,                             /* REF                                    */
160   0,                             /* REFI                                   */
161   0,                             /* DNREF                                  */
162   0,                             /* DNREFI                                 */
163   0,                             /* RECURSE                                */
164   0,                             /* CALLOUT                                */
165   0,                             /* CALLOUT_STR                            */
166   0,                             /* Alt                                    */
167   0,                             /* Ket                                    */
168   0,                             /* KetRmax                                */
169   0,                             /* KetRmin                                */
170   0,                             /* KetRpos                                */
171   0, 0,                          /* Reverse, Vreverse                      */
172   0,                             /* Assert                                 */
173   0,                             /* Assert not                             */
174   0,                             /* Assert behind                          */
175   0,                             /* Assert behind not                      */
176   0,                             /* NA assert                              */
177   0,                             /* NA assert behind                       */
178   0,                             /* ONCE                                   */
179   0,                             /* SCRIPT_RUN                             */
180   0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND       */
181   0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND  */
182   0, 0,                          /* CREF, DNCREF                           */
183   0, 0,                          /* RREF, DNRREF                           */
184   0, 0,                          /* FALSE, TRUE                            */
185   0, 0, 0,                       /* BRAZERO, BRAMINZERO, BRAPOSZERO        */
186   0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG                 */
187   0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG         */
188   0, 0,                          /* COMMIT, COMMIT_ARG                     */
189   0, 0, 0,                       /* FAIL, ACCEPT, ASSERT_ACCEPT            */
190   0, 0, 0,                       /* CLOSE, SKIPZERO, DEFINE                */
191   0, 0                           /* \B and \b in UCP mode                  */
192 };
193 
/* This table identifies those opcodes that inspect a character. It is used to
remember the fact that a character could have been inspected when the end of
the subject is reached. A value of 1 marks an opcode that inspects a character;
0 marks one that does not. The table is indexed by opcode, one entry per opcode
in opcode order. ***NOTE*** If the start of this table is modified, the two
tables that follow must also be modified. */

static const uint8_t poptable[] = {
  0,                             /* End                                    */
  0, 0, 0, 1, 1,                 /* \A, \G, \K, \B, \b                     */
  1, 1, 1, 1, 1, 1,              /* \D, \d, \S, \s, \W, \w                 */
  1, 1, 1,                       /* Any, AllAny, Anybyte                   */
  1, 1,                          /* \P, \p                                 */
  1, 1, 1, 1, 1,                 /* \R, \H, \h, \V, \v                     */
  1,                             /* \X                                     */
  0, 0, 0, 0, 0, 0,              /* \Z, \z, $, $M, ^, ^M                   */
  1,                             /* Char                                   */
  1,                             /* Chari                                  */
  1,                             /* not                                    */
  1,                             /* noti                                   */
  /* Positive single-char repeats                                          */
  1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
  1, 1, 1,                       /* upto, minupto, exact                   */
  1, 1, 1, 1,                    /* *+, ++, ?+, upto+                      */
  1, 1, 1, 1, 1, 1,              /* *I, *?I, +I, +?I, ?I, ??I              */
  1, 1, 1,                       /* upto I, minupto I, exact I             */
  1, 1, 1, 1,                    /* *+I, ++I, ?+I, upto+I                  */
  /* Negative single-char repeats - only for chars < 256                   */
  1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
  1, 1, 1,                       /* NOT upto, minupto, exact               */
  1, 1, 1, 1,                    /* NOT *+, ++, ?+, upto+                  */
  1, 1, 1, 1, 1, 1,              /* NOT *I, *?I, +I, +?I, ?I, ??I          */
  1, 1, 1,                       /* NOT upto I, minupto I, exact I         */
  1, 1, 1, 1,                    /* NOT *+I, ++I, ?+I, upto+I              */
  /* Positive type repeats                                                 */
  1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
  1, 1, 1,                       /* Type upto, minupto, exact              */
  1, 1, 1, 1,                    /* Type *+, ++, ?+, upto+                 */
  /* Character class & ref repeats                                         */
  1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
  1, 1,                          /* CRRANGE, CRMINRANGE                    */
  1, 1, 1, 1,                    /* Possessive *+, ++, ?+, CRPOSRANGE      */
  1,                             /* CLASS                                  */
  1,                             /* NCLASS                                 */
  1,                             /* XCLASS - variable length               */
  0,                             /* REF                                    */
  0,                             /* REFI                                   */
  0,                             /* DNREF                                  */
  0,                             /* DNREFI                                 */
  0,                             /* RECURSE                                */
  0,                             /* CALLOUT                                */
  0,                             /* CALLOUT_STR                            */
  0,                             /* Alt                                    */
  0,                             /* Ket                                    */
  0,                             /* KetRmax                                */
  0,                             /* KetRmin                                */
  0,                             /* KetRpos                                */
  0, 0,                          /* Reverse, Vreverse                      */
  0,                             /* Assert                                 */
  0,                             /* Assert not                             */
  0,                             /* Assert behind                          */
  0,                             /* Assert behind not                      */
  0,                             /* NA assert                              */
  0,                             /* NA assert behind                       */
  0,                             /* ONCE                                   */
  0,                             /* SCRIPT_RUN                             */
  0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND       */
  0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND  */
  0, 0,                          /* CREF, DNCREF                           */
  0, 0,                          /* RREF, DNRREF                           */
  0, 0,                          /* FALSE, TRUE                            */
  0, 0, 0,                       /* BRAZERO, BRAMINZERO, BRAPOSZERO        */
  0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG                 */
  0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG         */
  0, 0,                          /* COMMIT, COMMIT_ARG                     */
  0, 0, 0,                       /* FAIL, ACCEPT, ASSERT_ACCEPT            */
  0, 0, 0,                       /* CLOSE, SKIPZERO, DEFINE                */
  1, 1                           /* \B and \b in UCP mode                  */
};
271 
272 /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
273 and \w */
274 
275 static const uint8_t toptable1[] = {
276   0, 0, 0, 0, 0, 0,
277   ctype_digit, ctype_digit,
278   ctype_space, ctype_space,
279   ctype_word,  ctype_word,
280   0, 0                            /* OP_ANY, OP_ALLANY */
281 };
282 
283 static const uint8_t toptable2[] = {
284   0, 0, 0, 0, 0, 0,
285   ctype_digit, 0,
286   ctype_space, 0,
287   ctype_word,  0,
288   1, 1                            /* OP_ANY, OP_ALLANY */
289 };
290 
291 
/* Structure for holding data about a particular state, which is in effect the
current data for an active path through the match tree. It must consist
entirely of ints because the working vector we are passed, and which we put
these structures in, is a vector of ints. */

typedef struct stateblock {
  int offset;                     /* Offset to opcode (-ve has meaning) */
  int count;                      /* Count for repeats */
  int data;                       /* Some use extra data */
} stateblock;

/* Number of ints occupied by one stateblock in the workspace vector. */

#define INTS_PER_STATEBLOCK  (int)(sizeof(stateblock)/sizeof(int))
304 
305 
/* Before version 10.32 the recursive calls of internal_dfa_match() were passed
local working space and output vectors that were created on the stack. This has
caused issues for some patterns, especially in small-stack environments such as
Windows. A new scheme is now in use which sets up a vector on the stack, but if
this is too small, heap memory is used, up to the heap_limit. The main
parameters are all numbers of ints because the workspace is a vector of ints.

The size of the starting stack vector, DFA_START_RWS_SIZE, is in bytes, and is
defined in pcre2_internal.h so as to be available to pcre2test when it is
finding the minimum heap requirement for a match. */

/* Number of ints needed to hold one PCRE2_SIZE value. */

#define OVEC_UNIT  (sizeof(PCRE2_SIZE)/sizeof(int))

#define RWS_BASE_SIZE   (DFA_START_RWS_SIZE/sizeof(int))  /* Stack vector */
#define RWS_RSIZE       1000                    /* Work size for recursion */
#define RWS_OVEC_RSIZE  (1000*OVEC_UNIT)        /* Ovector for recursion */
#define RWS_OVEC_OSIZE  (2*OVEC_UNIT)           /* Ovector in other cases */
323 
/* This structure is at the start of each workspace block. The blocks form a
singly-linked list through the next field; size and free are both counted in
ints, not bytes. */

typedef struct RWS_anchor {
  struct RWS_anchor *next;
  uint32_t size;  /* Number of ints */
  uint32_t free;  /* Number of ints */
} RWS_anchor;

/* Number of ints occupied by the anchor at the start of a block. */

#define RWS_ANCHOR_SIZE (sizeof(RWS_anchor)/sizeof(int))
333 
334 
335 
336 /*************************************************
337 *               Process a callout                *
338 *************************************************/
339 
340 /* This function is called to perform a callout.
341 
342 Arguments:
343   code              current code pointer
344   offsets           points to current capture offsets
345   current_subject   start of current subject match
346   ptr               current position in subject
347   mb                the match block
348   extracode         extra code offset when called from condition
349   lengthptr         where to return the callout length
350 
351 Returns:            the return from the callout
352 */
353 
354 static int
do_callout_dfa(PCRE2_SPTR code,PCRE2_SIZE * offsets,PCRE2_SPTR current_subject,PCRE2_SPTR ptr,dfa_match_block * mb,PCRE2_SIZE extracode,PCRE2_SIZE * lengthptr)355 do_callout_dfa(PCRE2_SPTR code, PCRE2_SIZE *offsets, PCRE2_SPTR current_subject,
356   PCRE2_SPTR ptr, dfa_match_block *mb, PCRE2_SIZE extracode,
357   PCRE2_SIZE *lengthptr)
358 {
359 pcre2_callout_block *cb = mb->cb;
360 
361 *lengthptr = (code[extracode] == OP_CALLOUT)?
362   (PCRE2_SIZE)PRIV(OP_lengths)[OP_CALLOUT] :
363   (PCRE2_SIZE)GET(code, 1 + 2*LINK_SIZE + extracode);
364 
365 if (mb->callout == NULL) return 0;    /* No callout provided */
366 
367 /* Fixed fields in the callout block are set once and for all at the start of
368 matching. */
369 
370 cb->offset_vector    = offsets;
371 cb->start_match      = (PCRE2_SIZE)(current_subject - mb->start_subject);
372 cb->current_position = (PCRE2_SIZE)(ptr - mb->start_subject);
373 cb->pattern_position = GET(code, 1 + extracode);
374 cb->next_item_length = GET(code, 1 + LINK_SIZE + extracode);
375 
376 if (code[extracode] == OP_CALLOUT)
377   {
378   cb->callout_number = code[1 + 2*LINK_SIZE + extracode];
379   cb->callout_string_offset = 0;
380   cb->callout_string = NULL;
381   cb->callout_string_length = 0;
382   }
383 else
384   {
385   cb->callout_number = 0;
386   cb->callout_string_offset = GET(code, 1 + 3*LINK_SIZE + extracode);
387   cb->callout_string = code + (1 + 4*LINK_SIZE + extracode) + 1;
388   cb->callout_string_length = *lengthptr - (1 + 4*LINK_SIZE) - 2;
389   }
390 
391 return (mb->callout)(cb, mb->callout_data);
392 }
393 
394 
395 
396 /*************************************************
397 *         Expand local workspace memory          *
398 *************************************************/
399 
400 /* This function is called when internal_dfa_match() is about to be called
401 recursively and there is insufficient working space left in the current
402 workspace block. If there's an existing next block, use it; otherwise get a new
403 block unless the heap limit is reached.
404 
405 Arguments:
406   rwsptr     pointer to block pointer (updated)
407   ovecsize   space needed for an ovector
408   mb         the match block
409 
410 Returns:     0 rwsptr has been updated
411             !0 an error code
412 */
413 
414 static int
more_workspace(RWS_anchor ** rwsptr,unsigned int ovecsize,dfa_match_block * mb)415 more_workspace(RWS_anchor **rwsptr, unsigned int ovecsize, dfa_match_block *mb)
416 {
417 RWS_anchor *rws = *rwsptr;
418 RWS_anchor *new;
419 
420 if (rws->next != NULL)
421   {
422   new = rws->next;
423   }
424 
425 /* Sizes in the RWS_anchor blocks are in units of sizeof(int), but
426 mb->heap_limit and mb->heap_used are in kibibytes. Play carefully, to avoid
427 overflow. */
428 
429 else
430   {
431   uint32_t newsize = (rws->size >= UINT32_MAX/(sizeof(int)*2))? UINT32_MAX/sizeof(int) : rws->size * 2;
432   uint32_t newsizeK = newsize/(1024/sizeof(int));
433 
434   if (newsizeK + mb->heap_used > mb->heap_limit)
435     newsizeK = (uint32_t)(mb->heap_limit - mb->heap_used);
436   newsize = newsizeK*(1024/sizeof(int));
437 
438   if (newsize < RWS_RSIZE + ovecsize + RWS_ANCHOR_SIZE)
439     return PCRE2_ERROR_HEAPLIMIT;
440   new = mb->memctl.malloc(newsize*sizeof(int), mb->memctl.memory_data);
441   if (new == NULL) return PCRE2_ERROR_NOMEMORY;
442   mb->heap_used += newsizeK;
443   new->next = NULL;
444   new->size = newsize;
445   rws->next = new;
446   }
447 
448 new->free = new->size - RWS_ANCHOR_SIZE;
449 *rwsptr = new;
450 return 0;
451 }
452 
453 
454 
455 /*************************************************
456 *     Match a Regular Expression - DFA engine    *
457 *************************************************/
458 
459 /* This internal function applies a compiled pattern to a subject string,
460 starting at a given point, using a DFA engine. This function is called from the
461 external one, possibly multiple times if the pattern is not anchored. The
462 function calls itself recursively for some kinds of subpattern.
463 
464 Arguments:
465   mb                the match_data block with fixed information
466   this_start_code   the opening bracket of this subexpression's code
467   current_subject   where we currently are in the subject string
468   start_offset      start offset in the subject string
469   offsets           vector to contain the matching string offsets
470   offsetcount       size of same
471   workspace         vector of workspace
472   wscount           size of same
473   rlevel            function call recursion level
474 
475 Returns:            > 0 => number of match offset pairs placed in offsets
476                     = 0 => offsets overflowed; longest matches are present
477                      -1 => failed to match
478                    < -1 => some kind of unexpected problem
479 
480 The following macros are used for adding states to the two state vectors (one
481 for the current character, one for the following character). */
482 
/* Each of these macros appends one state to a state list, using the local
variables of the function in which it is expanded: active_count and
next_active_state for the ADD_ACTIVE forms, new_count and next_new_state for
the ADD_NEW forms, and wscount as the capacity in both cases. The _DATA forms
also set the data field. If the list is already full, the enclosing function
returns PCRE2_ERROR_DFA_WSSIZE. Note that the count is incremented even when
the list is full. Each expansion is an if/else statement that ends with an
unterminated "return", so a use must be followed by a semicolon and should be
enclosed in braces when it is the body of another "if". */

#define ADD_ACTIVE(x,y) \
  if (active_count++ < wscount) \
    { \
    next_active_state->offset = (x); \
    next_active_state->count  = (y); \
    next_active_state++; \
    } \
  else return PCRE2_ERROR_DFA_WSSIZE

#define ADD_ACTIVE_DATA(x,y,z) \
  if (active_count++ < wscount) \
    { \
    next_active_state->offset = (x); \
    next_active_state->count  = (y); \
    next_active_state->data   = (z); \
    next_active_state++; \
    } \
  else return PCRE2_ERROR_DFA_WSSIZE

#define ADD_NEW(x,y) \
  if (new_count++ < wscount) \
    { \
    next_new_state->offset = (x); \
    next_new_state->count  = (y); \
    next_new_state++; \
    } \
  else return PCRE2_ERROR_DFA_WSSIZE

#define ADD_NEW_DATA(x,y,z) \
  if (new_count++ < wscount) \
    { \
    next_new_state->offset = (x); \
    next_new_state->count  = (y); \
    next_new_state->data   = (z); \
    next_new_state++; \
    } \
  else return PCRE2_ERROR_DFA_WSSIZE
520 
521 /* And now, here is the code */
522 
523 static int
internal_dfa_match(dfa_match_block * mb,PCRE2_SPTR this_start_code,PCRE2_SPTR current_subject,PCRE2_SIZE start_offset,PCRE2_SIZE * offsets,uint32_t offsetcount,int * workspace,int wscount,uint32_t rlevel,int * RWS)524 internal_dfa_match(
525   dfa_match_block *mb,
526   PCRE2_SPTR this_start_code,
527   PCRE2_SPTR current_subject,
528   PCRE2_SIZE start_offset,
529   PCRE2_SIZE *offsets,
530   uint32_t offsetcount,
531   int *workspace,
532   int wscount,
533   uint32_t rlevel,
534   int *RWS)
535 {
536 stateblock *active_states, *new_states, *temp_states;
537 stateblock *next_active_state, *next_new_state;
538 const uint8_t *ctypes, *lcc, *fcc;
539 PCRE2_SPTR ptr;
540 PCRE2_SPTR end_code;
541 dfa_recursion_info new_recursive;
542 int active_count, new_count, match_count;
543 
544 /* Some fields in the mb block are frequently referenced, so we load them into
545 independent variables in the hope that this will perform better. */
546 
547 PCRE2_SPTR start_subject = mb->start_subject;
548 PCRE2_SPTR end_subject = mb->end_subject;
549 PCRE2_SPTR start_code = mb->start_code;
550 
551 #ifdef SUPPORT_UNICODE
552 BOOL utf = (mb->poptions & PCRE2_UTF) != 0;
553 BOOL utf_or_ucp = utf || (mb->poptions & PCRE2_UCP) != 0;
554 #else
555 BOOL utf = FALSE;
556 #endif
557 
558 BOOL reset_could_continue = FALSE;
559 
560 if (mb->match_call_count++ >= mb->match_limit) return PCRE2_ERROR_MATCHLIMIT;
561 if (rlevel++ > mb->match_limit_depth) return PCRE2_ERROR_DEPTHLIMIT;
562 offsetcount &= (uint32_t)(-2);  /* Round down */
563 
564 wscount -= 2;
565 wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
566           (2 * INTS_PER_STATEBLOCK);
567 
568 ctypes = mb->tables + ctypes_offset;
569 lcc = mb->tables + lcc_offset;
570 fcc = mb->tables + fcc_offset;
571 
572 match_count = PCRE2_ERROR_NOMATCH;   /* A negative number */
573 
574 active_states = (stateblock *)(workspace + 2);
575 next_new_state = new_states = active_states + wscount;
576 new_count = 0;
577 
578 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
579 the alternative states onto the list, and find out where the end is. This
makes it possible to use this function recursively, when we want to stop at a
581 matching internal ket rather than at the end.
582 
583 If we are dealing with a backward assertion we have to find out the maximum
584 amount to move back, and set up each alternative appropriately. */
585 
586 if (*this_start_code == OP_ASSERTBACK || *this_start_code == OP_ASSERTBACK_NOT)
587   {
588   size_t max_back = 0;
589   size_t gone_back;
590 
591   end_code = this_start_code;
592   do
593     {
594     size_t back = (size_t)GET2(end_code, 2+LINK_SIZE);
595     if (back > max_back) max_back = back;
596     end_code += GET(end_code, 1);
597     }
598   while (*end_code == OP_ALT);
599 
600   /* If we can't go back the amount required for the longest lookbehind
601   pattern, go back as far as we can; some alternatives may still be viable. */
602 
603 #ifdef SUPPORT_UNICODE
604   /* In character mode we have to step back character by character */
605 
606   if (utf)
607     {
608     for (gone_back = 0; gone_back < max_back; gone_back++)
609       {
610       if (current_subject <= start_subject) break;
611       current_subject--;
612       ACROSSCHAR(current_subject > start_subject, current_subject,
613         current_subject--);
614       }
615     }
616   else
617 #endif
618 
619   /* In byte-mode we can do this quickly. */
620 
621     {
622     size_t current_offset = (size_t)(current_subject - start_subject);
623     gone_back = (current_offset < max_back)? current_offset : max_back;
624     current_subject -= gone_back;
625     }
626 
627   /* Save the earliest consulted character */
628 
629   if (current_subject < mb->start_used_ptr)
630     mb->start_used_ptr = current_subject;
631 
632   /* Now we can process the individual branches. There will be an OP_REVERSE at
633   the start of each branch, except when the length of the branch is zero. */
634 
635   end_code = this_start_code;
636   do
637     {
638     uint32_t revlen = (end_code[1+LINK_SIZE] == OP_REVERSE)? 1 + IMM2_SIZE : 0;
639     size_t back = (revlen == 0)? 0 : (size_t)GET2(end_code, 2+LINK_SIZE);
640     if (back <= gone_back)
641       {
642       int bstate = (int)(end_code - start_code + 1 + LINK_SIZE + revlen);
643       ADD_NEW_DATA(-bstate, 0, (int)(gone_back - back));
644       }
645     end_code += GET(end_code, 1);
646     }
647   while (*end_code == OP_ALT);
648  }
649 
650 /* This is the code for a "normal" subpattern (not a backward assertion). The
651 start of a whole pattern is always one of these. If we are at the top level,
652 we may be asked to restart matching from the same point that we reached for a
653 previous partial match. We still have to scan through the top-level branches to
654 find the end state. */
655 
656 else
657   {
658   end_code = this_start_code;
659 
660   /* Restarting */
661 
662   if (rlevel == 1 && (mb->moptions & PCRE2_DFA_RESTART) != 0)
663     {
664     do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
665     new_count = workspace[1];
666     if (!workspace[0])
667       memcpy(new_states, active_states, (size_t)new_count * sizeof(stateblock));
668     }
669 
670   /* Not restarting */
671 
672   else
673     {
674     int length = 1 + LINK_SIZE +
675       ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
676         *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
677         ? IMM2_SIZE:0);
678     do
679       {
680       ADD_NEW((int)(end_code - start_code + length), 0);
681       end_code += GET(end_code, 1);
682       length = 1 + LINK_SIZE;
683       }
684     while (*end_code == OP_ALT);
685     }
686   }
687 
688 workspace[0] = 0;    /* Bit indicating which vector is current */
689 
690 /* Loop for scanning the subject */
691 
692 ptr = current_subject;
693 for (;;)
694   {
695   int i, j;
696   int clen, dlen;
697   uint32_t c, d;
698   int forced_fail = 0;
699   BOOL partial_newline = FALSE;
700   BOOL could_continue = reset_could_continue;
701   reset_could_continue = FALSE;
702 
703   if (ptr > mb->last_used_ptr) mb->last_used_ptr = ptr;
704 
705   /* Make the new state list into the active state list and empty the
706   new state list. */
707 
708   temp_states = active_states;
709   active_states = new_states;
710   new_states = temp_states;
711   active_count = new_count;
712   new_count = 0;
713 
714   workspace[0] ^= 1;              /* Remember for the restarting feature */
715   workspace[1] = active_count;
716 
717   /* Set the pointers for adding new states */
718 
719   next_active_state = active_states + active_count;
720   next_new_state = new_states;
721 
722   /* Load the current character from the subject outside the loop, as many
723   different states may want to look at it, and we assume that at least one
724   will. */
725 
726   if (ptr < end_subject)
727     {
728     clen = 1;        /* Number of data items in the character */
729 #ifdef SUPPORT_UNICODE
730     GETCHARLENTEST(c, ptr, clen);
731 #else
732     c = *ptr;
733 #endif  /* SUPPORT_UNICODE */
734     }
735   else
736     {
737     clen = 0;        /* This indicates the end of the subject */
738     c = NOTACHAR;    /* This value should never actually be used */
739     }
740 
741   /* Scan up the active states and act on each one. The result of an action
742   may be to add more states to the currently active list (e.g. on hitting a
743   parenthesis) or it may be to put states on the new list, for considering
744   when we move the character pointer on. */
745 
746   for (i = 0; i < active_count; i++)
747     {
748     stateblock *current_state = active_states + i;
749     BOOL caseless = FALSE;
750     PCRE2_SPTR code;
751     uint32_t codevalue;
752     int state_offset = current_state->offset;
753     int rrc;
754     int count;
755 
756     /* A negative offset is a special case meaning "hold off going to this
757     (negated) state until the number of characters in the data field have
758     been skipped". If the could_continue flag was passed over from a previous
759     state, arrange for it to be passed on. */
760 
761     if (state_offset < 0)
762       {
763       if (current_state->data > 0)
764         {
765         ADD_NEW_DATA(state_offset, current_state->count,
766           current_state->data - 1);
767         if (could_continue) reset_could_continue = TRUE;
768         continue;
769         }
770       else
771         {
772         current_state->offset = state_offset = -state_offset;
773         }
774       }
775 
776     /* Check for a duplicate state with the same count, and skip if found.
777     See the note at the head of this module about the possibility of improving
778     performance here. */
779 
780     for (j = 0; j < i; j++)
781       {
782       if (active_states[j].offset == state_offset &&
783           active_states[j].count == current_state->count)
784         goto NEXT_ACTIVE_STATE;
785       }
786 
787     /* The state offset is the offset to the opcode */
788 
789     code = start_code + state_offset;
790     codevalue = *code;
791 
792     /* If this opcode inspects a character, but we are at the end of the
793     subject, remember the fact for use when testing for a partial match. */
794 
795     if (clen == 0 && poptable[codevalue] != 0)
796       could_continue = TRUE;
797 
798     /* If this opcode is followed by an inline character, load it. It is
799     tempting to test for the presence of a subject character here, but that
800     is wrong, because sometimes zero repetitions of the subject are
801     permitted.
802 
803     We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
804     argument that is not a data character - but is always one byte long because
805     the values are small. We have to take special action to deal with \P, \p,
806     \H, \h, \V, \v, \R and \X in this case. To keep the other cases fast,
807     convert these ones to new opcodes. */
808 
809     if (coptable[codevalue] > 0)
810       {
811       dlen = 1;
812 #ifdef SUPPORT_UNICODE
813       if (utf) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
814 #endif  /* SUPPORT_UNICODE */
815       d = code[coptable[codevalue]];
816       if (codevalue >= OP_TYPESTAR)
817         {
818         switch(d)
819           {
820           case OP_ANYBYTE: return PCRE2_ERROR_DFA_UITEM;
821           case OP_NOTPROP:
822           case OP_PROP: codevalue += OP_PROP_EXTRA; break;
823           case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
824           case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
825           case OP_NOT_HSPACE:
826           case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
827           case OP_NOT_VSPACE:
828           case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
829           default: break;
830           }
831         }
832       }
833     else
834       {
835       dlen = 0;         /* Not strictly necessary, but compilers moan */
836       d = NOTACHAR;     /* if these variables are not set. */
837       }
838 
839 
840     /* Now process the individual opcodes */
841 
842     switch (codevalue)
843       {
844 /* ========================================================================== */
845       /* These cases are never obeyed. This is a fudge that causes a compile-
846       time error if the vectors coptable or poptable, which are indexed by
847       opcode, are not the correct length. It seems to be the only way to do
848       such a check at compile time, as the sizeof() operator does not work
849       in the C preprocessor. */
850 
851       case OP_TABLE_LENGTH:
852       case OP_TABLE_LENGTH +
853         ((sizeof(coptable) == OP_TABLE_LENGTH) &&
854          (sizeof(poptable) == OP_TABLE_LENGTH)):
855       return 0;
856 
857 /* ========================================================================== */
858       /* Reached a closing bracket. If not at the end of the pattern, carry
859       on with the next opcode. For repeating opcodes, also add the repeat
860       state. Note that KETRPOS will always be encountered at the end of the
861       subpattern, because the possessive subpattern repeats are always handled
862       using recursive calls. Thus, it never adds any new states.
863 
864       At the end of the (sub)pattern, unless we have an empty string and
865       PCRE2_NOTEMPTY is set, or PCRE2_NOTEMPTY_ATSTART is set and we are at the
866       start of the subject, save the match data, shifting up all previous
867       matches so we always have the longest first. */
868 
869       case OP_KET:
870       case OP_KETRMIN:
871       case OP_KETRMAX:
872       case OP_KETRPOS:
873       if (code != end_code)
874         {
875         ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
876         if (codevalue != OP_KET)
877           {
878           ADD_ACTIVE(state_offset - (int)GET(code, 1), 0);
879           }
880         }
881       else
882         {
883         if (ptr > current_subject ||
884             ((mb->moptions & PCRE2_NOTEMPTY) == 0 &&
885               ((mb->moptions & PCRE2_NOTEMPTY_ATSTART) == 0 ||
886                 current_subject > start_subject + mb->start_offset)))
887           {
888           if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
889             else if (match_count > 0 && ++match_count * 2 > (int)offsetcount)
890               match_count = 0;
891           count = ((match_count == 0)? (int)offsetcount : match_count * 2) - 2;
892           if (count > 0) (void)memmove(offsets + 2, offsets,
893             (size_t)count * sizeof(PCRE2_SIZE));
894           if (offsetcount >= 2)
895             {
896             offsets[0] = (PCRE2_SIZE)(current_subject - start_subject);
897             offsets[1] = (PCRE2_SIZE)(ptr - start_subject);
898             }
899           if ((mb->moptions & PCRE2_DFA_SHORTEST) != 0) return match_count;
900           }
901         }
902       break;
903 
904 /* ========================================================================== */
905       /* These opcodes add to the current list of states without looking
906       at the current character. */
907 
908       /*-----------------------------------------------------------------*/
909       case OP_ALT:
910       do { code += GET(code, 1); } while (*code == OP_ALT);
911       ADD_ACTIVE((int)(code - start_code), 0);
912       break;
913 
914       /*-----------------------------------------------------------------*/
915       case OP_BRA:
916       case OP_SBRA:
917       do
918         {
919         ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
920         code += GET(code, 1);
921         }
922       while (*code == OP_ALT);
923       break;
924 
925       /*-----------------------------------------------------------------*/
926       case OP_CBRA:
927       case OP_SCBRA:
928       ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE + IMM2_SIZE),  0);
929       code += GET(code, 1);
930       while (*code == OP_ALT)
931         {
932         ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE),  0);
933         code += GET(code, 1);
934         }
935       break;
936 
937       /*-----------------------------------------------------------------*/
938       case OP_BRAZERO:
939       case OP_BRAMINZERO:
940       ADD_ACTIVE(state_offset + 1, 0);
941       code += 1 + GET(code, 2);
942       while (*code == OP_ALT) code += GET(code, 1);
943       ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
944       break;
945 
946       /*-----------------------------------------------------------------*/
947       case OP_SKIPZERO:
948       code += 1 + GET(code, 2);
949       while (*code == OP_ALT) code += GET(code, 1);
950       ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
951       break;
952 
953       /*-----------------------------------------------------------------*/
954       case OP_CIRC:
955       if (ptr == start_subject && (mb->moptions & PCRE2_NOTBOL) == 0)
956         { ADD_ACTIVE(state_offset + 1, 0); }
957       break;
958 
959       /*-----------------------------------------------------------------*/
960       case OP_CIRCM:
961       if ((ptr == start_subject && (mb->moptions & PCRE2_NOTBOL) == 0) ||
962           ((ptr != end_subject || (mb->poptions & PCRE2_ALT_CIRCUMFLEX) != 0 )
963             && WAS_NEWLINE(ptr)))
964         { ADD_ACTIVE(state_offset + 1, 0); }
965       break;
966 
967       /*-----------------------------------------------------------------*/
968       case OP_EOD:
969       if (ptr >= end_subject)
970         {
971         if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
972           return PCRE2_ERROR_PARTIAL;
973         else { ADD_ACTIVE(state_offset + 1, 0); }
974         }
975       break;
976 
977       /*-----------------------------------------------------------------*/
978       case OP_SOD:
979       if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
980       break;
981 
982       /*-----------------------------------------------------------------*/
983       case OP_SOM:
984       if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
985       break;
986 
987 
988 /* ========================================================================== */
989       /* These opcodes inspect the next subject character, and sometimes
990       the previous one as well, but do not have an argument. The variable
991       clen contains the length of the current character and is zero if we are
992       at the end of the subject. */
993 
994       /*-----------------------------------------------------------------*/
995       case OP_ANY:
996       if (clen > 0 && !IS_NEWLINE(ptr))
997         {
998         if (ptr + 1 >= mb->end_subject &&
999             (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1000             NLBLOCK->nltype == NLTYPE_FIXED &&
1001             NLBLOCK->nllen == 2 &&
1002             c == NLBLOCK->nl[0])
1003           {
1004           could_continue = partial_newline = TRUE;
1005           }
1006         else
1007           {
1008           ADD_NEW(state_offset + 1, 0);
1009           }
1010         }
1011       break;
1012 
1013       /*-----------------------------------------------------------------*/
1014       case OP_ALLANY:
1015       if (clen > 0)
1016         { ADD_NEW(state_offset + 1, 0); }
1017       break;
1018 
1019       /*-----------------------------------------------------------------*/
1020       case OP_EODN:
1021       if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - mb->nllen))
1022         {
1023         if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1024           return PCRE2_ERROR_PARTIAL;
1025         ADD_ACTIVE(state_offset + 1, 0);
1026         }
1027       break;
1028 
1029       /*-----------------------------------------------------------------*/
1030       case OP_DOLL:
1031       if ((mb->moptions & PCRE2_NOTEOL) == 0)
1032         {
1033         if (clen == 0 && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1034           could_continue = TRUE;
1035         else if (clen == 0 ||
1036             ((mb->poptions & PCRE2_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
1037                (ptr == end_subject - mb->nllen)
1038             ))
1039           { ADD_ACTIVE(state_offset + 1, 0); }
1040         else if (ptr + 1 >= mb->end_subject &&
1041                  (mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0 &&
1042                  NLBLOCK->nltype == NLTYPE_FIXED &&
1043                  NLBLOCK->nllen == 2 &&
1044                  c == NLBLOCK->nl[0])
1045           {
1046           if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1047             {
1048             reset_could_continue = TRUE;
1049             ADD_NEW_DATA(-(state_offset + 1), 0, 1);
1050             }
1051           else could_continue = partial_newline = TRUE;
1052           }
1053         }
1054       break;
1055 
1056       /*-----------------------------------------------------------------*/
1057       case OP_DOLLM:
1058       if ((mb->moptions & PCRE2_NOTEOL) == 0)
1059         {
1060         if (clen == 0 && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1061           could_continue = TRUE;
1062         else if (clen == 0 ||
1063             ((mb->poptions & PCRE2_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr)))
1064           { ADD_ACTIVE(state_offset + 1, 0); }
1065         else if (ptr + 1 >= mb->end_subject &&
1066                  (mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0 &&
1067                  NLBLOCK->nltype == NLTYPE_FIXED &&
1068                  NLBLOCK->nllen == 2 &&
1069                  c == NLBLOCK->nl[0])
1070           {
1071           if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1072             {
1073             reset_could_continue = TRUE;
1074             ADD_NEW_DATA(-(state_offset + 1), 0, 1);
1075             }
1076           else could_continue = partial_newline = TRUE;
1077           }
1078         }
1079       else if (IS_NEWLINE(ptr))
1080         { ADD_ACTIVE(state_offset + 1, 0); }
1081       break;
1082 
1083       /*-----------------------------------------------------------------*/
1084 
1085       case OP_DIGIT:
1086       case OP_WHITESPACE:
1087       case OP_WORDCHAR:
1088       if (clen > 0 && c < 256 &&
1089             ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
1090         { ADD_NEW(state_offset + 1, 0); }
1091       break;
1092 
1093       /*-----------------------------------------------------------------*/
1094       case OP_NOT_DIGIT:
1095       case OP_NOT_WHITESPACE:
1096       case OP_NOT_WORDCHAR:
1097       if (clen > 0 && (c >= 256 ||
1098             ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
1099         { ADD_NEW(state_offset + 1, 0); }
1100       break;
1101 
1102       /*-----------------------------------------------------------------*/
1103       case OP_WORD_BOUNDARY:
1104       case OP_NOT_WORD_BOUNDARY:
1105       case OP_NOT_UCP_WORD_BOUNDARY:
1106       case OP_UCP_WORD_BOUNDARY:
1107         {
1108         int left_word, right_word;
1109 
1110         if (ptr > start_subject)
1111           {
1112           PCRE2_SPTR temp = ptr - 1;
1113           if (temp < mb->start_used_ptr) mb->start_used_ptr = temp;
1114 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1115           if (utf) { BACKCHAR(temp); }
1116 #endif
1117           GETCHARTEST(d, temp);
1118 #ifdef SUPPORT_UNICODE
1119           if (codevalue == OP_UCP_WORD_BOUNDARY ||
1120               codevalue == OP_NOT_UCP_WORD_BOUNDARY)
1121             {
1122             int chartype = UCD_CHARTYPE(d);
1123             int category = PRIV(ucp_gentype)[chartype];
1124             left_word = (category == ucp_L || category == ucp_N ||
1125               chartype == ucp_Mn || chartype == ucp_Pc);
1126             }
1127           else
1128 #endif
1129           left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
1130           }
1131         else left_word = FALSE;
1132 
1133         if (clen > 0)
1134           {
1135           if (ptr >= mb->last_used_ptr)
1136             {
1137             PCRE2_SPTR temp = ptr + 1;
1138 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1139             if (utf) { FORWARDCHARTEST(temp, mb->end_subject); }
1140 #endif
1141             mb->last_used_ptr = temp;
1142             }
1143 #ifdef SUPPORT_UNICODE
1144           if (codevalue == OP_UCP_WORD_BOUNDARY ||
1145               codevalue == OP_NOT_UCP_WORD_BOUNDARY)
1146             {
1147             int chartype = UCD_CHARTYPE(c);
1148             int category = PRIV(ucp_gentype)[chartype];
1149             right_word = (category == ucp_L || category == ucp_N ||
1150               chartype == ucp_Mn || chartype == ucp_Pc);
1151             }
1152           else
1153 #endif
1154           right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
1155           }
1156         else right_word = FALSE;
1157 
1158         if ((left_word == right_word) ==
1159             (codevalue == OP_NOT_WORD_BOUNDARY ||
1160              codevalue == OP_NOT_UCP_WORD_BOUNDARY))
1161           { ADD_ACTIVE(state_offset + 1, 0); }
1162         }
1163       break;
1164 
1165 
1166       /*-----------------------------------------------------------------*/
1167       /* Check the next character by Unicode property. We will get here only
1168       if the support is in the binary; otherwise a compile-time error occurs.
1169       */
1170 
1171 #ifdef SUPPORT_UNICODE
1172       case OP_PROP:
1173       case OP_NOTPROP:
1174       if (clen > 0)
1175         {
1176         BOOL OK;
1177         int chartype;
1178         const uint32_t *cp;
1179         const ucd_record * prop = GET_UCD(c);
1180         switch(code[1])
1181           {
1182           case PT_ANY:
1183           OK = TRUE;
1184           break;
1185 
1186           case PT_LAMP:
1187           chartype = prop->chartype;
1188           OK = chartype == ucp_Lu || chartype == ucp_Ll ||
1189                chartype == ucp_Lt;
1190           break;
1191 
1192           case PT_GC:
1193           OK = PRIV(ucp_gentype)[prop->chartype] == code[2];
1194           break;
1195 
1196           case PT_PC:
1197           OK = prop->chartype == code[2];
1198           break;
1199 
1200           case PT_SC:
1201           OK = prop->script == code[2];
1202           break;
1203 
1204           case PT_SCX:
1205           OK = (prop->script == code[2] ||
1206                 MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), code[2]) != 0);
1207           break;
1208 
1209           /* These are specials for combination cases. */
1210 
1211           case PT_ALNUM:
1212           chartype = prop->chartype;
1213           OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
1214                PRIV(ucp_gentype)[chartype] == ucp_N;
1215           break;
1216 
1217           /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1218           which means that Perl space and POSIX space are now identical. PCRE
1219           was changed at release 8.34. */
1220 
1221           case PT_SPACE:    /* Perl space */
1222           case PT_PXSPACE:  /* POSIX space */
1223           switch(c)
1224             {
1225             HSPACE_CASES:
1226             VSPACE_CASES:
1227             OK = TRUE;
1228             break;
1229 
1230             default:
1231             OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1232             break;
1233             }
1234           break;
1235 
1236           case PT_WORD:
1237           chartype = prop->chartype;
1238           OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
1239                PRIV(ucp_gentype)[chartype] == ucp_N ||
1240                chartype == ucp_Mn || chartype == ucp_Pc;
1241           break;
1242 
1243           case PT_CLIST:
1244 #if PCRE2_CODE_UNIT_WIDTH == 32
1245           if (c > MAX_UTF_CODE_POINT)
1246             {
1247             OK = FALSE;
1248             break;
1249             }
1250 #endif
1251           cp = PRIV(ucd_caseless_sets) + code[2];
1252           for (;;)
1253             {
1254             if (c < *cp) { OK = FALSE; break; }
1255             if (c == *cp++) { OK = TRUE; break; }
1256             }
1257           break;
1258 
1259           case PT_UCNC:
1260           OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1261                c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1262                c >= 0xe000;
1263           break;
1264 
1265           case PT_BIDICL:
1266           OK = UCD_BIDICLASS(c) == code[2];
1267           break;
1268 
1269           case PT_BOOL:
1270           OK = MAPBIT(PRIV(ucd_boolprop_sets) +
1271             UCD_BPROPS_PROP(prop), code[2]) != 0;
1272           break;
1273 
1274           /* Should never occur, but keep compilers from grumbling. */
1275 
1276           default:
1277           OK = codevalue != OP_PROP;
1278           break;
1279           }
1280 
1281         if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
1282         }
1283       break;
1284 #endif
1285 
1286 
1287 
1288 /* ========================================================================== */
1289       /* These opcodes likewise inspect the subject character, but have an
1290       argument that is not a data character. It is one of: OP_ANY, OP_ALLANY,
1291       OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE, OP_WORDCHAR,
1292       OP_NOT_WORDCHAR. The value is loaded into d. */
1293 
1294       case OP_TYPEPLUS:
1295       case OP_TYPEMINPLUS:
1296       case OP_TYPEPOSPLUS:
1297       count = current_state->count;  /* Already matched */
1298       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1299       if (clen > 0)
1300         {
1301         if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1302             (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1303             NLBLOCK->nltype == NLTYPE_FIXED &&
1304             NLBLOCK->nllen == 2 &&
1305             c == NLBLOCK->nl[0])
1306           {
1307           could_continue = partial_newline = TRUE;
1308           }
1309         else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1310             (c < 256 &&
1311               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1312               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1313           {
1314           if (count > 0 && codevalue == OP_TYPEPOSPLUS)
1315             {
1316             active_count--;            /* Remove non-match possibility */
1317             next_active_state--;
1318             }
1319           count++;
1320           ADD_NEW(state_offset, count);
1321           }
1322         }
1323       break;
1324 
1325       /*-----------------------------------------------------------------*/
1326       case OP_TYPEQUERY:
1327       case OP_TYPEMINQUERY:
1328       case OP_TYPEPOSQUERY:
1329       ADD_ACTIVE(state_offset + 2, 0);
1330       if (clen > 0)
1331         {
1332         if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1333             (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1334             NLBLOCK->nltype == NLTYPE_FIXED &&
1335             NLBLOCK->nllen == 2 &&
1336             c == NLBLOCK->nl[0])
1337           {
1338           could_continue = partial_newline = TRUE;
1339           }
1340         else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1341             (c < 256 &&
1342               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1343               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1344           {
1345           if (codevalue == OP_TYPEPOSQUERY)
1346             {
1347             active_count--;            /* Remove non-match possibility */
1348             next_active_state--;
1349             }
1350           ADD_NEW(state_offset + 2, 0);
1351           }
1352         }
1353       break;
1354 
1355       /*-----------------------------------------------------------------*/
1356       case OP_TYPESTAR:
1357       case OP_TYPEMINSTAR:
1358       case OP_TYPEPOSSTAR:
1359       ADD_ACTIVE(state_offset + 2, 0);
1360       if (clen > 0)
1361         {
1362         if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1363             (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1364             NLBLOCK->nltype == NLTYPE_FIXED &&
1365             NLBLOCK->nllen == 2 &&
1366             c == NLBLOCK->nl[0])
1367           {
1368           could_continue = partial_newline = TRUE;
1369           }
1370         else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1371             (c < 256 &&
1372               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1373               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1374           {
1375           if (codevalue == OP_TYPEPOSSTAR)
1376             {
1377             active_count--;            /* Remove non-match possibility */
1378             next_active_state--;
1379             }
1380           ADD_NEW(state_offset, 0);
1381           }
1382         }
1383       break;
1384 
1385       /*-----------------------------------------------------------------*/
1386       case OP_TYPEEXACT:
1387       count = current_state->count;  /* Number already matched */
1388       if (clen > 0)
1389         {
1390         if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1391             (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1392             NLBLOCK->nltype == NLTYPE_FIXED &&
1393             NLBLOCK->nllen == 2 &&
1394             c == NLBLOCK->nl[0])
1395           {
1396           could_continue = partial_newline = TRUE;
1397           }
1398         else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1399             (c < 256 &&
1400               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1401               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1402           {
1403           if (++count >= (int)GET2(code, 1))
1404             { ADD_NEW(state_offset + 1 + IMM2_SIZE + 1, 0); }
1405           else
1406             { ADD_NEW(state_offset, count); }
1407           }
1408         }
1409       break;
1410 
1411       /*-----------------------------------------------------------------*/
1412       case OP_TYPEUPTO:
1413       case OP_TYPEMINUPTO:
1414       case OP_TYPEPOSUPTO:
1415       ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0);
1416       count = current_state->count;  /* Number already matched */
1417       if (clen > 0)
1418         {
1419         if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1420             (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1421             NLBLOCK->nltype == NLTYPE_FIXED &&
1422             NLBLOCK->nllen == 2 &&
1423             c == NLBLOCK->nl[0])
1424           {
1425           could_continue = partial_newline = TRUE;
1426           }
1427         else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1428             (c < 256 &&
1429               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1430               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1431           {
1432           if (codevalue == OP_TYPEPOSUPTO)
1433             {
1434             active_count--;           /* Remove non-match possibility */
1435             next_active_state--;
1436             }
1437           if (++count >= (int)GET2(code, 1))
1438             { ADD_NEW(state_offset + 2 + IMM2_SIZE, 0); }
1439           else
1440             { ADD_NEW(state_offset, count); }
1441           }
1442         }
1443       break;
1444 
1445 /* ========================================================================== */
1446       /* These are virtual opcodes that are used when something like
1447       OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, OP_EXTUNI, or one of the
1448       [NOT_]HSPACE/[NOT_]VSPACE opcodes as its argument. This keeps the code
1449       above fast for the other cases. The argument is in the d variable. */
1450 
1451 #ifdef SUPPORT_UNICODE
1452       case OP_PROP_EXTRA + OP_TYPEPLUS:
1453       case OP_PROP_EXTRA + OP_TYPEMINPLUS:
1454       case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
1455       count = current_state->count;           /* Already matched */
1456       if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
1457       if (clen > 0)
1458         {
1459         BOOL OK;
1460         int chartype;
1461         const uint32_t *cp;
1462         const ucd_record * prop = GET_UCD(c);
1463         switch(code[2])
1464           {
1465           case PT_ANY:
1466           OK = TRUE;
1467           break;
1468 
1469           case PT_LAMP:
1470           chartype = prop->chartype;
1471           OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1472           break;
1473 
1474           case PT_GC:
1475           OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1476           break;
1477 
1478           case PT_PC:
1479           OK = prop->chartype == code[3];
1480           break;
1481 
1482           case PT_SC:
1483           OK = prop->script == code[3];
1484           break;
1485 
1486           case PT_SCX:
1487           OK = (prop->script == code[3] ||
1488                 MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), code[3]) != 0);
1489           break;
1490 
1491           /* These are specials for combination cases. */
1492 
1493           case PT_ALNUM:
1494           chartype = prop->chartype;
1495           OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
1496                PRIV(ucp_gentype)[chartype] == ucp_N;
1497           break;
1498 
1499           /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1500           which means that Perl space and POSIX space are now identical. PCRE
1501           was changed at release 8.34. */
1502 
1503           case PT_SPACE:    /* Perl space */
1504           case PT_PXSPACE:  /* POSIX space */
1505           switch(c)
1506             {
1507             HSPACE_CASES:
1508             VSPACE_CASES:
1509             OK = TRUE;
1510             break;
1511 
1512             default:
1513             OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1514             break;
1515             }
1516           break;
1517 
1518           case PT_WORD:
1519           chartype = prop->chartype;
1520           OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
1521                PRIV(ucp_gentype)[chartype] == ucp_N ||
1522                chartype == ucp_Mn || chartype == ucp_Pc;
1523           break;
1524 
1525           case PT_CLIST:
1526 #if PCRE2_CODE_UNIT_WIDTH == 32
1527           if (c > MAX_UTF_CODE_POINT)
1528             {
1529             OK = FALSE;
1530             break;
1531             }
1532 #endif
1533           cp = PRIV(ucd_caseless_sets) + code[3];
1534           for (;;)
1535             {
1536             if (c < *cp) { OK = FALSE; break; }
1537             if (c == *cp++) { OK = TRUE; break; }
1538             }
1539           break;
1540 
1541           case PT_UCNC:
1542           OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1543                c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1544                c >= 0xe000;
1545           break;
1546 
1547           case PT_BIDICL:
1548           OK = UCD_BIDICLASS(c) == code[3];
1549           break;
1550 
1551           case PT_BOOL:
1552           OK = MAPBIT(PRIV(ucd_boolprop_sets) +
1553             UCD_BPROPS_PROP(prop), code[3]) != 0;
1554           break;
1555 
1556           /* Should never occur, but keep compilers from grumbling. */
1557 
1558           default:
1559           OK = codevalue != OP_PROP;
1560           break;
1561           }
1562 
1563         if (OK == (d == OP_PROP))
1564           {
1565           if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1566             {
1567             active_count--;           /* Remove non-match possibility */
1568             next_active_state--;
1569             }
1570           count++;
1571           ADD_NEW(state_offset, count);
1572           }
1573         }
1574       break;
1575 
1576       /*-----------------------------------------------------------------*/
1577       case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1578       case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1579       case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1580       count = current_state->count;  /* Already matched */
1581       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1582       if (clen > 0)
1583         {
1584         int ncount = 0;
1585         if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1586           {
1587           active_count--;           /* Remove non-match possibility */
1588           next_active_state--;
1589           }
1590         (void)PRIV(extuni)(c, ptr + clen, mb->start_subject, end_subject, utf,
1591           &ncount);
1592         count++;
1593         ADD_NEW_DATA(-state_offset, count, ncount);
1594         }
1595       break;
1596 #endif
1597 
1598       /*-----------------------------------------------------------------*/
1599       case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1600       case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1601       case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1602       count = current_state->count;  /* Already matched */
1603       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1604       if (clen > 0)
1605         {
1606         int ncount = 0;
1607         switch (c)
1608           {
1609           case CHAR_VT:
1610           case CHAR_FF:
1611           case CHAR_NEL:
1612 #ifndef EBCDIC
1613           case 0x2028:
1614           case 0x2029:
1615 #endif  /* Not EBCDIC */
1616           if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
1617           goto ANYNL01;
1618 
1619           case CHAR_CR:
1620           if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
1621           /* Fall through */
1622 
1623           ANYNL01:
1624           case CHAR_LF:
1625           if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1626             {
1627             active_count--;           /* Remove non-match possibility */
1628             next_active_state--;
1629             }
1630           count++;
1631           ADD_NEW_DATA(-state_offset, count, ncount);
1632           break;
1633 
1634           default:
1635           break;
1636           }
1637         }
1638       break;
1639 
1640       /*-----------------------------------------------------------------*/
1641       case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1642       case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1643       case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1644       count = current_state->count;  /* Already matched */
1645       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1646       if (clen > 0)
1647         {
1648         BOOL OK;
1649         switch (c)
1650           {
1651           VSPACE_CASES:
1652           OK = TRUE;
1653           break;
1654 
1655           default:
1656           OK = FALSE;
1657           break;
1658           }
1659 
1660         if (OK == (d == OP_VSPACE))
1661           {
1662           if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1663             {
1664             active_count--;           /* Remove non-match possibility */
1665             next_active_state--;
1666             }
1667           count++;
1668           ADD_NEW_DATA(-state_offset, count, 0);
1669           }
1670         }
1671       break;
1672 
1673       /*-----------------------------------------------------------------*/
1674       case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1675       case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1676       case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1677       count = current_state->count;  /* Already matched */
1678       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1679       if (clen > 0)
1680         {
1681         BOOL OK;
1682         switch (c)
1683           {
1684           HSPACE_CASES:
1685           OK = TRUE;
1686           break;
1687 
1688           default:
1689           OK = FALSE;
1690           break;
1691           }
1692 
1693         if (OK == (d == OP_HSPACE))
1694           {
1695           if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1696             {
1697             active_count--;           /* Remove non-match possibility */
1698             next_active_state--;
1699             }
1700           count++;
1701           ADD_NEW_DATA(-state_offset, count, 0);
1702           }
1703         }
1704       break;
1705 
1706       /*-----------------------------------------------------------------*/
1707 #ifdef SUPPORT_UNICODE
1708       case OP_PROP_EXTRA + OP_TYPEQUERY:
1709       case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1710       case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1711       count = 4;
1712       goto QS1;
1713 
1714       case OP_PROP_EXTRA + OP_TYPESTAR:
1715       case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1716       case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1717       count = 0;
1718 
1719       QS1:
1720 
1721       ADD_ACTIVE(state_offset + 4, 0);
1722       if (clen > 0)
1723         {
1724         BOOL OK;
1725         int chartype;
1726         const uint32_t *cp;
1727         const ucd_record * prop = GET_UCD(c);
1728         switch(code[2])
1729           {
1730           case PT_ANY:
1731           OK = TRUE;
1732           break;
1733 
1734           case PT_LAMP:
1735           chartype = prop->chartype;
1736           OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1737           break;
1738 
1739           case PT_GC:
1740           OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1741           break;
1742 
1743           case PT_PC:
1744           OK = prop->chartype == code[3];
1745           break;
1746 
1747           case PT_SC:
1748           OK = prop->script == code[3];
1749           break;
1750 
1751           case PT_SCX:
1752           OK = (prop->script == code[3] ||
1753                 MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), code[3]) != 0);
1754           break;
1755 
1756           /* These are specials for combination cases. */
1757 
1758           case PT_ALNUM:
1759           chartype = prop->chartype;
1760           OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
1761                PRIV(ucp_gentype)[chartype] == ucp_N;
1762           break;
1763 
1764           /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1765           which means that Perl space and POSIX space are now identical. PCRE
1766           was changed at release 8.34. */
1767 
1768           case PT_SPACE:    /* Perl space */
1769           case PT_PXSPACE:  /* POSIX space */
1770           switch(c)
1771             {
1772             HSPACE_CASES:
1773             VSPACE_CASES:
1774             OK = TRUE;
1775             break;
1776 
1777             default:
1778             OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1779             break;
1780             }
1781           break;
1782 
1783           case PT_WORD:
1784           chartype = prop->chartype;
1785           OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
1786                PRIV(ucp_gentype)[chartype] == ucp_N ||
1787                chartype == ucp_Mn || chartype == ucp_Pc;
1788           break;
1789 
1790           case PT_CLIST:
1791 #if PCRE2_CODE_UNIT_WIDTH == 32
1792           if (c > MAX_UTF_CODE_POINT)
1793             {
1794             OK = FALSE;
1795             break;
1796             }
1797 #endif
1798           cp = PRIV(ucd_caseless_sets) + code[3];
1799           for (;;)
1800             {
1801             if (c < *cp) { OK = FALSE; break; }
1802             if (c == *cp++) { OK = TRUE; break; }
1803             }
1804           break;
1805 
1806           case PT_UCNC:
1807           OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1808                c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1809                c >= 0xe000;
1810           break;
1811 
1812           case PT_BIDICL:
1813           OK = UCD_BIDICLASS(c) == code[3];
1814           break;
1815 
1816           case PT_BOOL:
1817           OK = MAPBIT(PRIV(ucd_boolprop_sets) +
1818             UCD_BPROPS_PROP(prop), code[3]) != 0;
1819           break;
1820 
1821           /* Should never occur, but keep compilers from grumbling. */
1822 
1823           default:
1824           OK = codevalue != OP_PROP;
1825           break;
1826           }
1827 
1828         if (OK == (d == OP_PROP))
1829           {
1830           if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1831               codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1832             {
1833             active_count--;           /* Remove non-match possibility */
1834             next_active_state--;
1835             }
1836           ADD_NEW(state_offset + count, 0);
1837           }
1838         }
1839       break;
1840 
1841       /*-----------------------------------------------------------------*/
1842       case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1843       case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1844       case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1845       count = 2;
1846       goto QS2;
1847 
1848       case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1849       case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1850       case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1851       count = 0;
1852 
1853       QS2:
1854 
1855       ADD_ACTIVE(state_offset + 2, 0);
1856       if (clen > 0)
1857         {
1858         int ncount = 0;
1859         if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1860             codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1861           {
1862           active_count--;           /* Remove non-match possibility */
1863           next_active_state--;
1864           }
1865         (void)PRIV(extuni)(c, ptr + clen, mb->start_subject, end_subject, utf,
1866           &ncount);
1867         ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1868         }
1869       break;
1870 #endif
1871 
1872       /*-----------------------------------------------------------------*/
1873       case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1874       case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1875       case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1876       count = 2;
1877       goto QS3;
1878 
1879       case OP_ANYNL_EXTRA + OP_TYPESTAR:
1880       case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1881       case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1882       count = 0;
1883 
1884       QS3:
1885       ADD_ACTIVE(state_offset + 2, 0);
1886       if (clen > 0)
1887         {
1888         int ncount = 0;
1889         switch (c)
1890           {
1891           case CHAR_VT:
1892           case CHAR_FF:
1893           case CHAR_NEL:
1894 #ifndef EBCDIC
1895           case 0x2028:
1896           case 0x2029:
1897 #endif  /* Not EBCDIC */
1898           if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
1899           goto ANYNL02;
1900 
1901           case CHAR_CR:
1902           if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
1903           /* Fall through */
1904 
1905           ANYNL02:
1906           case CHAR_LF:
1907           if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1908               codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1909             {
1910             active_count--;           /* Remove non-match possibility */
1911             next_active_state--;
1912             }
1913           ADD_NEW_DATA(-(state_offset + (int)count), 0, ncount);
1914           break;
1915 
1916           default:
1917           break;
1918           }
1919         }
1920       break;
1921 
1922       /*-----------------------------------------------------------------*/
1923       case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1924       case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1925       case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1926       count = 2;
1927       goto QS4;
1928 
1929       case OP_VSPACE_EXTRA + OP_TYPESTAR:
1930       case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1931       case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1932       count = 0;
1933 
1934       QS4:
1935       ADD_ACTIVE(state_offset + 2, 0);
1936       if (clen > 0)
1937         {
1938         BOOL OK;
1939         switch (c)
1940           {
1941           VSPACE_CASES:
1942           OK = TRUE;
1943           break;
1944 
1945           default:
1946           OK = FALSE;
1947           break;
1948           }
1949         if (OK == (d == OP_VSPACE))
1950           {
1951           if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1952               codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1953             {
1954             active_count--;           /* Remove non-match possibility */
1955             next_active_state--;
1956             }
1957           ADD_NEW_DATA(-(state_offset + (int)count), 0, 0);
1958           }
1959         }
1960       break;
1961 
1962       /*-----------------------------------------------------------------*/
1963       case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1964       case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1965       case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1966       count = 2;
1967       goto QS5;
1968 
1969       case OP_HSPACE_EXTRA + OP_TYPESTAR:
1970       case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1971       case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1972       count = 0;
1973 
1974       QS5:
1975       ADD_ACTIVE(state_offset + 2, 0);
1976       if (clen > 0)
1977         {
1978         BOOL OK;
1979         switch (c)
1980           {
1981           HSPACE_CASES:
1982           OK = TRUE;
1983           break;
1984 
1985           default:
1986           OK = FALSE;
1987           break;
1988           }
1989 
1990         if (OK == (d == OP_HSPACE))
1991           {
1992           if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1993               codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1994             {
1995             active_count--;           /* Remove non-match possibility */
1996             next_active_state--;
1997             }
1998           ADD_NEW_DATA(-(state_offset + (int)count), 0, 0);
1999           }
2000         }
2001       break;
2002 
2003       /*-----------------------------------------------------------------*/
2004 #ifdef SUPPORT_UNICODE
2005       case OP_PROP_EXTRA + OP_TYPEEXACT:
2006       case OP_PROP_EXTRA + OP_TYPEUPTO:
2007       case OP_PROP_EXTRA + OP_TYPEMINUPTO:
2008       case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
2009       if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
2010         { ADD_ACTIVE(state_offset + 1 + IMM2_SIZE + 3, 0); }
2011       count = current_state->count;  /* Number already matched */
2012       if (clen > 0)
2013         {
2014         BOOL OK;
2015         int chartype;
2016         const uint32_t *cp;
2017         const ucd_record * prop = GET_UCD(c);
2018         switch(code[1 + IMM2_SIZE + 1])
2019           {
2020           case PT_ANY:
2021           OK = TRUE;
2022           break;
2023 
2024           case PT_LAMP:
2025           chartype = prop->chartype;
2026           OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
2027           break;
2028 
2029           case PT_GC:
2030           OK = PRIV(ucp_gentype)[prop->chartype] == code[1 + IMM2_SIZE + 2];
2031           break;
2032 
2033           case PT_PC:
2034           OK = prop->chartype == code[1 + IMM2_SIZE + 2];
2035           break;
2036 
2037           case PT_SC:
2038           OK = prop->script == code[1 + IMM2_SIZE + 2];
2039           break;
2040 
2041           case PT_SCX:
2042           OK = (prop->script == code[1 + IMM2_SIZE + 2] ||
2043                 MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop),
2044                   code[1 + IMM2_SIZE + 2]) != 0);
2045           break;
2046 
2047           /* These are specials for combination cases. */
2048 
2049           case PT_ALNUM:
2050           chartype = prop->chartype;
2051           OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
2052                PRIV(ucp_gentype)[chartype] == ucp_N;
2053           break;
2054 
2055           /* Perl space used to exclude VT, but from Perl 5.18 it is included,
2056           which means that Perl space and POSIX space are now identical. PCRE
2057           was changed at release 8.34. */
2058 
2059           case PT_SPACE:    /* Perl space */
2060           case PT_PXSPACE:  /* POSIX space */
2061           switch(c)
2062             {
2063             HSPACE_CASES:
2064             VSPACE_CASES:
2065             OK = TRUE;
2066             break;
2067 
2068             default:
2069             OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
2070             break;
2071             }
2072           break;
2073 
2074           case PT_WORD:
2075           chartype = prop->chartype;
2076           OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
2077                PRIV(ucp_gentype)[chartype] == ucp_N ||
2078                chartype == ucp_Mn || chartype == ucp_Pc;
2079           break;
2080 
2081           case PT_CLIST:
2082 #if PCRE2_CODE_UNIT_WIDTH == 32
2083           if (c > MAX_UTF_CODE_POINT)
2084             {
2085             OK = FALSE;
2086             break;
2087             }
2088 #endif
2089           cp = PRIV(ucd_caseless_sets) + code[1 + IMM2_SIZE + 2];
2090           for (;;)
2091             {
2092             if (c < *cp) { OK = FALSE; break; }
2093             if (c == *cp++) { OK = TRUE; break; }
2094             }
2095           break;
2096 
2097           case PT_UCNC:
2098           OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
2099                c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
2100                c >= 0xe000;
2101           break;
2102 
2103           case PT_BIDICL:
2104           OK = UCD_BIDICLASS(c) == code[1 + IMM2_SIZE + 2];
2105           break;
2106 
2107           case PT_BOOL:
2108           OK = MAPBIT(PRIV(ucd_boolprop_sets) +
2109             UCD_BPROPS_PROP(prop), code[1 + IMM2_SIZE + 2]) != 0;
2110           break;
2111 
2112           /* Should never occur, but keep compilers from grumbling. */
2113 
2114           default:
2115           OK = codevalue != OP_PROP;
2116           break;
2117           }
2118 
2119         if (OK == (d == OP_PROP))
2120           {
2121           if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
2122             {
2123             active_count--;           /* Remove non-match possibility */
2124             next_active_state--;
2125             }
2126           if (++count >= (int)GET2(code, 1))
2127             { ADD_NEW(state_offset + 1 + IMM2_SIZE + 3, 0); }
2128           else
2129             { ADD_NEW(state_offset, count); }
2130           }
2131         }
2132       break;
2133 
2134       /*-----------------------------------------------------------------*/
2135       case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
2136       case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
2137       case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
2138       case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
2139       if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
2140         { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2141       count = current_state->count;  /* Number already matched */
2142       if (clen > 0)
2143         {
2144         PCRE2_SPTR nptr;
2145         int ncount = 0;
2146         if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
2147           {
2148           active_count--;           /* Remove non-match possibility */
2149           next_active_state--;
2150           }
2151         nptr = PRIV(extuni)(c, ptr + clen, mb->start_subject, end_subject, utf,
2152           &ncount);
2153         if (nptr >= end_subject && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
2154             reset_could_continue = TRUE;
2155         if (++count >= (int)GET2(code, 1))
2156           { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
2157         else
2158           { ADD_NEW_DATA(-state_offset, count, ncount); }
2159         }
2160       break;
2161 #endif
2162 
2163       /*-----------------------------------------------------------------*/
2164       case OP_ANYNL_EXTRA + OP_TYPEEXACT:
2165       case OP_ANYNL_EXTRA + OP_TYPEUPTO:
2166       case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
2167       case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
2168       if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
2169         { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2170       count = current_state->count;  /* Number already matched */
2171       if (clen > 0)
2172         {
2173         int ncount = 0;
2174         switch (c)
2175           {
2176           case CHAR_VT:
2177           case CHAR_FF:
2178           case CHAR_NEL:
2179 #ifndef EBCDIC
2180           case 0x2028:
2181           case 0x2029:
2182 #endif  /* Not EBCDIC */
2183           if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
2184           goto ANYNL03;
2185 
2186           case CHAR_CR:
2187           if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
2188           /* Fall through */
2189 
2190           ANYNL03:
2191           case CHAR_LF:
2192           if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
2193             {
2194             active_count--;           /* Remove non-match possibility */
2195             next_active_state--;
2196             }
2197           if (++count >= (int)GET2(code, 1))
2198             { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
2199           else
2200             { ADD_NEW_DATA(-state_offset, count, ncount); }
2201           break;
2202 
2203           default:
2204           break;
2205           }
2206         }
2207       break;
2208 
2209       /*-----------------------------------------------------------------*/
2210       case OP_VSPACE_EXTRA + OP_TYPEEXACT:
2211       case OP_VSPACE_EXTRA + OP_TYPEUPTO:
2212       case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
2213       case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
2214       if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
2215         { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2216       count = current_state->count;  /* Number already matched */
2217       if (clen > 0)
2218         {
2219         BOOL OK;
2220         switch (c)
2221           {
2222           VSPACE_CASES:
2223           OK = TRUE;
2224           break;
2225 
2226           default:
2227           OK = FALSE;
2228           }
2229 
2230         if (OK == (d == OP_VSPACE))
2231           {
2232           if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
2233             {
2234             active_count--;           /* Remove non-match possibility */
2235             next_active_state--;
2236             }
2237           if (++count >= (int)GET2(code, 1))
2238             { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2239           else
2240             { ADD_NEW_DATA(-state_offset, count, 0); }
2241           }
2242         }
2243       break;
2244 
2245       /*-----------------------------------------------------------------*/
2246       case OP_HSPACE_EXTRA + OP_TYPEEXACT:
2247       case OP_HSPACE_EXTRA + OP_TYPEUPTO:
2248       case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
2249       case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
2250       if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
2251         { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2252       count = current_state->count;  /* Number already matched */
2253       if (clen > 0)
2254         {
2255         BOOL OK;
2256         switch (c)
2257           {
2258           HSPACE_CASES:
2259           OK = TRUE;
2260           break;
2261 
2262           default:
2263           OK = FALSE;
2264           break;
2265           }
2266 
2267         if (OK == (d == OP_HSPACE))
2268           {
2269           if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
2270             {
2271             active_count--;           /* Remove non-match possibility */
2272             next_active_state--;
2273             }
2274           if (++count >= (int)GET2(code, 1))
2275             { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2276           else
2277             { ADD_NEW_DATA(-state_offset, count, 0); }
2278           }
2279         }
2280       break;
2281 
2282 /* ========================================================================== */
2283       /* These opcodes are followed by a character that is usually compared
2284       to the current subject character; it is loaded into d. We still get
2285       here even if there is no subject character, because in some cases zero
2286       repetitions are permitted. */
2287 
2288       /*-----------------------------------------------------------------*/
2289       case OP_CHAR:
2290       if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
2291       break;
2292 
2293       /*-----------------------------------------------------------------*/
2294       case OP_CHARI:
2295       if (clen == 0) break;
2296 
2297 #ifdef SUPPORT_UNICODE
2298       if (utf_or_ucp)
2299         {
2300         if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
2301           {
2302           unsigned int othercase;
2303           if (c < 128)
2304             othercase = fcc[c];
2305           else
2306             othercase = UCD_OTHERCASE(c);
2307           if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
2308           }
2309         }
2310       else
2311 #endif  /* SUPPORT_UNICODE */
2312       /* Not UTF or UCP mode */
2313         {
2314         if (TABLE_GET(c, lcc, c) == TABLE_GET(d, lcc, d))
2315           { ADD_NEW(state_offset + 2, 0); }
2316         }
2317       break;
2318 
2319 
2320 #ifdef SUPPORT_UNICODE
2321       /*-----------------------------------------------------------------*/
2322       /* This is a tricky one because it can match more than one character.
2323       Find out how many characters to skip, and then set up a negative state
2324       to wait for them to pass before continuing. */
2325 
2326       case OP_EXTUNI:
2327       if (clen > 0)
2328         {
2329         int ncount = 0;
2330         PCRE2_SPTR nptr = PRIV(extuni)(c, ptr + clen, mb->start_subject,
2331           end_subject, utf, &ncount);
2332         if (nptr >= end_subject && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
2333             reset_could_continue = TRUE;
2334         ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
2335         }
2336       break;
2337 #endif
2338 
2339       /*-----------------------------------------------------------------*/
2340       /* This is tricky like EXTUNI because it too can match more than one
2341       character (when CR is followed by LF). In this case, set up a negative
2342       state to wait for one character to pass before continuing. */
2343 
2344       case OP_ANYNL:
2345       if (clen > 0) switch(c)
2346         {
2347         case CHAR_VT:
2348         case CHAR_FF:
2349         case CHAR_NEL:
2350 #ifndef EBCDIC
2351         case 0x2028:
2352         case 0x2029:
2353 #endif  /* Not EBCDIC */
2354         if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
2355         /* Fall through */
2356 
2357         case CHAR_LF:
2358         ADD_NEW(state_offset + 1, 0);
2359         break;
2360 
2361         case CHAR_CR:
2362         if (ptr + 1 >= end_subject)
2363           {
2364           ADD_NEW(state_offset + 1, 0);
2365           if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
2366             reset_could_continue = TRUE;
2367           }
2368         else if (UCHAR21TEST(ptr + 1) == CHAR_LF)
2369           {
2370           ADD_NEW_DATA(-(state_offset + 1), 0, 1);
2371           }
2372         else
2373           {
2374           ADD_NEW(state_offset + 1, 0);
2375           }
2376         break;
2377         }
2378       break;
2379 
2380       /*-----------------------------------------------------------------*/
2381       case OP_NOT_VSPACE:
2382       if (clen > 0) switch(c)
2383         {
2384         VSPACE_CASES:
2385         break;
2386 
2387         default:
2388         ADD_NEW(state_offset + 1, 0);
2389         break;
2390         }
2391       break;
2392 
2393       /*-----------------------------------------------------------------*/
2394       case OP_VSPACE:
2395       if (clen > 0) switch(c)
2396         {
2397         VSPACE_CASES:
2398         ADD_NEW(state_offset + 1, 0);
2399         break;
2400 
2401         default:
2402         break;
2403         }
2404       break;
2405 
2406       /*-----------------------------------------------------------------*/
2407       case OP_NOT_HSPACE:
2408       if (clen > 0) switch(c)
2409         {
2410         HSPACE_CASES:
2411         break;
2412 
2413         default:
2414         ADD_NEW(state_offset + 1, 0);
2415         break;
2416         }
2417       break;
2418 
2419       /*-----------------------------------------------------------------*/
2420       case OP_HSPACE:
2421       if (clen > 0) switch(c)
2422         {
2423         HSPACE_CASES:
2424         ADD_NEW(state_offset + 1, 0);
2425         break;
2426 
2427         default:
2428         break;
2429         }
2430       break;
2431 
2432       /*-----------------------------------------------------------------*/
2433       /* Match a negated single character casefully. */
2434 
2435       case OP_NOT:
2436       if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); }
2437       break;
2438 
2439       /*-----------------------------------------------------------------*/
2440       /* Match a negated single character caselessly. */
2441 
2442       case OP_NOTI:
2443       if (clen > 0)
2444         {
2445         uint32_t otherd;
2446 #ifdef SUPPORT_UNICODE
2447         if (utf_or_ucp && d >= 128)
2448           otherd = UCD_OTHERCASE(d);
2449         else
2450 #endif  /* SUPPORT_UNICODE */
2451         otherd = TABLE_GET(d, fcc, d);
2452         if (c != d && c != otherd)
2453           { ADD_NEW(state_offset + dlen + 1, 0); }
2454         }
2455       break;
2456 
2457       /*-----------------------------------------------------------------*/
2458       case OP_PLUSI:
2459       case OP_MINPLUSI:
2460       case OP_POSPLUSI:
2461       case OP_NOTPLUSI:
2462       case OP_NOTMINPLUSI:
2463       case OP_NOTPOSPLUSI:
2464       caseless = TRUE;
2465       codevalue -= OP_STARI - OP_STAR;
2466 
2467       /* Fall through */
2468       case OP_PLUS:
2469       case OP_MINPLUS:
2470       case OP_POSPLUS:
2471       case OP_NOTPLUS:
2472       case OP_NOTMINPLUS:
2473       case OP_NOTPOSPLUS:
2474       count = current_state->count;  /* Already matched */
2475       if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
2476       if (clen > 0)
2477         {
2478         uint32_t otherd = NOTACHAR;
2479         if (caseless)
2480           {
2481 #ifdef SUPPORT_UNICODE
2482           if (utf_or_ucp && d >= 128)
2483             otherd = UCD_OTHERCASE(d);
2484           else
2485 #endif  /* SUPPORT_UNICODE */
2486           otherd = TABLE_GET(d, fcc, d);
2487           }
2488         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2489           {
2490           if (count > 0 &&
2491               (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
2492             {
2493             active_count--;             /* Remove non-match possibility */
2494             next_active_state--;
2495             }
2496           count++;
2497           ADD_NEW(state_offset, count);
2498           }
2499         }
2500       break;
2501 
2502       /*-----------------------------------------------------------------*/
2503       case OP_QUERYI:
2504       case OP_MINQUERYI:
2505       case OP_POSQUERYI:
2506       case OP_NOTQUERYI:
2507       case OP_NOTMINQUERYI:
2508       case OP_NOTPOSQUERYI:
2509       caseless = TRUE;
2510       codevalue -= OP_STARI - OP_STAR;
2511       /* Fall through */
2512       case OP_QUERY:
2513       case OP_MINQUERY:
2514       case OP_POSQUERY:
2515       case OP_NOTQUERY:
2516       case OP_NOTMINQUERY:
2517       case OP_NOTPOSQUERY:
2518       ADD_ACTIVE(state_offset + dlen + 1, 0);
2519       if (clen > 0)
2520         {
2521         uint32_t otherd = NOTACHAR;
2522         if (caseless)
2523           {
2524 #ifdef SUPPORT_UNICODE
2525           if (utf_or_ucp && d >= 128)
2526             otherd = UCD_OTHERCASE(d);
2527           else
2528 #endif  /* SUPPORT_UNICODE */
2529           otherd = TABLE_GET(d, fcc, d);
2530           }
2531         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2532           {
2533           if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
2534             {
2535             active_count--;            /* Remove non-match possibility */
2536             next_active_state--;
2537             }
2538           ADD_NEW(state_offset + dlen + 1, 0);
2539           }
2540         }
2541       break;
2542 
2543       /*-----------------------------------------------------------------*/
2544       case OP_STARI:
2545       case OP_MINSTARI:
2546       case OP_POSSTARI:
2547       case OP_NOTSTARI:
2548       case OP_NOTMINSTARI:
2549       case OP_NOTPOSSTARI:
2550       caseless = TRUE;
2551       codevalue -= OP_STARI - OP_STAR;
2552       /* Fall through */
2553       case OP_STAR:
2554       case OP_MINSTAR:
2555       case OP_POSSTAR:
2556       case OP_NOTSTAR:
2557       case OP_NOTMINSTAR:
2558       case OP_NOTPOSSTAR:
2559       ADD_ACTIVE(state_offset + dlen + 1, 0);
2560       if (clen > 0)
2561         {
2562         uint32_t otherd = NOTACHAR;
2563         if (caseless)
2564           {
2565 #ifdef SUPPORT_UNICODE
2566           if (utf_or_ucp && d >= 128)
2567             otherd = UCD_OTHERCASE(d);
2568           else
2569 #endif  /* SUPPORT_UNICODE */
2570           otherd = TABLE_GET(d, fcc, d);
2571           }
2572         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2573           {
2574           if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
2575             {
2576             active_count--;            /* Remove non-match possibility */
2577             next_active_state--;
2578             }
2579           ADD_NEW(state_offset, 0);
2580           }
2581         }
2582       break;
2583 
2584       /*-----------------------------------------------------------------*/
2585       case OP_EXACTI:
2586       case OP_NOTEXACTI:
2587       caseless = TRUE;
2588       codevalue -= OP_STARI - OP_STAR;
2589       /* Fall through */
2590       case OP_EXACT:
2591       case OP_NOTEXACT:
2592       count = current_state->count;  /* Number already matched */
2593       if (clen > 0)
2594         {
2595         uint32_t otherd = NOTACHAR;
2596         if (caseless)
2597           {
2598 #ifdef SUPPORT_UNICODE
2599           if (utf_or_ucp && d >= 128)
2600             otherd = UCD_OTHERCASE(d);
2601           else
2602 #endif  /* SUPPORT_UNICODE */
2603           otherd = TABLE_GET(d, fcc, d);
2604           }
2605         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2606           {
2607           if (++count >= (int)GET2(code, 1))
2608             { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2609           else
2610             { ADD_NEW(state_offset, count); }
2611           }
2612         }
2613       break;
2614 
2615       /*-----------------------------------------------------------------*/
2616       case OP_UPTOI:
2617       case OP_MINUPTOI:
2618       case OP_POSUPTOI:
2619       case OP_NOTUPTOI:
2620       case OP_NOTMINUPTOI:
2621       case OP_NOTPOSUPTOI:
2622       caseless = TRUE;
2623       codevalue -= OP_STARI - OP_STAR;
2624       /* Fall through */
2625       case OP_UPTO:
2626       case OP_MINUPTO:
2627       case OP_POSUPTO:
2628       case OP_NOTUPTO:
2629       case OP_NOTMINUPTO:
2630       case OP_NOTPOSUPTO:
2631       ADD_ACTIVE(state_offset + dlen + 1 + IMM2_SIZE, 0);
2632       count = current_state->count;  /* Number already matched */
2633       if (clen > 0)
2634         {
2635         uint32_t otherd = NOTACHAR;
2636         if (caseless)
2637           {
2638 #ifdef SUPPORT_UNICODE
2639           if (utf_or_ucp && d >= 128)
2640             otherd = UCD_OTHERCASE(d);
2641           else
2642 #endif  /* SUPPORT_UNICODE */
2643           otherd = TABLE_GET(d, fcc, d);
2644           }
2645         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2646           {
2647           if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2648             {
2649             active_count--;             /* Remove non-match possibility */
2650             next_active_state--;
2651             }
2652           if (++count >= (int)GET2(code, 1))
2653             { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2654           else
2655             { ADD_NEW(state_offset, count); }
2656           }
2657         }
2658       break;
2659 
2660 
2661 /* ========================================================================== */
2662       /* These are the class-handling opcodes */
2663 
2664       case OP_CLASS:
2665       case OP_NCLASS:
2666       case OP_XCLASS:
2667         {
2668         BOOL isinclass = FALSE;
2669         int next_state_offset;
2670         PCRE2_SPTR ecode;
2671 
2672         /* For a simple class, there is always just a 32-byte table, and we
2673         can set isinclass from it. */
2674 
2675         if (codevalue != OP_XCLASS)
2676           {
2677           ecode = code + 1 + (32 / sizeof(PCRE2_UCHAR));
2678           if (clen > 0)
2679             {
2680             isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2681               ((((uint8_t *)(code + 1))[c/8] & (1u << (c&7))) != 0);
2682             }
2683           }
2684 
2685         /* An extended class may have a table or a list of single characters,
2686         ranges, or both, and it may be positive or negative. There's a
2687         function that sorts all this out. */
2688 
2689         else
2690          {
2691          ecode = code + GET(code, 1);
2692          if (clen > 0) isinclass = PRIV(xclass)(c, code + 1 + LINK_SIZE, utf);
2693          }
2694 
2695         /* At this point, isinclass is set for all kinds of class, and ecode
2696         points to the byte after the end of the class. If there is a
2697         quantifier, this is where it will be. */
2698 
2699         next_state_offset = (int)(ecode - start_code);
2700 
2701         switch (*ecode)
2702           {
2703           case OP_CRSTAR:
2704           case OP_CRMINSTAR:
2705           case OP_CRPOSSTAR:
2706           ADD_ACTIVE(next_state_offset + 1, 0);
2707           if (isinclass)
2708             {
2709             if (*ecode == OP_CRPOSSTAR)
2710               {
2711               active_count--;           /* Remove non-match possibility */
2712               next_active_state--;
2713               }
2714             ADD_NEW(state_offset, 0);
2715             }
2716           break;
2717 
2718           case OP_CRPLUS:
2719           case OP_CRMINPLUS:
2720           case OP_CRPOSPLUS:
2721           count = current_state->count;  /* Already matched */
2722           if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2723           if (isinclass)
2724             {
2725             if (count > 0 && *ecode == OP_CRPOSPLUS)
2726               {
2727               active_count--;           /* Remove non-match possibility */
2728               next_active_state--;
2729               }
2730             count++;
2731             ADD_NEW(state_offset, count);
2732             }
2733           break;
2734 
2735           case OP_CRQUERY:
2736           case OP_CRMINQUERY:
2737           case OP_CRPOSQUERY:
2738           ADD_ACTIVE(next_state_offset + 1, 0);
2739           if (isinclass)
2740             {
2741             if (*ecode == OP_CRPOSQUERY)
2742               {
2743               active_count--;           /* Remove non-match possibility */
2744               next_active_state--;
2745               }
2746             ADD_NEW(next_state_offset + 1, 0);
2747             }
2748           break;
2749 
2750           case OP_CRRANGE:
2751           case OP_CRMINRANGE:
2752           case OP_CRPOSRANGE:
2753           count = current_state->count;  /* Already matched */
2754           if (count >= (int)GET2(ecode, 1))
2755             { ADD_ACTIVE(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2756           if (isinclass)
2757             {
2758             int max = (int)GET2(ecode, 1 + IMM2_SIZE);
2759 
2760             if (*ecode == OP_CRPOSRANGE && count >= (int)GET2(ecode, 1))
2761               {
2762               active_count--;           /* Remove non-match possibility */
2763               next_active_state--;
2764               }
2765 
2766             if (++count >= max && max != 0)   /* Max 0 => no limit */
2767               { ADD_NEW(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2768             else
2769               { ADD_NEW(state_offset, count); }
2770             }
2771           break;
2772 
2773           default:
2774           if (isinclass) { ADD_NEW(next_state_offset, 0); }
2775           break;
2776           }
2777         }
2778       break;
2779 
2780 /* ========================================================================== */
2781       /* These are the opcodes for fancy brackets of various kinds. We have
2782       to use recursion in order to handle them. The "always failing" assertion
2783       (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2784       though the other "backtracking verbs" are not supported. */
2785 
2786       case OP_FAIL:
2787       forced_fail++;    /* Count FAILs for multiple states */
2788       break;
2789 
2790       case OP_ASSERT:
2791       case OP_ASSERT_NOT:
2792       case OP_ASSERTBACK:
2793       case OP_ASSERTBACK_NOT:
2794         {
2795         int rc;
2796         int *local_workspace;
2797         PCRE2_SIZE *local_offsets;
2798         PCRE2_SPTR endasscode = code + GET(code, 1);
2799         RWS_anchor *rws = (RWS_anchor *)RWS;
2800 
2801         if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE)
2802           {
2803           rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb);
2804           if (rc != 0) return rc;
2805           RWS = (int *)rws;
2806           }
2807 
2808         local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
2809         local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE;
2810         rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE;
2811 
2812         while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2813 
2814         rc = internal_dfa_match(
2815           mb,                                   /* static match data */
2816           code,                                 /* this subexpression's code */
2817           ptr,                                  /* where we currently are */
2818           (PCRE2_SIZE)(ptr - start_subject),    /* start offset */
2819           local_offsets,                        /* offset vector */
2820           RWS_OVEC_OSIZE/OVEC_UNIT,             /* size of same */
2821           local_workspace,                      /* workspace vector */
2822           RWS_RSIZE,                            /* size of same */
2823           rlevel,                               /* function recursion level */
2824           RWS);                                 /* recursion workspace */
2825 
2826         rws->free += RWS_RSIZE + RWS_OVEC_OSIZE;
2827 
2828         if (rc < 0 && rc != PCRE2_ERROR_NOMATCH) return rc;
2829         if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2830             { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2831         }
2832       break;
2833 
2834       /*-----------------------------------------------------------------*/
2835       case OP_COND:
2836       case OP_SCOND:
2837         {
2838         int codelink = (int)GET(code, 1);
2839         PCRE2_UCHAR condcode;
2840 
2841         /* Because of the way auto-callout works during compile, a callout item
2842         is inserted between OP_COND and an assertion condition. This does not
2843         happen for the other conditions. */
2844 
2845         if (code[LINK_SIZE + 1] == OP_CALLOUT
2846             || code[LINK_SIZE + 1] == OP_CALLOUT_STR)
2847           {
2848           PCRE2_SIZE callout_length;
2849           rrc = do_callout_dfa(code, offsets, current_subject, ptr, mb,
2850             1 + LINK_SIZE, &callout_length);
2851           if (rrc < 0) return rrc;                 /* Abandon */
2852           if (rrc > 0) break;                      /* Fail this thread */
2853           code += callout_length;                  /* Skip callout data */
2854           }
2855 
2856         condcode = code[LINK_SIZE+1];
2857 
2858         /* Back reference conditions and duplicate named recursion conditions
2859         are not supported */
2860 
2861         if (condcode == OP_CREF || condcode == OP_DNCREF ||
2862             condcode == OP_DNRREF)
2863           return PCRE2_ERROR_DFA_UCOND;
2864 
2865         /* The DEFINE condition is always false, and the assertion (?!) is
2866         converted to OP_FAIL. */
2867 
2868         if (condcode == OP_FALSE || condcode == OP_FAIL)
2869           { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2870 
2871         /* There is also an always-true condition */
2872 
2873         else if (condcode == OP_TRUE)
2874           { ADD_ACTIVE(state_offset + LINK_SIZE + 2, 0); }
2875 
2876         /* The only supported version of OP_RREF is for the value RREF_ANY,
2877         which means "test if in any recursion". We can't test for specifically
2878         recursed groups. */
2879 
2880         else if (condcode == OP_RREF)
2881           {
2882           unsigned int value = GET2(code, LINK_SIZE + 2);
2883           if (value != RREF_ANY) return PCRE2_ERROR_DFA_UCOND;
2884           if (mb->recursive != NULL)
2885             { ADD_ACTIVE(state_offset + LINK_SIZE + 2 + IMM2_SIZE, 0); }
2886           else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2887           }
2888 
2889         /* Otherwise, the condition is an assertion */
2890 
2891         else
2892           {
2893           int rc;
2894           int *local_workspace;
2895           PCRE2_SIZE *local_offsets;
2896           PCRE2_SPTR asscode = code + LINK_SIZE + 1;
2897           PCRE2_SPTR endasscode = asscode + GET(asscode, 1);
2898           RWS_anchor *rws = (RWS_anchor *)RWS;
2899 
2900           if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE)
2901             {
2902             rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb);
2903             if (rc != 0) return rc;
2904             RWS = (int *)rws;
2905             }
2906 
2907           local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
2908           local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE;
2909           rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE;
2910 
2911           while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2912 
2913           rc = internal_dfa_match(
2914             mb,                                   /* fixed match data */
2915             asscode,                              /* this subexpression's code */
2916             ptr,                                  /* where we currently are */
2917             (PCRE2_SIZE)(ptr - start_subject),    /* start offset */
2918             local_offsets,                        /* offset vector */
2919             RWS_OVEC_OSIZE/OVEC_UNIT,             /* size of same */
2920             local_workspace,                      /* workspace vector */
2921             RWS_RSIZE,                            /* size of same */
2922             rlevel,                               /* function recursion level */
2923             RWS);                                 /* recursion workspace */
2924 
2925           rws->free += RWS_RSIZE + RWS_OVEC_OSIZE;
2926 
2927           if (rc < 0 && rc != PCRE2_ERROR_NOMATCH) return rc;
2928           if ((rc >= 0) ==
2929                 (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2930             { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2931           else
2932             { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2933           }
2934         }
2935       break;
2936 
2937       /*-----------------------------------------------------------------*/
2938       case OP_RECURSE:
2939         {
2940         int rc;
2941         int *local_workspace;
2942         PCRE2_SIZE *local_offsets;
2943         RWS_anchor *rws = (RWS_anchor *)RWS;
2944         PCRE2_SPTR callpat = start_code + GET(code, 1);
2945         uint32_t recno = (callpat == mb->start_code)? 0 :
2946           GET2(callpat, 1 + LINK_SIZE);
2947 
2948         if (rws->free < RWS_RSIZE + RWS_OVEC_RSIZE)
2949           {
2950           rc = more_workspace(&rws, RWS_OVEC_RSIZE, mb);
2951           if (rc != 0) return rc;
2952           RWS = (int *)rws;
2953           }
2954 
2955         local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
2956         local_workspace = ((int *)local_offsets) + RWS_OVEC_RSIZE;
2957         rws->free -= RWS_RSIZE + RWS_OVEC_RSIZE;
2958 
2959         /* Check for repeating a recursion without advancing the subject
2960         pointer or last used character. This should catch convoluted mutual
2961         recursions. (Some simple cases are caught at compile time.) */
2962 
2963         for (dfa_recursion_info *ri = mb->recursive;
2964              ri != NULL;
2965              ri = ri->prevrec)
2966           {
2967           if (recno == ri->group_num && ptr == ri->subject_position &&
2968               mb->last_used_ptr == ri->last_used_ptr)
2969             return PCRE2_ERROR_RECURSELOOP;
2970           }
2971 
2972         /* Remember this recursion and where we started it so as to
2973         catch infinite loops. */
2974 
2975         new_recursive.group_num = recno;
2976         new_recursive.subject_position = ptr;
2977         new_recursive.last_used_ptr = mb->last_used_ptr;
2978         new_recursive.prevrec = mb->recursive;
2979         mb->recursive = &new_recursive;
2980 
2981         rc = internal_dfa_match(
2982           mb,                                   /* fixed match data */
2983           callpat,                              /* this subexpression's code */
2984           ptr,                                  /* where we currently are */
2985           (PCRE2_SIZE)(ptr - start_subject),    /* start offset */
2986           local_offsets,                        /* offset vector */
2987           RWS_OVEC_RSIZE/OVEC_UNIT,             /* size of same */
2988           local_workspace,                      /* workspace vector */
2989           RWS_RSIZE,                            /* size of same */
2990           rlevel,                               /* function recursion level */
2991           RWS);                                 /* recursion workspace */
2992 
2993         rws->free += RWS_RSIZE + RWS_OVEC_RSIZE;
2994         mb->recursive = new_recursive.prevrec;  /* Done this recursion */
2995 
2996         /* Ran out of internal offsets */
2997 
2998         if (rc == 0) return PCRE2_ERROR_DFA_RECURSE;
2999 
3000         /* For each successfully matched substring, set up the next state with a
3001         count of characters to skip before trying it. Note that the count is in
3002         characters, not bytes. */
3003 
3004         if (rc > 0)
3005           {
3006           for (rc = rc*2 - 2; rc >= 0; rc -= 2)
3007             {
3008             PCRE2_SIZE charcount = local_offsets[rc+1] - local_offsets[rc];
3009 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
3010             if (utf)
3011               {
3012               PCRE2_SPTR p = start_subject + local_offsets[rc];
3013               PCRE2_SPTR pp = start_subject + local_offsets[rc+1];
3014               while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--;
3015               }
3016 #endif
3017             if (charcount > 0)
3018               {
3019               ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0,
3020                 (int)(charcount - 1));
3021               }
3022             else
3023               {
3024               ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
3025               }
3026             }
3027           }
3028         else if (rc != PCRE2_ERROR_NOMATCH) return rc;
3029         }
3030       break;
3031 
3032       /*-----------------------------------------------------------------*/
3033       case OP_BRAPOS:
3034       case OP_SBRAPOS:
3035       case OP_CBRAPOS:
3036       case OP_SCBRAPOS:
3037       case OP_BRAPOSZERO:
3038         {
3039         int rc;
3040         int *local_workspace;
3041         PCRE2_SIZE *local_offsets;
3042         PCRE2_SIZE charcount, matched_count;
3043         PCRE2_SPTR local_ptr = ptr;
3044         RWS_anchor *rws = (RWS_anchor *)RWS;
3045         BOOL allow_zero;
3046 
3047         if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE)
3048           {
3049           rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb);
3050           if (rc != 0) return rc;
3051           RWS = (int *)rws;
3052           }
3053 
3054         local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
3055         local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE;
3056         rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE;
3057 
3058         if (codevalue == OP_BRAPOSZERO)
3059           {
3060           allow_zero = TRUE;
3061           codevalue = *(++code);  /* Codevalue will be one of above BRAs */
3062           }
3063         else allow_zero = FALSE;
3064 
3065         /* Loop to match the subpattern as many times as possible as if it were
3066         a complete pattern. */
3067 
3068         for (matched_count = 0;; matched_count++)
3069           {
3070           rc = internal_dfa_match(
3071             mb,                                   /* fixed match data */
3072             code,                                 /* this subexpression's code */
3073             local_ptr,                            /* where we currently are */
3074             (PCRE2_SIZE)(ptr - start_subject),    /* start offset */
3075             local_offsets,                        /* offset vector */
3076             RWS_OVEC_OSIZE/OVEC_UNIT,             /* size of same */
3077             local_workspace,                      /* workspace vector */
3078             RWS_RSIZE,                            /* size of same */
3079             rlevel,                               /* function recursion level */
3080             RWS);                                 /* recursion workspace */
3081 
3082           /* Failed to match */
3083 
3084           if (rc < 0)
3085             {
3086             if (rc != PCRE2_ERROR_NOMATCH) return rc;
3087             break;
3088             }
3089 
3090           /* Matched: break the loop if zero characters matched. */
3091 
3092           charcount = local_offsets[1] - local_offsets[0];
3093           if (charcount == 0) break;
3094           local_ptr += charcount;    /* Advance temporary position ptr */
3095           }
3096 
3097         rws->free += RWS_RSIZE + RWS_OVEC_OSIZE;
3098 
3099         /* At this point we have matched the subpattern matched_count
3100         times, and local_ptr is pointing to the character after the end of the
3101         last match. */
3102 
3103         if (matched_count > 0 || allow_zero)
3104           {
3105           PCRE2_SPTR end_subpattern = code;
3106           int next_state_offset;
3107 
3108           do { end_subpattern += GET(end_subpattern, 1); }
3109             while (*end_subpattern == OP_ALT);
3110           next_state_offset =
3111             (int)(end_subpattern - start_code + LINK_SIZE + 1);
3112 
3113           /* Optimization: if there are no more active states, and there
3114           are no new states yet set up, then skip over the subject string
3115           right here, to save looping. Otherwise, set up the new state to swing
3116           into action when the end of the matched substring is reached. */
3117 
3118           if (i + 1 >= active_count && new_count == 0)
3119             {
3120             ptr = local_ptr;
3121             clen = 0;
3122             ADD_NEW(next_state_offset, 0);
3123             }
3124           else
3125             {
3126             PCRE2_SPTR p = ptr;
3127             PCRE2_SPTR pp = local_ptr;
3128             charcount = (PCRE2_SIZE)(pp - p);
3129 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
3130             if (utf) while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--;
3131 #endif
3132             ADD_NEW_DATA(-next_state_offset, 0, (int)(charcount - 1));
3133             }
3134           }
3135         }
3136       break;
3137 
3138       /*-----------------------------------------------------------------*/
3139       case OP_ONCE:
3140         {
3141         int rc;
3142         int *local_workspace;
3143         PCRE2_SIZE *local_offsets;
3144         RWS_anchor *rws = (RWS_anchor *)RWS;
3145 
3146         if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE)
3147           {
3148           rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb);
3149           if (rc != 0) return rc;
3150           RWS = (int *)rws;
3151           }
3152 
3153         local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
3154         local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE;
3155         rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE;
3156 
3157         rc = internal_dfa_match(
3158           mb,                                   /* fixed match data */
3159           code,                                 /* this subexpression's code */
3160           ptr,                                  /* where we currently are */
3161           (PCRE2_SIZE)(ptr - start_subject),    /* start offset */
3162           local_offsets,                        /* offset vector */
3163           RWS_OVEC_OSIZE/OVEC_UNIT,             /* size of same */
3164           local_workspace,                      /* workspace vector */
3165           RWS_RSIZE,                            /* size of same */
3166           rlevel,                               /* function recursion level */
3167           RWS);                                 /* recursion workspace */
3168 
3169         rws->free += RWS_RSIZE + RWS_OVEC_OSIZE;
3170 
3171         if (rc >= 0)
3172           {
3173           PCRE2_SPTR end_subpattern = code;
3174           PCRE2_SIZE charcount = local_offsets[1] - local_offsets[0];
3175           int next_state_offset, repeat_state_offset;
3176 
3177           do { end_subpattern += GET(end_subpattern, 1); }
3178             while (*end_subpattern == OP_ALT);
3179           next_state_offset =
3180             (int)(end_subpattern - start_code + LINK_SIZE + 1);
3181 
3182           /* If the end of this subpattern is KETRMAX or KETRMIN, we must
3183           arrange for the repeat state also to be added to the relevant list.
3184           Calculate the offset, or set -1 for no repeat. */
3185 
3186           repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
3187                                  *end_subpattern == OP_KETRMIN)?
3188             (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1;
3189 
3190           /* If we have matched an empty string, add the next state at the
3191           current character pointer. This is important so that the duplicate
3192           checking kicks in, which is what breaks infinite loops that match an
3193           empty string. */
3194 
3195           if (charcount == 0)
3196             {
3197             ADD_ACTIVE(next_state_offset, 0);
3198             }
3199 
3200           /* Optimization: if there are no more active states, and there
3201           are no new states yet set up, then skip over the subject string
3202           right here, to save looping. Otherwise, set up the new state to swing
3203           into action when the end of the matched substring is reached. */
3204 
3205           else if (i + 1 >= active_count && new_count == 0)
3206             {
3207             ptr += charcount;
3208             clen = 0;
3209             ADD_NEW(next_state_offset, 0);
3210 
3211             /* If we are adding a repeat state at the new character position,
3212             we must fudge things so that it is the only current state.
3213             Otherwise, it might be a duplicate of one we processed before, and
3214             that would cause it to be skipped. */
3215 
3216             if (repeat_state_offset >= 0)
3217               {
3218               next_active_state = active_states;
3219               active_count = 0;
3220               i = -1;
3221               ADD_ACTIVE(repeat_state_offset, 0);
3222               }
3223             }
3224           else
3225             {
3226 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
3227             if (utf)
3228               {
3229               PCRE2_SPTR p = start_subject + local_offsets[0];
3230               PCRE2_SPTR pp = start_subject + local_offsets[1];
3231               while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--;
3232               }
3233 #endif
3234             ADD_NEW_DATA(-next_state_offset, 0, (int)(charcount - 1));
3235             if (repeat_state_offset >= 0)
3236               { ADD_NEW_DATA(-repeat_state_offset, 0, (int)(charcount - 1)); }
3237             }
3238           }
3239         else if (rc != PCRE2_ERROR_NOMATCH) return rc;
3240         }
3241       break;
3242 
3243 
3244 /* ========================================================================== */
3245       /* Handle callouts */
3246 
3247       case OP_CALLOUT:
3248       case OP_CALLOUT_STR:
3249         {
3250         PCRE2_SIZE callout_length;
3251         rrc = do_callout_dfa(code, offsets, current_subject, ptr, mb, 0,
3252           &callout_length);
3253         if (rrc < 0) return rrc;   /* Abandon */
3254         if (rrc == 0)
3255           { ADD_ACTIVE(state_offset + (int)callout_length, 0); }
3256         }
3257       break;
3258 
3259 
3260 /* ========================================================================== */
3261       default:        /* Unsupported opcode */
3262       return PCRE2_ERROR_DFA_UITEM;
3263       }
3264 
3265     NEXT_ACTIVE_STATE: continue;
3266 
3267     }      /* End of loop scanning active states */
3268 
3269   /* We have finished the processing at the current subject character. If no
3270   new states have been set for the next character, we have found all the
3271   matches that we are going to find. If partial matching has been requested,
3272   check for appropriate conditions.
3273 
3274   The "forced_fail" variable counts the number of (*F) encountered for the
3275   character. If it is equal to the original active_count (saved in
3276   workspace[1]) it means that (*F) was found on every active state. In this
3277   case we don't want to give a partial match.
3278 
3279   The "could_continue" variable is true if a state could have continued but
3280   for the fact that the end of the subject was reached. */
3281 
3282   if (new_count <= 0)
3283     {
3284     if (could_continue &&                            /* Some could go on, and */
3285         forced_fail != workspace[1] &&               /* Not all forced fail & */
3286         (                                            /* either... */
3287         (mb->moptions & PCRE2_PARTIAL_HARD) != 0      /* Hard partial */
3288         ||                                           /* or... */
3289         ((mb->moptions & PCRE2_PARTIAL_SOFT) != 0 &&  /* Soft partial and */
3290          match_count < 0)                             /* no matches */
3291         ) &&                                         /* And... */
3292         (
3293         partial_newline ||                   /* Either partial NL */
3294           (                                  /* or ... */
3295           ptr >= end_subject &&              /* End of subject and */
3296             (                                  /* either */
3297             ptr > mb->start_used_ptr ||        /* Inspected non-empty string */
3298             mb->allowemptypartial              /* or pattern has lookbehind */
3299             )                                  /* or could match empty */
3300           )
3301         ))
3302       match_count = PCRE2_ERROR_PARTIAL;
3303     break;  /* Exit from loop along the subject string */
3304     }
3305 
3306   /* One or more states are active for the next character. */
3307 
3308   ptr += clen;    /* Advance to next subject character */
3309   }               /* Loop to move along the subject string */
3310 
3311 /* Control gets here from "break" a few lines above. If we have a match and
3312 PCRE2_ENDANCHORED is set, the match fails. */
3313 
3314 if (match_count >= 0 &&
3315     ((mb->moptions | mb->poptions) & PCRE2_ENDANCHORED) != 0 &&
3316     ptr < end_subject)
3317   match_count = PCRE2_ERROR_NOMATCH;
3318 
3319 return match_count;
3320 }
3321 
3322 
3323 
3324 /*************************************************
3325 *     Match a pattern using the DFA algorithm    *
3326 *************************************************/
3327 
3328 /* This function matches a compiled pattern to a subject string, using the
3329 alternate matching algorithm that finds all matches at once.
3330 
3331 Arguments:
3332   code          points to the compiled pattern
3333   subject       subject string
3334   length        length of subject string
3335   start_offset  where to start matching in the subject
3336   options       option bits
3337   match_data    points to a match data structure
3338   mcontext      points to a match context
3339   workspace     pointer to workspace
3340   wscount       size of workspace
3341 
3342 Returns:        > 0 => number of match offset pairs placed in offsets
3343                 = 0 => offsets overflowed; longest matches are present
3344                  -1 => failed to match
3345                < -1 => some kind of unexpected problem
3346 */
3347 
3348 PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
pcre2_dfa_match(const pcre2_code * code,PCRE2_SPTR subject,PCRE2_SIZE length,PCRE2_SIZE start_offset,uint32_t options,pcre2_match_data * match_data,pcre2_match_context * mcontext,int * workspace,PCRE2_SIZE wscount)3349 pcre2_dfa_match(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length,
3350   PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data,
3351   pcre2_match_context *mcontext, int *workspace, PCRE2_SIZE wscount)
3352 {
3353 int rc;
3354 int was_zero_terminated = 0;
3355 
3356 const pcre2_real_code *re = (const pcre2_real_code *)code;
3357 
3358 PCRE2_SPTR start_match;
3359 PCRE2_SPTR end_subject;
3360 PCRE2_SPTR bumpalong_limit;
3361 PCRE2_SPTR req_cu_ptr;
3362 
3363 BOOL utf, anchored, startline, firstline;
3364 BOOL has_first_cu = FALSE;
3365 BOOL has_req_cu = FALSE;
3366 
3367 #if PCRE2_CODE_UNIT_WIDTH == 8
3368 PCRE2_SPTR memchr_found_first_cu = NULL;
3369 PCRE2_SPTR memchr_found_first_cu2 = NULL;
3370 #endif
3371 
3372 PCRE2_UCHAR first_cu = 0;
3373 PCRE2_UCHAR first_cu2 = 0;
3374 PCRE2_UCHAR req_cu = 0;
3375 PCRE2_UCHAR req_cu2 = 0;
3376 
3377 const uint8_t *start_bits = NULL;
3378 
3379 /* We need to have mb pointing to a match block, because the IS_NEWLINE macro
3380 is used below, and it expects NLBLOCK to be defined as a pointer. */
3381 
3382 pcre2_callout_block cb;
3383 dfa_match_block actual_match_block;
3384 dfa_match_block *mb = &actual_match_block;
3385 
3386 /* Set up a starting block of memory for use during recursive calls to
3387 internal_dfa_match(). By putting this on the stack, it minimizes resource use
3388 in the case when it is not needed. If this is too small, more memory is
3389 obtained from the heap. At the start of each block is an anchor structure.*/
3390 
3391 int base_recursion_workspace[RWS_BASE_SIZE];
3392 RWS_anchor *rws = (RWS_anchor *)base_recursion_workspace;
3393 rws->next = NULL;
3394 rws->size = RWS_BASE_SIZE;
3395 rws->free = RWS_BASE_SIZE - RWS_ANCHOR_SIZE;
3396 
3397 /* Recognize NULL, length 0 as an empty string. */
3398 
3399 if (subject == NULL && length == 0) subject = (PCRE2_SPTR)"";
3400 
3401 /* Plausibility checks */
3402 
3403 if ((options & ~PUBLIC_DFA_MATCH_OPTIONS) != 0) return PCRE2_ERROR_BADOPTION;
3404 if (re == NULL || subject == NULL || workspace == NULL || match_data == NULL)
3405   return PCRE2_ERROR_NULL;
3406 
3407 if (length == PCRE2_ZERO_TERMINATED)
3408   {
3409   length = PRIV(strlen)(subject);
3410   was_zero_terminated = 1;
3411   }
3412 
3413 if (wscount < 20) return PCRE2_ERROR_DFA_WSSIZE;
3414 if (start_offset > length) return PCRE2_ERROR_BADOFFSET;
3415 
3416 /* Partial matching and PCRE2_ENDANCHORED are currently not allowed at the same
3417 time. */
3418 
3419 if ((options & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0 &&
3420    ((re->overall_options | options) & PCRE2_ENDANCHORED) != 0)
3421   return PCRE2_ERROR_BADOPTION;
3422 
3423 /* Invalid UTF support is not available for DFA matching. */
3424 
3425 if ((re->overall_options & PCRE2_MATCH_INVALID_UTF) != 0)
3426   return PCRE2_ERROR_DFA_UINVALID_UTF;
3427 
3428 /* Check that the first field in the block is the magic number. If it is not,
3429 return with PCRE2_ERROR_BADMAGIC. */
3430 
3431 if (re->magic_number != MAGIC_NUMBER) return PCRE2_ERROR_BADMAGIC;
3432 
3433 /* Check the code unit width. */
3434 
3435 if ((re->flags & PCRE2_MODE_MASK) != PCRE2_CODE_UNIT_WIDTH/8)
3436   return PCRE2_ERROR_BADMODE;
3437 
3438 /* PCRE2_NOTEMPTY and PCRE2_NOTEMPTY_ATSTART are match-time flags in the
3439 options variable for this function. Users of PCRE2 who are not calling the
3440 function directly would like to have a way of setting these flags, in the same
3441 way that they can set pcre2_compile() flags like PCRE2_NO_AUTOPOSSESS with
3442 constructions like (*NO_AUTOPOSSESS). To enable this, (*NOTEMPTY) and
3443 (*NOTEMPTY_ATSTART) set bits in the pattern's "flags" field which can now be
3444 transferred to the options for this function. The bits are guaranteed to be
3445 adjacent, but do not have the same values. This bit of Boolean trickery assumes
3446 that the match-time bits are not more significant than the flag bits. If by
3447 accident this is not the case, a compile-time division by zero error will
3448 occur. */
3449 
3450 #define FF (PCRE2_NOTEMPTY_SET|PCRE2_NE_ATST_SET)
3451 #define OO (PCRE2_NOTEMPTY|PCRE2_NOTEMPTY_ATSTART)
3452 options |= (re->flags & FF) / ((FF & (~FF+1)) / (OO & (~OO+1)));
3453 #undef FF
3454 #undef OO
3455 
3456 /* If restarting after a partial match, do some sanity checks on the contents
3457 of the workspace. */
3458 
3459 if ((options & PCRE2_DFA_RESTART) != 0)
3460   {
3461   if ((workspace[0] & (-2)) != 0 || workspace[1] < 1 ||
3462     workspace[1] > (int)((wscount - 2)/INTS_PER_STATEBLOCK))
3463       return PCRE2_ERROR_DFA_BADRESTART;
3464   }
3465 
3466 /* Set some local values */
3467 
3468 utf = (re->overall_options & PCRE2_UTF) != 0;
3469 start_match = subject + start_offset;
3470 end_subject = subject + length;
3471 req_cu_ptr = start_match - 1;
3472 anchored = (options & (PCRE2_ANCHORED|PCRE2_DFA_RESTART)) != 0 ||
3473   (re->overall_options & PCRE2_ANCHORED) != 0;
3474 
3475 /* The "must be at the start of a line" flags are used in a loop when finding
3476 where to start. */
3477 
3478 startline = (re->flags & PCRE2_STARTLINE) != 0;
3479 firstline = !anchored && (re->overall_options & PCRE2_FIRSTLINE) != 0;
3480 bumpalong_limit = end_subject;
3481 
3482 /* Initialize and set up the fixed fields in the callout block, with a pointer
3483 in the match block. */
3484 
3485 mb->cb = &cb;
3486 cb.version = 2;
3487 cb.subject = subject;
3488 cb.subject_length = (PCRE2_SIZE)(end_subject - subject);
3489 cb.callout_flags = 0;
3490 cb.capture_top      = 1;      /* No capture support */
3491 cb.capture_last     = 0;
3492 cb.mark             = NULL;   /* No (*MARK) support */
3493 
3494 /* Get data from the match context, if present, and fill in the remaining
3495 fields in the match block. It is an error to set an offset limit without
3496 setting the flag at compile time. */
3497 
3498 if (mcontext == NULL)
3499   {
3500   mb->callout = NULL;
3501   mb->memctl = re->memctl;
3502   mb->match_limit = PRIV(default_match_context).match_limit;
3503   mb->match_limit_depth = PRIV(default_match_context).depth_limit;
3504   mb->heap_limit = PRIV(default_match_context).heap_limit;
3505   }
3506 else
3507   {
3508   if (mcontext->offset_limit != PCRE2_UNSET)
3509     {
3510     if ((re->overall_options & PCRE2_USE_OFFSET_LIMIT) == 0)
3511       return PCRE2_ERROR_BADOFFSETLIMIT;
3512     bumpalong_limit = subject + mcontext->offset_limit;
3513     }
3514   mb->callout = mcontext->callout;
3515   mb->callout_data = mcontext->callout_data;
3516   mb->memctl = mcontext->memctl;
3517   mb->match_limit = mcontext->match_limit;
3518   mb->match_limit_depth = mcontext->depth_limit;
3519   mb->heap_limit = mcontext->heap_limit;
3520   }
3521 
3522 if (mb->match_limit > re->limit_match)
3523   mb->match_limit = re->limit_match;
3524 
3525 if (mb->match_limit_depth > re->limit_depth)
3526   mb->match_limit_depth = re->limit_depth;
3527 
3528 if (mb->heap_limit > re->limit_heap)
3529   mb->heap_limit = re->limit_heap;
3530 
3531 mb->start_code = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)) +
3532   re->name_count * re->name_entry_size;
3533 mb->tables = re->tables;
3534 mb->start_subject = subject;
3535 mb->end_subject = end_subject;
3536 mb->start_offset = start_offset;
3537 mb->allowemptypartial = (re->max_lookbehind > 0) ||
3538   (re->flags & PCRE2_MATCH_EMPTY) != 0;
3539 mb->moptions = options;
3540 mb->poptions = re->overall_options;
3541 mb->match_call_count = 0;
3542 mb->heap_used = 0;
3543 
3544 /* Process the \R and newline settings. */
3545 
3546 mb->bsr_convention = re->bsr_convention;
3547 mb->nltype = NLTYPE_FIXED;
3548 switch(re->newline_convention)
3549   {
3550   case PCRE2_NEWLINE_CR:
3551   mb->nllen = 1;
3552   mb->nl[0] = CHAR_CR;
3553   break;
3554 
3555   case PCRE2_NEWLINE_LF:
3556   mb->nllen = 1;
3557   mb->nl[0] = CHAR_NL;
3558   break;
3559 
3560   case PCRE2_NEWLINE_NUL:
3561   mb->nllen = 1;
3562   mb->nl[0] = CHAR_NUL;
3563   break;
3564 
3565   case PCRE2_NEWLINE_CRLF:
3566   mb->nllen = 2;
3567   mb->nl[0] = CHAR_CR;
3568   mb->nl[1] = CHAR_NL;
3569   break;
3570 
3571   case PCRE2_NEWLINE_ANY:
3572   mb->nltype = NLTYPE_ANY;
3573   break;
3574 
3575   case PCRE2_NEWLINE_ANYCRLF:
3576   mb->nltype = NLTYPE_ANYCRLF;
3577   break;
3578 
3579   default: return PCRE2_ERROR_INTERNAL;
3580   }
3581 
3582 /* Check a UTF string for validity if required. For 8-bit and 16-bit strings,
3583 we must also check that a starting offset does not point into the middle of a
3584 multiunit character. We check only the portion of the subject that is going to
3585 be inspected during matching - from the offset minus the maximum lookbehind
3586 to the given length. This saves time when a small part of a large subject is
3587 being matched by the use of a starting offset. Note that the maximum lookbehind
3588 is a number of characters, not code units. */
3589 
3590 #ifdef SUPPORT_UNICODE
3591 if (utf && (options & PCRE2_NO_UTF_CHECK) == 0)
3592   {
3593   PCRE2_SPTR check_subject = start_match;  /* start_match includes offset */
3594 
3595   if (start_offset > 0)
3596     {
3597 #if PCRE2_CODE_UNIT_WIDTH != 32
3598     unsigned int i;
3599     if (start_match < end_subject && NOT_FIRSTCU(*start_match))
3600       return PCRE2_ERROR_BADUTFOFFSET;
3601     for (i = re->max_lookbehind; i > 0 && check_subject > subject; i--)
3602       {
3603       check_subject--;
3604       while (check_subject > subject &&
3605 #if PCRE2_CODE_UNIT_WIDTH == 8
3606       (*check_subject & 0xc0) == 0x80)
3607 #else  /* 16-bit */
3608       (*check_subject & 0xfc00) == 0xdc00)
3609 #endif /* PCRE2_CODE_UNIT_WIDTH == 8 */
3610         check_subject--;
3611       }
3612 #else   /* In the 32-bit library, one code unit equals one character. */
3613     check_subject -= re->max_lookbehind;
3614     if (check_subject < subject) check_subject = subject;
3615 #endif  /* PCRE2_CODE_UNIT_WIDTH != 32 */
3616     }
3617 
3618   /* Validate the relevant portion of the subject. After an error, adjust the
3619   offset to be an absolute offset in the whole string. */
3620 
3621   match_data->rc = PRIV(valid_utf)(check_subject,
3622     length - (PCRE2_SIZE)(check_subject - subject), &(match_data->startchar));
3623   if (match_data->rc != 0)
3624     {
3625     match_data->startchar += (PCRE2_SIZE)(check_subject - subject);
3626     return match_data->rc;
3627     }
3628   }
3629 #endif  /* SUPPORT_UNICODE */
3630 
3631 /* Set up the first code unit to match, if available. If there's no first code
3632 unit there may be a bitmap of possible first characters. */
3633 
3634 if ((re->flags & PCRE2_FIRSTSET) != 0)
3635   {
3636   has_first_cu = TRUE;
3637   first_cu = first_cu2 = (PCRE2_UCHAR)(re->first_codeunit);
3638   if ((re->flags & PCRE2_FIRSTCASELESS) != 0)
3639     {
3640     first_cu2 = TABLE_GET(first_cu, mb->tables + fcc_offset, first_cu);
3641 #ifdef SUPPORT_UNICODE
3642 #if PCRE2_CODE_UNIT_WIDTH == 8
3643     if (first_cu > 127 && !utf && (re->overall_options & PCRE2_UCP) != 0)
3644       first_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(first_cu);
3645 #else
3646     if (first_cu > 127 && (utf || (re->overall_options & PCRE2_UCP) != 0))
3647       first_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(first_cu);
3648 #endif
3649 #endif  /* SUPPORT_UNICODE */
3650     }
3651   }
3652 else
3653   if (!startline && (re->flags & PCRE2_FIRSTMAPSET) != 0)
3654     start_bits = re->start_bitmap;
3655 
3656 /* There may be a "last known required code unit" set. */
3657 
3658 if ((re->flags & PCRE2_LASTSET) != 0)
3659   {
3660   has_req_cu = TRUE;
3661   req_cu = req_cu2 = (PCRE2_UCHAR)(re->last_codeunit);
3662   if ((re->flags & PCRE2_LASTCASELESS) != 0)
3663     {
3664     req_cu2 = TABLE_GET(req_cu, mb->tables + fcc_offset, req_cu);
3665 #ifdef SUPPORT_UNICODE
3666 #if PCRE2_CODE_UNIT_WIDTH == 8
3667     if (req_cu > 127 && !utf && (re->overall_options & PCRE2_UCP) != 0)
3668       req_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(req_cu);
3669 #else
3670     if (req_cu > 127 && (utf || (re->overall_options & PCRE2_UCP) != 0))
3671       req_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(req_cu);
3672 #endif
3673 #endif  /* SUPPORT_UNICODE */
3674     }
3675   }
3676 
3677 /* If the match data block was previously used with PCRE2_COPY_MATCHED_SUBJECT,
3678 free the memory that was obtained. */
3679 
3680 if ((match_data->flags & PCRE2_MD_COPIED_SUBJECT) != 0)
3681   {
3682   match_data->memctl.free((void *)match_data->subject,
3683     match_data->memctl.memory_data);
3684   match_data->flags &= ~PCRE2_MD_COPIED_SUBJECT;
3685   }
3686 
3687 /* Fill in fields that are always returned in the match data. */
3688 
3689 match_data->code = re;
3690 match_data->subject = NULL;  /* Default for no match */
3691 match_data->mark = NULL;
3692 match_data->matchedby = PCRE2_MATCHEDBY_DFA_INTERPRETER;
3693 
3694 /* Call the main matching function, looping for a non-anchored regex after a
3695 failed match. If not restarting, perform certain optimizations at the start of
3696 a match. */
3697 
3698 for (;;)
3699   {
3700   /* ----------------- Start of match optimizations ---------------- */
3701 
3702   /* There are some optimizations that avoid running the match if a known
3703   starting point is not found, or if a known later code unit is not present.
3704   However, there is an option (settable at compile time) that disables
3705   these, for testing and for ensuring that all callouts do actually occur.
3706   The optimizations must also be avoided when restarting a DFA match. */
3707 
3708   if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0 &&
3709       (options & PCRE2_DFA_RESTART) == 0)
3710     {
3711     /* If firstline is TRUE, the start of the match is constrained to the first
3712     line of a multiline string. That is, the match must be before or at the
3713     first newline following the start of matching. Temporarily adjust
3714     end_subject so that we stop the optimization scans for a first code unit
3715     immediately after the first character of a newline (the first code unit can
3716     legitimately be a newline). If the match fails at the newline, later code
3717     breaks this loop. */
3718 
3719     if (firstline)
3720       {
3721       PCRE2_SPTR t = start_match;
3722 #ifdef SUPPORT_UNICODE
3723       if (utf)
3724         {
3725         while (t < end_subject && !IS_NEWLINE(t))
3726           {
3727           t++;
3728           ACROSSCHAR(t < end_subject, t, t++);
3729           }
3730         }
3731       else
3732 #endif
3733       while (t < end_subject && !IS_NEWLINE(t)) t++;
3734       end_subject = t;
3735       }
3736 
3737     /* Anchored: check the first code unit if one is recorded. This may seem
3738     pointless but it can help in detecting a no match case without scanning for
3739     the required code unit. */
3740 
3741     if (anchored)
3742       {
3743       if (has_first_cu || start_bits != NULL)
3744         {
3745         BOOL ok = start_match < end_subject;
3746         if (ok)
3747           {
3748           PCRE2_UCHAR c = UCHAR21TEST(start_match);
3749           ok = has_first_cu && (c == first_cu || c == first_cu2);
3750           if (!ok && start_bits != NULL)
3751             {
3752 #if PCRE2_CODE_UNIT_WIDTH != 8
3753             if (c > 255) c = 255;
3754 #endif
3755             ok = (start_bits[c/8] & (1u << (c&7))) != 0;
3756             }
3757           }
3758         if (!ok) break;
3759         }
3760       }
3761 
3762     /* Not anchored. Advance to a unique first code unit if there is one. */
3763 
3764     else
3765       {
3766       if (has_first_cu)
3767         {
3768         if (first_cu != first_cu2)  /* Caseless */
3769           {
3770           /* In 16-bit and 32-bit modes we have to do our own search, so can
3771           look for both cases at once. */
3772 
3773 #if PCRE2_CODE_UNIT_WIDTH != 8
3774           PCRE2_UCHAR smc;
3775           while (start_match < end_subject &&
3776                 (smc = UCHAR21TEST(start_match)) != first_cu &&
3777                  smc != first_cu2)
3778             start_match++;
3779 #else
3780           /* In 8-bit mode, the use of memchr() gives a big speed up, even
3781           though we have to call it twice in order to find the earliest
3782           occurrence of the code unit in either of its cases. Caching is used
3783           to remember the positions of previously found code units. This can
3784           make a huge difference when the strings are very long and only one
3785           case is actually present. */
3786 
3787           PCRE2_SPTR pp1 = NULL;
3788           PCRE2_SPTR pp2 = NULL;
3789           PCRE2_SIZE searchlength = end_subject - start_match;
3790 
3791           /* If we haven't got a previously found position for first_cu, or if
3792           the current starting position is later, we need to do a search. If
3793           the code unit is not found, set it to the end. */
3794 
3795           if (memchr_found_first_cu == NULL ||
3796               start_match > memchr_found_first_cu)
3797             {
3798             pp1 = memchr(start_match, first_cu, searchlength);
3799             memchr_found_first_cu = (pp1 == NULL)? end_subject : pp1;
3800             }
3801 
3802           /* If the start is before a previously found position, use the
3803           previous position, or NULL if a previous search failed. */
3804 
3805           else pp1 = (memchr_found_first_cu == end_subject)? NULL :
3806             memchr_found_first_cu;
3807 
3808           /* Do the same thing for the other case. */
3809 
3810           if (memchr_found_first_cu2 == NULL ||
3811               start_match > memchr_found_first_cu2)
3812             {
3813             pp2 = memchr(start_match, first_cu2, searchlength);
3814             memchr_found_first_cu2 = (pp2 == NULL)? end_subject : pp2;
3815             }
3816 
3817           else pp2 = (memchr_found_first_cu2 == end_subject)? NULL :
3818             memchr_found_first_cu2;
3819 
3820           /* Set the start to the end of the subject if neither case was found.
3821           Otherwise, use the earlier found point. */
3822 
3823           if (pp1 == NULL)
3824             start_match = (pp2 == NULL)? end_subject : pp2;
3825           else
3826             start_match = (pp2 == NULL || pp1 < pp2)? pp1 : pp2;
3827 
3828 #endif  /* 8-bit handling */
3829           }
3830 
3831         /* The caseful case is much simpler. */
3832 
3833         else
3834           {
3835 #if PCRE2_CODE_UNIT_WIDTH != 8
3836           while (start_match < end_subject && UCHAR21TEST(start_match) !=
3837                  first_cu)
3838             start_match++;
3839 #else  /* 8-bit code units */
3840           start_match = memchr(start_match, first_cu, end_subject - start_match);
3841           if (start_match == NULL) start_match = end_subject;
3842 #endif
3843           }
3844 
3845         /* If we can't find the first code unit, having reached the true end
3846         of the subject, break the bumpalong loop, to force a match failure,
3847         except when doing partial matching, when we let the next cycle run at
3848         the end of the subject. To see why, consider the pattern /(?<=abc)def/,
3849         which partially matches "abc", even though the string does not contain
3850         the starting character "d". If we have not reached the true end of the
3851         subject (PCRE2_FIRSTLINE caused end_subject to be temporarily modified)
3852         we also let the cycle run, because the matching string is legitimately
3853         allowed to start with the first code unit of a newline. */
3854 
3855         if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0 &&
3856             start_match >= mb->end_subject)
3857           break;
3858         }
3859 
3860       /* If there's no first code unit, advance to just after a linebreak for a
3861       multiline match if required. */
3862 
3863       else if (startline)
3864         {
3865         if (start_match > mb->start_subject + start_offset)
3866           {
3867 #ifdef SUPPORT_UNICODE
3868           if (utf)
3869             {
3870             while (start_match < end_subject && !WAS_NEWLINE(start_match))
3871               {
3872               start_match++;
3873               ACROSSCHAR(start_match < end_subject, start_match, start_match++);
3874               }
3875             }
3876           else
3877 #endif
3878           while (start_match < end_subject && !WAS_NEWLINE(start_match))
3879             start_match++;
3880 
3881           /* If we have just passed a CR and the newline option is ANY or
3882           ANYCRLF, and we are now at a LF, advance the match position by one
3883           more code unit. */
3884 
3885           if (start_match[-1] == CHAR_CR &&
3886                (mb->nltype == NLTYPE_ANY || mb->nltype == NLTYPE_ANYCRLF) &&
3887                start_match < end_subject &&
3888                UCHAR21TEST(start_match) == CHAR_NL)
3889             start_match++;
3890           }
3891         }
3892 
3893       /* If there's no first code unit or a requirement for a multiline line
3894       start, advance to a non-unique first code unit if any have been
3895       identified. The bitmap contains only 256 bits. When code units are 16 or
3896       32 bits wide, all code units greater than 254 set the 255 bit. */
3897 
3898       else if (start_bits != NULL)
3899         {
3900         while (start_match < end_subject)
3901           {
3902           uint32_t c = UCHAR21TEST(start_match);
3903 #if PCRE2_CODE_UNIT_WIDTH != 8
3904           if (c > 255) c = 255;
3905 #endif
3906           if ((start_bits[c/8] & (1u << (c&7))) != 0) break;
3907           start_match++;
3908           }
3909 
3910         /* See comment above in first_cu checking about the next line. */
3911 
3912         if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0 &&
3913             start_match >= mb->end_subject)
3914           break;
3915         }
3916       }  /* End of first code unit handling */
3917 
3918     /* Restore fudged end_subject */
3919 
3920     end_subject = mb->end_subject;
3921 
3922     /* The following two optimizations are disabled for partial matching. */
3923 
3924     if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0)
3925       {
3926       PCRE2_SPTR p;
3927 
3928       /* The minimum matching length is a lower bound; no actual string of that
3929       length may actually match the pattern. Although the value is, strictly,
3930       in characters, we treat it as code units to avoid spending too much time
3931       in this optimization. */
3932 
3933       if (end_subject - start_match < re->minlength) goto NOMATCH_EXIT;
3934 
3935       /* If req_cu is set, we know that that code unit must appear in the
3936       subject for the match to succeed. If the first code unit is set, req_cu
3937       must be later in the subject; otherwise the test starts at the match
3938       point. This optimization can save a huge amount of backtracking in
3939       patterns with nested unlimited repeats that aren't going to match.
3940       Writing separate code for cased/caseless versions makes it go faster, as
3941       does using an autoincrement and backing off on a match. As in the case of
3942       the first code unit, using memchr() in the 8-bit library gives a big
3943       speed up. Unlike the first_cu check above, we do not need to call
3944       memchr() twice in the caseless case because we only need to check for the
3945       presence of the character in either case, not find the first occurrence.
3946 
3947       The search can be skipped if the code unit was found later than the
3948       current starting point in a previous iteration of the bumpalong loop.
3949 
3950       HOWEVER: when the subject string is very, very long, searching to its end
3951       can take a long time, and give bad performance on quite ordinary
3952       patterns. This showed up when somebody was matching something like
3953       /^\d+C/ on a 32-megabyte string... so we don't do this when the string is
3954       sufficiently long, but it's worth searching a lot more for unanchored
3955       patterns. */
3956 
3957       p = start_match + (has_first_cu? 1:0);
3958       if (has_req_cu && p > req_cu_ptr)
3959         {
3960         PCRE2_SIZE check_length = end_subject - start_match;
3961 
3962         if (check_length < REQ_CU_MAX ||
3963               (!anchored && check_length < REQ_CU_MAX * 1000))
3964           {
3965           if (req_cu != req_cu2)  /* Caseless */
3966             {
3967 #if PCRE2_CODE_UNIT_WIDTH != 8
3968             while (p < end_subject)
3969               {
3970               uint32_t pp = UCHAR21INCTEST(p);
3971               if (pp == req_cu || pp == req_cu2) { p--; break; }
3972               }
3973 #else  /* 8-bit code units */
3974             PCRE2_SPTR pp = p;
3975             p = memchr(pp, req_cu, end_subject - pp);
3976             if (p == NULL)
3977               {
3978               p = memchr(pp, req_cu2, end_subject - pp);
3979               if (p == NULL) p = end_subject;
3980               }
3981 #endif /* PCRE2_CODE_UNIT_WIDTH != 8 */
3982             }
3983 
3984           /* The caseful case */
3985 
3986           else
3987             {
3988 #if PCRE2_CODE_UNIT_WIDTH != 8
3989             while (p < end_subject)
3990               {
3991               if (UCHAR21INCTEST(p) == req_cu) { p--; break; }
3992               }
3993 
3994 #else  /* 8-bit code units */
3995             p = memchr(p, req_cu, end_subject - p);
3996             if (p == NULL) p = end_subject;
3997 #endif
3998             }
3999 
4000           /* If we can't find the required code unit, break the matching loop,
4001           forcing a match failure. */
4002 
4003           if (p >= end_subject) break;
4004 
4005           /* If we have found the required code unit, save the point where we
4006           found it, so that we don't search again next time round the loop if
4007           the start hasn't passed this code unit yet. */
4008 
4009           req_cu_ptr = p;
4010           }
4011         }
4012       }
4013     }
4014 
4015   /* ------------ End of start of match optimizations ------------ */
4016 
4017   /* Give no match if we have passed the bumpalong limit. */
4018 
4019   if (start_match > bumpalong_limit) break;
4020 
4021   /* OK, now we can do the business */
4022 
4023   mb->start_used_ptr = start_match;
4024   mb->last_used_ptr = start_match;
4025   mb->recursive = NULL;
4026 
4027   rc = internal_dfa_match(
4028     mb,                           /* fixed match data */
4029     mb->start_code,               /* this subexpression's code */
4030     start_match,                  /* where we currently are */
4031     start_offset,                 /* start offset in subject */
4032     match_data->ovector,          /* offset vector */
4033     (uint32_t)match_data->oveccount * 2,  /* actual size of same */
4034     workspace,                    /* workspace vector */
4035     (int)wscount,                 /* size of same */
4036     0,                            /* function recurse level */
4037     base_recursion_workspace);    /* initial workspace for recursion */
4038 
4039   /* Anything other than "no match" means we are done, always; otherwise, carry
4040   on only if not anchored. */
4041 
4042   if (rc != PCRE2_ERROR_NOMATCH || anchored)
4043     {
4044     if (rc == PCRE2_ERROR_PARTIAL && match_data->oveccount > 0)
4045       {
4046       match_data->ovector[0] = (PCRE2_SIZE)(start_match - subject);
4047       match_data->ovector[1] = (PCRE2_SIZE)(end_subject - subject);
4048       }
4049     match_data->subject_length = length;
4050     match_data->leftchar = (PCRE2_SIZE)(mb->start_used_ptr - subject);
4051     match_data->rightchar = (PCRE2_SIZE)(mb->last_used_ptr - subject);
4052     match_data->startchar = (PCRE2_SIZE)(start_match - subject);
4053     match_data->rc = rc;
4054 
4055     if (rc >= 0 &&(options & PCRE2_COPY_MATCHED_SUBJECT) != 0)
4056       {
4057       length = CU2BYTES(length + was_zero_terminated);
4058       match_data->subject = match_data->memctl.malloc(length,
4059         match_data->memctl.memory_data);
4060       if (match_data->subject == NULL) return PCRE2_ERROR_NOMEMORY;
4061       memcpy((void *)match_data->subject, subject, length);
4062       match_data->flags |= PCRE2_MD_COPIED_SUBJECT;
4063       }
4064     else
4065       {
4066       if (rc >= 0 || rc == PCRE2_ERROR_PARTIAL) match_data->subject = subject;
4067       }
4068     goto EXIT;
4069     }
4070 
4071   /* Advance to the next subject character unless we are at the end of a line
4072   and firstline is set. */
4073 
4074   if (firstline && IS_NEWLINE(start_match)) break;
4075   start_match++;
4076 #ifdef SUPPORT_UNICODE
4077   if (utf)
4078     {
4079     ACROSSCHAR(start_match < end_subject, start_match, start_match++);
4080     }
4081 #endif
4082   if (start_match > end_subject) break;
4083 
4084   /* If we have just passed a CR and we are now at a LF, and the pattern does
4085   not contain any explicit matches for \r or \n, and the newline option is CRLF
4086   or ANY or ANYCRLF, advance the match position by one more character. */
4087 
4088   if (UCHAR21TEST(start_match - 1) == CHAR_CR &&
4089       start_match < end_subject &&
4090       UCHAR21TEST(start_match) == CHAR_NL &&
4091       (re->flags & PCRE2_HASCRORLF) == 0 &&
4092         (mb->nltype == NLTYPE_ANY ||
4093          mb->nltype == NLTYPE_ANYCRLF ||
4094          mb->nllen == 2))
4095     start_match++;
4096 
4097   }   /* "Bumpalong" loop */
4098 
4099 NOMATCH_EXIT:
4100 rc = PCRE2_ERROR_NOMATCH;
4101 
4102 EXIT:
4103 while (rws->next != NULL)
4104   {
4105   RWS_anchor *next = rws->next;
4106   rws->next = next->next;
4107   mb->memctl.free(next, mb->memctl.memory_data);
4108   }
4109 
4110 return rc;
4111 }
4112 
/* These #undefs are here to enable unity builds with CMake. In a unity build
several source files are compiled together as a single translation unit, so a
macro that is still defined when this file ends stays defined in whichever
source file is compiled after it. Removing the three names below at the end of
this file prevents them from carrying over.

NOTE(review): the matching #defines are expected to be earlier in this file;
confirm that before moving or deleting any of these lines. */

#undef NLBLOCK /* Block containing newline information */
#undef PSSTART /* Field containing processed string start */
#undef PSEND   /* Field containing processed string end */
4118 
4119 /* End of pcre2_dfa_match.c */
4120