• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*************************************************
2 *      Perl-Compatible Regular Expressions       *
3 *************************************************/
4 
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7 
8                        Written by Philip Hazel
9      Original API code Copyright (c) 1997-2012 University of Cambridge
10          New API code Copyright (c) 2016 University of Cambridge
11 
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
15 
16     * Redistributions of source code must retain the above copyright notice,
17       this list of conditions and the following disclaimer.
18 
19     * Redistributions in binary form must reproduce the above copyright
20       notice, this list of conditions and the following disclaimer in the
21       documentation and/or other materials provided with the distribution.
22 
23     * Neither the name of the University of Cambridge nor the names of its
24       contributors may be used to endorse or promote products derived from
25       this software without specific prior written permission.
26 
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
39 */
40 
41 
42 #ifdef HAVE_CONFIG_H
43 #include "config.h"
44 #endif
45 
46 #define NLBLOCK cb             /* Block containing newline information */
47 #define PSSTART start_pattern  /* Field containing processed string start */
48 #define PSEND   end_pattern    /* Field containing processed string end */
49 
50 #include "pcre2_internal.h"
51 
52 /* In rare error cases debugging might require calling pcre2_printint(). */
53 
54 #if 0
55 #ifdef EBCDIC
56 #define PRINTABLE(c) ((c) >= 64 && (c) < 255)
57 #else
58 #define PRINTABLE(c) ((c) >= 32 && (c) < 127)
59 #endif
60 #include "pcre2_printint.c"
61 #define CALL_PRINTINT
62 #endif
63 
64 /* There are a few things that vary with different code unit sizes. Handle them
65 by defining macros in order to minimize #if usage. */
66 
/* STRING_UTFn_RIGHTPAR expands to TWO comma-separated values: the
width-specific STRING_UTFxx_RIGHTPAR string followed by its length, so it fills
two consecutive initializer slots wherever it is used.

XDIGIT(c) looks c up in xdigitab[]. In the 16-bit and 32-bit forms the lookup
is guarded by MAX_255(c) (defined elsewhere; presumably true when c <= 255),
and 0xff - the table's "not a hex digit" value - is produced otherwise. That
form expands c more than once, so do not pass an argument with side effects. */
67 #if PCRE2_CODE_UNIT_WIDTH == 8
68 #define STRING_UTFn_RIGHTPAR     STRING_UTF8_RIGHTPAR, 5
69 #define XDIGIT(c)                xdigitab[c]
70 
71 #else  /* Either 16-bit or 32-bit */
72 #define XDIGIT(c)                (MAX_255(c)? xdigitab[c] : 0xff)
73 
74 #if PCRE2_CODE_UNIT_WIDTH == 16
75 #define STRING_UTFn_RIGHTPAR     STRING_UTF16_RIGHTPAR, 6
76 
77 #else  /* 32-bit */
78 #define STRING_UTFn_RIGHTPAR     STRING_UTF32_RIGHTPAR, 6
79 #endif  /* PCRE2_CODE_UNIT_WIDTH == 16 */
80 #endif  /* PCRE2_CODE_UNIT_WIDTH == 8 */
81 
82 /* Forward declarations (prototypes) to allow mutual recursion */
83 
/* Prototype only: add_list_to_class() is file-local (static) and returns an
unsigned int. Parameter names are omitted here, so the role of each argument
must be read from the function's definition, which is not in this part of the
file. */
84 static unsigned int
85   add_list_to_class(uint8_t *, PCRE2_UCHAR **, uint32_t, compile_block *,
86     const uint32_t *, unsigned int);
87 
/* Prototype only: compile_regex() is file-local (static) and returns a BOOL.
Parameter names are omitted here, so the role of each of the fifteen arguments
must be read from the function's definition, which is not in this part of the
file. */
88 static BOOL
89   compile_regex(uint32_t, PCRE2_UCHAR **, PCRE2_SPTR *, int *, BOOL, BOOL,
90     uint32_t, int, uint32_t *, int32_t *, uint32_t *, int32_t *,
91     branch_chain *, compile_block *, size_t *);
92 
93 
94 
95 /*************************************************
96 *      Code parameters and static tables         *
97 *************************************************/
98 
99 /* This value specifies the size of stack workspace, which is used in different
100 ways in the different pattern scans. The group-identifying pre-scan uses it to
101 handle nesting, and needs it to be 16-bit aligned.
102 
103 During the first compiling phase, when determining how much memory is required,
104 the regex is partly compiled into this space, but the compiled parts are
105 discarded as soon as they can be, so that hopefully there will never be an
106 overrun. The code does, however, check for an overrun, which can occur for
107 pathological patterns. The size of the workspace depends on LINK_SIZE because
108 the length of compiled items varies with this.
109 
110 In the real compile phase, the workspace is used for remembering data about
111 numbered groups, provided there are not too many of them (if there are, extra
112 memory is acquired). For this phase the memory must be 32-bit aligned. Having
113 defined the size in code units, we set up C32_WORK_SIZE as the number of
114 elements in the 32-bit vector. */
115 
/* C32_WORK_SIZE converts the workspace size from code units to bytes (by
multiplying by sizeof(PCRE2_UCHAR)) and then to a count of uint32_t elements;
the integer division rounds down. */
116 #define COMPILE_WORK_SIZE (2048*LINK_SIZE)   /* Size in code units */
117 
118 #define C32_WORK_SIZE \
119   ((COMPILE_WORK_SIZE * sizeof(PCRE2_UCHAR))/sizeof(uint32_t))
120 
121 /* The overrun tests check for a slightly smaller size so that they detect the
122 overrun before it actually does run off the end of the data block. */
123 
124 #define WORK_SIZE_SAFETY_MARGIN (100)
125 
126 /* This value determines the size of the initial vector that is used for
127 remembering named groups during the pre-compile. It is allocated on the stack,
128 but if it is too small, it is expanded, in a similar way to the workspace. The
129 value is the number of slots in the list. */
130 
131 #define NAMED_GROUP_LIST_SIZE  20
132 
133 /* The original PCRE required patterns to be zero-terminated, and it simplifies
134 the compiling code if it is guaranteed that there is a zero code unit at the
135 end of the pattern, because this means that tests for coding sequences such as
136 (*SKIP) or even just (?<= can check a sequence of code units without having to
137 keep checking for the end of the pattern. The new PCRE2 API allows zero code
138 units within patterns if a positive length is given, but in order to keep most
139 of the compiling code as it was, we copy such patterns and add a zero on the
140 end. This value determines the size of space on the stack that is used if the
141 pattern fits; if not, heap memory is used. */
142 
143 #define COPIED_PATTERN_SIZE 1024
144 
145 /* Maximum length value to check against when making sure that the variable
146 that holds the compiled pattern length does not overflow. We make it a bit less
147 than INT_MAX to allow for adding in group terminating bytes, so that we don't
148 have to check them every time. */
149 
150 #define OFLOW_MAX (INT_MAX - 20)
151 
152 /* Macro for setting individual bits in class bitmaps. It took some
153 experimenting to figure out how to stop gcc 5.3.0 from warning with
154 -Wconversion. This version gets a warning:
155 
156   #define SETBIT(a,b) a[(b)/8] |= (uint8_t)(1 << ((b)&7))
157 
158 Let's hope the apparently less efficient version isn't actually so bad if the
159 compiler is clever with identical subexpressions. */
160 
/* SETBIT(a,b) sets bit number b in the byte-array bitmap a: the byte is
a[b/8] and the bit within it is b&7. The expansion mentions a twice and b three
times, and a is not parenthesized, so pass a plain array or pointer name for a
and a side-effect-free expression for b. The expansion is a bare assignment
expression with no enclosing parentheses. */
161 #define SETBIT(a,b) a[(b)/8] = (uint8_t)(a[(b)/8] | (1 << ((b)&7)))
162 
163 /* Private flags added to firstcu and reqcu. */
164 
165 #define REQ_CASELESS    (1 << 0)        /* Indicates caselessness */
166 #define REQ_VARY        (1 << 1)        /* reqcu followed non-literal item */
167 /* Negative values for the firstcu and reqcu flags */
168 #define REQ_UNSET       (-2)            /* Not yet found anything */
169 #define REQ_NONE        (-1)            /* Found not fixed char */
170 
171 /* These flags are used in the groupinfo vector. */
172 
173 #define GI_SET_COULD_BE_EMPTY  0x80000000u
174 #define GI_COULD_BE_EMPTY      0x40000000u
175 #define GI_NOT_FIXED_LENGTH    0x20000000u
176 #define GI_SET_FIXED_LENGTH    0x10000000u
177 #define GI_FIXED_LENGTH_MASK   0x0000ffffu
178 
179 /* This bit (which is greater than any UTF value) is used to indicate that a
180 variable contains a number of code units instead of an actual code point. */
181 
/* NOTE: the last character of the constant below is a lowercase letter 'l'
(the "long" suffix), not the digit 1. The value is 0x10000000, of type long. */
182 #define UTF_LENGTH     0x10000000l
183 
184 /* This simple test for a decimal digit works for both ASCII/Unicode and EBCDIC
185 and is fast (a good compiler can turn it into a subtraction and unsigned
186 comparison). */
187 
/* IS_DIGIT(x) yields 1 when x lies in the range CHAR_0..CHAR_9 inclusive and 0
otherwise. x is expanded twice, so the argument must be free of side effects. */
188 #define IS_DIGIT(x) ((x) >= CHAR_0 && (x) <= CHAR_9)
189 
190 /* Table to identify hex digits. The tables in chartables are dependent on the
191 locale, and may mark arbitrary characters as digits. We want to recognize only
192 0-9, a-f, and A-F as hex digits, which is why we have a private table here. It
193 costs 256 bytes, but it is a lot faster than doing character value tests (at
194 least in some simple cases I timed), and in some applications one wants PCRE to
195 compile efficiently as well as match efficiently. The value in the table is
196 the binary hex digit value, or 0xff for non-hex digits. */
197 
198 /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
199 UTF-8 mode. */
200 
201 #ifndef EBCDIC
/* 256 entries (32 rows of 8), indexed by character value. Entries for '0'-'9',
'A'-'F' and 'a'-'f' hold the digit's value (0x00-0x0f); every other entry is
0xff. The comment on each row gives the range of character values it covers. */
202 static const uint8_t xdigitab[] =
203   {
204   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*   0-  7 */
205   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*   8- 15 */
206   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  16- 23 */
207   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  24- 31 */
208   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*    - '  */
209   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  ( - /  */
210   0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, /*  0 - 7  */
211   0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff, /*  8 - ?  */
212   0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /*  @ - G  */
213   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  H - O  */
214   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  P - W  */
215   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  X - _  */
216   0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /*  ` - g  */
217   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  h - o  */
218   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  p - w  */
219   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  x -127 */
220   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 128-135 */
221   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 136-143 */
222   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 144-151 */
223   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 152-159 */
224   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 160-167 */
225   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 168-175 */
226   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 176-183 */
227   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 184-191 */
228   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 192-199 */
229   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 200-207 */
230   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 208-215 */
231   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 216-223 */
232   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 224-231 */
233   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 232-239 */
234   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 240-247 */
235   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff};/* 248-255 */
236 
237 #else
238 
239 /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
240 
241 static const uint8_t xdigitab[] =
242   {
243   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*   0-  7  0 */
244   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*   8- 15    */
245   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  16- 23 10 */
246   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  24- 31    */
247   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  32- 39 20 */
248   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  40- 47    */
249   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  48- 55 30 */
250   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  56- 63    */
251   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*    - 71 40 */
252   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  72- |     */
253   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  & - 87 50 */
254   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  88- 95    */
255   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  - -103 60 */
256   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 104- ?     */
257   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 112-119 70 */
258   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 120- "     */
259   0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /* 128- g  80 */
260   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  h -143    */
261   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 144- p  90 */
262   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  q -159    */
263   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 160- x  A0 */
264   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  y -175    */
265   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  ^ -183 B0 */
266   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 184-191    */
267   0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /*  { - G  C0 */
268   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  H -207    */
269   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  } - P  D0 */
270   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  Q -223    */
271   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  \ - X  E0 */
272   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  Y -239    */
273   0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, /*  0 - 7  F0 */
274   0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff};/*  8 -255    */
275 #endif  /* EBCDIC */
276 
277 
278 /* Table for handling alphanumeric escaped characters. Positive returns are
279 simple data values; negative values are for special things like \d and so on.
280 Zero means further processing is needed (for things like \x), or the escape is
281 invalid. */
282 
283 /* This is the "normal" table for ASCII systems or for EBCDIC systems running
284 in UTF-8 mode. It runs from '0' to 'z'. */
285 
286 #ifndef EBCDIC
/* ESCAPES_FIRST and ESCAPES_LAST are the first and last characters covered by
the escapes[] table. UPPER_CASE(c) subtracts 32 from c. Its argument is not
parenthesized in the expansion, so pass only a simple operand (such as a
variable); an argument containing an operator of lower precedence than '-'
would be grouped incorrectly. */
287 #define ESCAPES_FIRST       CHAR_0
288 #define ESCAPES_LAST        CHAR_z
289 #define UPPER_CASE(c)       (c-32)
290 
/* One entry per character from ESCAPES_FIRST ('0') to ESCAPES_LAST ('z') in
ASCII order, 75 entries in all. The comment on each row names the two
characters whose entries appear on that row. */
291 static const short int escapes[] = {
292      0,                       0,                          /* 0 1 */
293      0,                       0,                          /* 2 3 */
294      0,                       0,                          /* 4 5 */
295      0,                       0,                          /* 6 7 */
296      0,                       0,                          /* 8 9 */
297      CHAR_COLON,              CHAR_SEMICOLON,             /* : ; */
298      CHAR_LESS_THAN_SIGN,     CHAR_EQUALS_SIGN,           /* < = */
299      CHAR_GREATER_THAN_SIGN,  CHAR_QUESTION_MARK,         /* > ? */
300      CHAR_COMMERCIAL_AT,      -ESC_A,                     /* @ A */
301      -ESC_B,                  -ESC_C,                     /* B C */
302      -ESC_D,                  -ESC_E,                     /* D E */
303      0,                       -ESC_G,                     /* F G */
304      -ESC_H,                  0,                          /* H I */
305      0,                       -ESC_K,                     /* J K */
306      0,                       0,                          /* L M */
307      -ESC_N,                  0,                          /* N O */
308      -ESC_P,                  -ESC_Q,                     /* P Q */
309      -ESC_R,                  -ESC_S,                     /* R S */
310      0,                       0,                          /* T U */
311      -ESC_V,                  -ESC_W,                     /* V W */
312      -ESC_X,                  0,                          /* X Y */
313      -ESC_Z,                  CHAR_LEFT_SQUARE_BRACKET,   /* Z [ */
314      CHAR_BACKSLASH,          CHAR_RIGHT_SQUARE_BRACKET,  /* \ ] */
315      CHAR_CIRCUMFLEX_ACCENT,  CHAR_UNDERSCORE,            /* ^ _ */
316      CHAR_GRAVE_ACCENT,       ESC_a,                      /* ` a */
317      -ESC_b,                  0,                          /* b c */
318      -ESC_d,                  ESC_e,                      /* d e */
319      ESC_f,                   0,                          /* f g */
320      -ESC_h,                  0,                          /* h i */
321      0,                       -ESC_k,                     /* j k */
322      0,                       0,                          /* l m */
323      ESC_n,                   0,                          /* n o */
324      -ESC_p,                  0,                          /* p q */
325      ESC_r,                   -ESC_s,                     /* r s */
326      ESC_tee,                 0,                          /* t u */
327      -ESC_v,                  -ESC_w,                     /* v w */
328      0,                       0,                          /* x y */
329      -ESC_z                                               /* z */
330 };
331 
332 #else
333 
334 /* This is the "abnormal" table for EBCDIC systems without UTF-8 support.
335 It runs from 'a' to '9'. For some minimal testing of EBCDIC features, the code
336 is sometimes compiled on an ASCII system. In this case, we must not use CHAR_a
337 because it is defined as 'a', which of course picks up the ASCII value. */
338 
339 #if 'a' == 0x81                    /* Check for a real EBCDIC environment */
340 #define ESCAPES_FIRST       CHAR_a
341 #define ESCAPES_LAST        CHAR_9
342 #define UPPER_CASE(c)       (c+64)
343 #else                              /* Testing in an ASCII environment */
344 #define ESCAPES_FIRST  ((unsigned char)'\x81')   /* EBCDIC 'a' */
345 #define ESCAPES_LAST   ((unsigned char)'\xf9')   /* EBCDIC '9' */
346 #define UPPER_CASE(c)  (c-32)
347 #endif
348 
349 static const short int escapes[] = {
350 /*  80 */        ESC_a, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,
351 /*  88 */-ESC_h,     0,      0,     '{',      0,     0,      0,      0,
352 /*  90 */     0,     0, -ESC_k,       0,      0, ESC_n,      0, -ESC_p,
353 /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,
354 /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,-ESC_v, -ESC_w,      0,
355 /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,
356 /*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,
357 /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',
358 /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
359 /*  C8 */-ESC_H,     0,      0,       0,      0,     0,      0,      0,
360 /*  D0 */   '}',     0, -ESC_K,       0,      0,-ESC_N,      0, -ESC_P,
361 /*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,
362 /*  E0 */  '\\',     0, -ESC_S,       0,      0,-ESC_V, -ESC_W, -ESC_X,
363 /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
364 /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,
365 /*  F8 */     0,     0
366 };
367 
368 /* We also need a table of characters that may follow \c in an EBCDIC
369 environment for characters 0-31. */
370 
371 static unsigned char ebcdic_escape_c[] = "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_";
372 
373 #endif   /* EBCDIC */
374 
375 
376 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
377 searched linearly. Put all the names into a single string, in order to reduce
378 the number of relocations when a shared library is dynamically linked. The
379 string is built from string macros so that it works in UTF-8 mode on EBCDIC
380 platforms. */
381 
382 typedef struct verbitem {
383   int   len;                 /* Length of verb name */
384   int   op;                  /* Op when no arg, or -1 if arg mandatory */
385   int   op_arg;              /* Op when arg present, or -1 if not allowed */
386 } verbitem;
387 
388 static const char verbnames[] =
389   "\0"                       /* Empty name is a shorthand for MARK */
390   STRING_MARK0
391   STRING_ACCEPT0
392   STRING_COMMIT0
393   STRING_F0
394   STRING_FAIL0
395   STRING_PRUNE0
396   STRING_SKIP0
397   STRING_THEN;
398 
/* One row per name in verbnames[], in the same order. Each row is
{ name length, op when no argument (-1 if an argument is mandatory),
op when an argument is present (-1 if not allowed) }. */
399 static const verbitem verbs[] = {
400   { 0, -1,        OP_MARK },           /* empty name (shorthand for MARK) */
401   { 4, -1,        OP_MARK },           /* MARK */
402   { 6, OP_ACCEPT, -1 },                /* ACCEPT */
403   { 6, OP_COMMIT, -1 },                /* COMMIT */
404   { 1, OP_FAIL,   -1 },                /* F */
405   { 4, OP_FAIL,   -1 },                /* FAIL */
406   { 5, OP_PRUNE,  OP_PRUNE_ARG },      /* PRUNE */
407   { 4, OP_SKIP,   OP_SKIP_ARG  },      /* SKIP */
408   { 4, OP_THEN,   OP_THEN_ARG  }       /* THEN */
409 };
410 
411 static const int verbcount = sizeof(verbs)/sizeof(verbitem);
412 
413 
414 /* Substitutes for [[:<:]] and [[:>:]], which mean start and end of word in
415 another regex library. */
416 
/* Zero-terminated pattern text \b(?=\w) : a word boundary that is followed by
a word character. */
417 static const PCRE2_UCHAR sub_start_of_word[] = {
418   CHAR_BACKSLASH, CHAR_b, CHAR_LEFT_PARENTHESIS, CHAR_QUESTION_MARK,
419   CHAR_EQUALS_SIGN, CHAR_BACKSLASH, CHAR_w, CHAR_RIGHT_PARENTHESIS, '\0' };
420 
/* Zero-terminated pattern text \b(?<=\w) : a word boundary that is preceded by
a word character. */
421 static const PCRE2_UCHAR sub_end_of_word[] = {
422   CHAR_BACKSLASH, CHAR_b, CHAR_LEFT_PARENTHESIS, CHAR_QUESTION_MARK,
423   CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN, CHAR_BACKSLASH, CHAR_w,
424   CHAR_RIGHT_PARENTHESIS, '\0' };
425 
426 
427 /* Tables of names of POSIX character classes and their lengths. The names are
428 now all in a single string, to reduce the number of relocations when a shared
429 library is dynamically loaded. The list of lengths is terminated by a zero
430 length entry. The first three must be alpha, lower, upper, as this is assumed
431 for handling case independence. The indices for graph, print, and punct are
432 needed, so identify them. */
433 
434 static const char posix_names[] =
435   STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
436   STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
437   STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
438   STRING_word0  STRING_xdigit;
439 
/* Lengths of the names in posix_names[], in the same order: the twelve names
alpha through space are 5 characters each, word is 4, xdigit is 6. The final 0
terminates the list. */
440 static const uint8_t posix_name_lengths[] = {
441   5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
442 
/* Zero-based positions of graph, print and punct within posix_names[]. These
must be kept in step with the order of the names in that string. */
443 #define PC_GRAPH  8
444 #define PC_PRINT  9
445 #define PC_PUNCT 10
446 
447 
448 /* Table of class bit maps for each POSIX class. Each class is formed from a
449 base map, with an optional addition or removal of another map. Then, for some
450 classes, there is some additional tweaking: for [:blank:] the vertical space
451 characters are removed, and for [:alpha:] and [:alnum:] the underscore
452 character is removed. The triples in the table consist of the base map offset,
453 second map offset or -1 if no second map, and a non-negative value for map
454 addition or a negative value for map subtraction (if there are two maps). The
455 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
456 remove vertical space characters, 2 => remove underscore. */
457 
/* Three ints per POSIX class, in the same order as posix_names[]:
{ base map offset, second map offset or -1, tweak/sign value }. */
458 static const int posix_class_maps[] = {
459   cbit_word,  cbit_digit, -2,             /* alpha - word minus digit, without underscore */
460   cbit_lower, -1,          0,             /* lower */
461   cbit_upper, -1,          0,             /* upper */
462   cbit_word,  -1,          2,             /* alnum - word without underscore */
463   cbit_print, cbit_cntrl,  0,             /* ascii - print plus cntrl */
464   cbit_space, -1,          1,             /* blank - a GNU extension; space without vertical space */
465   cbit_cntrl, -1,          0,             /* cntrl */
466   cbit_digit, -1,          0,             /* digit */
467   cbit_graph, -1,          0,             /* graph */
468   cbit_print, -1,          0,             /* print */
469   cbit_punct, -1,          0,             /* punct */
470   cbit_space, -1,          0,             /* space */
471   cbit_word,  -1,          0,             /* word - a Perl extension */
472   cbit_xdigit,-1,          0              /* xdigit */
473 };
474 
475 /* Table of substitutes for \d etc when PCRE2_UCP is set. They are replaced by
476 Unicode property escapes. */
477 
478 #ifdef SUPPORT_UNICODE
/* Each array below is the zero-terminated text of one Unicode property escape;
the comment on its first line shows the text it spells. */
479 static const PCRE2_UCHAR string_PNd[]  = {        /* \P{Nd} */
480   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
481   CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
482 static const PCRE2_UCHAR string_pNd[]  = {        /* \p{Nd} */
483   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
484   CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
485 static const PCRE2_UCHAR string_PXsp[] = {        /* \P{Xsp} */
486   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
487   CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
488 static const PCRE2_UCHAR string_pXsp[] = {        /* \p{Xsp} */
489   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
490   CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
491 static const PCRE2_UCHAR string_PXwd[] = {        /* \P{Xwd} */
492   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
493   CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
494 static const PCRE2_UCHAR string_pXwd[] = {        /* \p{Xwd} */
495   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
496   CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
497 
498 static PCRE2_SPTR substitutes[] = {
499   string_PNd,           /* \D */
500   string_pNd,           /* \d */
501   string_PXsp,          /* \S */   /* Xsp is Perl space, but from 8.34, Perl */
502   string_pXsp,          /* \s */   /* space and POSIX space are the same. */
503   string_PXwd,          /* \W */
504   string_pXwd           /* \w */
505 };
506 
507 /* The POSIX class substitutes must be in the order of the POSIX class names,
508 defined above, and there are both positive and negative cases. NULL means no
509 general substitute of a Unicode property escape (\p or \P). However, for some
510 POSIX classes (e.g. graph, print, punct) a special property code is compiled
511 directly. */
512 
/* Each array below is the zero-terminated text of one escape sequence; the
comment on its first line shows the text it spells. The lower-case-p forms are
followed by their negated (upper-case-P) counterparts. */
513 static const PCRE2_UCHAR string_pCc[] =  {        /* \p{Cc} */
514   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
515   CHAR_C, CHAR_c, CHAR_RIGHT_CURLY_BRACKET, '\0' };
516 static const PCRE2_UCHAR string_pL[] =   {        /* \p{L} */
517   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
518   CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
519 static const PCRE2_UCHAR string_pLl[] =  {        /* \p{Ll} */
520   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
521   CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
522 static const PCRE2_UCHAR string_pLu[] =  {        /* \p{Lu} */
523   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
524   CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
525 static const PCRE2_UCHAR string_pXan[] = {        /* \p{Xan} */
526   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
527   CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
528 static const PCRE2_UCHAR string_h[] =    {        /* \h */
529   CHAR_BACKSLASH, CHAR_h, '\0' };
530 static const PCRE2_UCHAR string_pXps[] = {        /* \p{Xps} */
531   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
532   CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
533 static const PCRE2_UCHAR string_PCc[] =  {        /* \P{Cc} */
534   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
535   CHAR_C, CHAR_c, CHAR_RIGHT_CURLY_BRACKET, '\0' };
536 static const PCRE2_UCHAR string_PL[] =   {        /* \P{L} */
537   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
538   CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
539 static const PCRE2_UCHAR string_PLl[] =  {        /* \P{Ll} */
540   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
541   CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
542 static const PCRE2_UCHAR string_PLu[] =  {        /* \P{Lu} */
543   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
544   CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
545 static const PCRE2_UCHAR string_PXan[] = {        /* \P{Xan} */
546   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
547   CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
548 static const PCRE2_UCHAR string_H[] =    {        /* \H */
549   CHAR_BACKSLASH, CHAR_H, '\0' };
550 static const PCRE2_UCHAR string_PXps[] = {        /* \P{Xps} */
551   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
552   CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
553 
554 static PCRE2_SPTR posix_substitutes[] = {
555   string_pL,            /* alpha */
556   string_pLl,           /* lower */
557   string_pLu,           /* upper */
558   string_pXan,          /* alnum */
559   NULL,                 /* ascii */
560   string_h,             /* blank */
561   string_pCc,           /* cntrl */
562   string_pNd,           /* digit */
563   NULL,                 /* graph */
564   NULL,                 /* print */
565   NULL,                 /* punct */
566   string_pXps,          /* space */   /* Xps is POSIX space, but from 8.34 */
567   string_pXwd,          /* word  */   /* Perl and POSIX space are the same */
568   NULL,                 /* xdigit */
569   /* Negated cases */
570   string_PL,            /* ^alpha */
571   string_PLl,           /* ^lower */
572   string_PLu,           /* ^upper */
573   string_PXan,          /* ^alnum */
574   NULL,                 /* ^ascii */
575   string_H,             /* ^blank */
576   string_PCc,           /* ^cntrl */
577   string_PNd,           /* ^digit */
578   NULL,                 /* ^graph */
579   NULL,                 /* ^print */
580   NULL,                 /* ^punct */
581   string_PXps,          /* ^space */  /* Xps is POSIX space, but from 8.34 */
582   string_PXwd,          /* ^word */   /* Perl and POSIX space are the same */
583   NULL                  /* ^xdigit */
584 };
585 #define POSIX_SUBSIZE (sizeof(posix_substitutes) / sizeof(PCRE2_UCHAR *))
586 #endif  /* SUPPORT_UNICODE */
587 
588 /* Masks for checking option settings. */
589 
590 #define PUBLIC_COMPILE_OPTIONS \
591   (PCRE2_ANCHORED|PCRE2_ALLOW_EMPTY_CLASS|PCRE2_ALT_BSUX|PCRE2_ALT_CIRCUMFLEX| \
592    PCRE2_ALT_VERBNAMES|PCRE2_AUTO_CALLOUT|PCRE2_CASELESS|PCRE2_DOLLAR_ENDONLY| \
593    PCRE2_DOTALL|PCRE2_DUPNAMES|PCRE2_EXTENDED|PCRE2_FIRSTLINE| \
594    PCRE2_MATCH_UNSET_BACKREF|PCRE2_MULTILINE|PCRE2_NEVER_BACKSLASH_C| \
595    PCRE2_NEVER_UCP|PCRE2_NEVER_UTF|PCRE2_NO_AUTO_CAPTURE| \
596    PCRE2_NO_AUTO_POSSESS|PCRE2_NO_DOTSTAR_ANCHOR|PCRE2_NO_START_OPTIMIZE| \
597    PCRE2_NO_UTF_CHECK|PCRE2_UCP|PCRE2_UNGREEDY|PCRE2_USE_OFFSET_LIMIT| \
598    PCRE2_UTF)
599 
600 /* Compile time error code numbers. They are given names so that they can more
601 easily be tracked. When a new number is added, the tables called eint1 and
602 eint2 in pcre2posix.c may need to be updated, and a new error text must be
603 added to compile_error_texts in pcre2_error.c. */
604 
605 enum { ERR0 = COMPILE_ERROR_BASE,
606        ERR1,  ERR2,  ERR3,  ERR4,  ERR5,  ERR6,  ERR7,  ERR8,  ERR9,  ERR10,
607        ERR11, ERR12, ERR13, ERR14, ERR15, ERR16, ERR17, ERR18, ERR19, ERR20,
608        ERR21, ERR22, ERR23, ERR24, ERR25, ERR26, ERR27, ERR28, ERR29, ERR30,
609        ERR31, ERR32, ERR33, ERR34, ERR35, ERR36, ERR37, ERR38, ERR39, ERR40,
610        ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49, ERR50,
611        ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59, ERR60,
612        ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68, ERR69, ERR70,
613        ERR71, ERR72, ERR73, ERR74, ERR75, ERR76, ERR77, ERR78, ERR79, ERR80,
614        ERR81, ERR82, ERR83, ERR84, ERR85, ERR86, ERR87, ERR88 };
615 
616 /* Error codes that correspond to negative error codes returned by
617 find_fixedlength(). */
618 
619 static int fixed_length_errors[] =
620   {
621   ERR0,    /* Not an error */
622   ERR0,    /* Not an error; -1 is used for "process later" */
623   ERR25,   /* Lookbehind is not fixed length */
624   ERR36,   /* \C in lookbehind is not allowed */
625   ERR87,   /* Lookbehind is too long */
626   ERR86,   /* Pattern too complicated */
627   ERR70    /* Internal error: unknown opcode encountered */
628   };
629 
630 /* This is a table of start-of-pattern options such as (*UTF) and settings such
631 as (*LIMIT_MATCH=nnnn) and (*CRLF). For completeness and backward
632 compatibility, (*UTFn) is supported in the relevant libraries, but (*UTF) is
633 generic and always supported. */
634 
635 enum { PSO_OPT,     /* Value is an option bit */
636        PSO_FLG,     /* Value is a flag bit */
637        PSO_NL,      /* Value is a newline type */
638        PSO_BSR,     /* Value is a \R type */
639        PSO_LIMM,    /* Read integer value for match limit */
640        PSO_LIMR };  /* Read integer value for recursion limit */
641 
/* One entry in the table of start-of-pattern options (pso_list). */
642 typedef struct pso {
643   const uint8_t *name;   /* Option name text, held as bytes */
644   uint16_t length;       /* Length of name */
645   uint16_t type;         /* One of the PSO_xxx values; says how value is used */
646   uint32_t value;        /* Option bit, flag bit, newline type or \R type,
                            according to type; 0 in the two limit entries */
647 } pso;
648 
649 /* NB: STRING_UTFn_RIGHTPAR contains the length as well */
650 
/* Each entry gives name, length, PSO_xxx type, and value, in that order. The
first entry has no explicit length because its name macro supplies it. The
lengths appear to include the terminating ")" or "=" of each item (TODO
confirm against the STRING_xxx definitions). The two LIMIT entries have a zero
value because, per their PSO_LIMx types, the number is read from the pattern. */
651 static pso pso_list[] = {
652   { (uint8_t *)STRING_UTFn_RIGHTPAR,                  PSO_OPT, PCRE2_UTF },
653   { (uint8_t *)STRING_UTF_RIGHTPAR,                4, PSO_OPT, PCRE2_UTF },
654   { (uint8_t *)STRING_UCP_RIGHTPAR,                4, PSO_OPT, PCRE2_UCP },
655   { (uint8_t *)STRING_NOTEMPTY_RIGHTPAR,           9, PSO_FLG, PCRE2_NOTEMPTY_SET },
656   { (uint8_t *)STRING_NOTEMPTY_ATSTART_RIGHTPAR,  17, PSO_FLG, PCRE2_NE_ATST_SET },
657   { (uint8_t *)STRING_NO_AUTO_POSSESS_RIGHTPAR,   16, PSO_OPT, PCRE2_NO_AUTO_POSSESS },
658   { (uint8_t *)STRING_NO_DOTSTAR_ANCHOR_RIGHTPAR, 18, PSO_OPT, PCRE2_NO_DOTSTAR_ANCHOR },
659   { (uint8_t *)STRING_NO_JIT_RIGHTPAR,             7, PSO_FLG, PCRE2_NOJIT },
660   { (uint8_t *)STRING_NO_START_OPT_RIGHTPAR,      13, PSO_OPT, PCRE2_NO_START_OPTIMIZE },
661   { (uint8_t *)STRING_LIMIT_MATCH_EQ,             12, PSO_LIMM, 0 },
662   { (uint8_t *)STRING_LIMIT_RECURSION_EQ,         16, PSO_LIMR, 0 },
663   { (uint8_t *)STRING_CR_RIGHTPAR,                 3, PSO_NL,  PCRE2_NEWLINE_CR },
664   { (uint8_t *)STRING_LF_RIGHTPAR,                 3, PSO_NL,  PCRE2_NEWLINE_LF },
665   { (uint8_t *)STRING_CRLF_RIGHTPAR,               5, PSO_NL,  PCRE2_NEWLINE_CRLF },
666   { (uint8_t *)STRING_ANY_RIGHTPAR,                4, PSO_NL,  PCRE2_NEWLINE_ANY },
667   { (uint8_t *)STRING_ANYCRLF_RIGHTPAR,            8, PSO_NL,  PCRE2_NEWLINE_ANYCRLF },
668   { (uint8_t *)STRING_BSR_ANYCRLF_RIGHTPAR,       12, PSO_BSR, PCRE2_BSR_ANYCRLF },
669   { (uint8_t *)STRING_BSR_UNICODE_RIGHTPAR,       12, PSO_BSR, PCRE2_BSR_UNICODE }
670 };
671 
672 /* This table is used when converting repeating opcodes into possessified
673 versions as a result of an explicit possessive quantifier such as ++. A zero
674 value means there is no possessified version - in those cases the item in
675 question must be wrapped in ONCE brackets. The table is truncated at OP_CALLOUT
676 because all relevant opcodes are less than that. */
677 
/* Entries are positional: the comment beside each row names the opcode(s) the
row stands for, so the table is presumably indexed by opcode value and must be
kept in step with the opcode numbering (TODO confirm against the OP_xxx enum).
Within each family only the greedy form has a non-zero entry; the minimizing
and already-possessive forms are zero. */
678 static const uint8_t opcode_possessify[] = {
679   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0 - 15  */
680   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 16 - 31 */
681 
682   0,                       /* NOTI */
683   OP_POSSTAR, 0,           /* STAR, MINSTAR */
684   OP_POSPLUS, 0,           /* PLUS, MINPLUS */
685   OP_POSQUERY, 0,          /* QUERY, MINQUERY */
686   OP_POSUPTO, 0,           /* UPTO, MINUPTO */
687   0,                       /* EXACT */
688   0, 0, 0, 0,              /* POS{STAR,PLUS,QUERY,UPTO} */
689 
690   OP_POSSTARI, 0,          /* STARI, MINSTARI */
691   OP_POSPLUSI, 0,          /* PLUSI, MINPLUSI */
692   OP_POSQUERYI, 0,         /* QUERYI, MINQUERYI */
693   OP_POSUPTOI, 0,          /* UPTOI, MINUPTOI */
694   0,                       /* EXACTI */
695   0, 0, 0, 0,              /* POS{STARI,PLUSI,QUERYI,UPTOI} */
696 
697   OP_NOTPOSSTAR, 0,        /* NOTSTAR, NOTMINSTAR */
698   OP_NOTPOSPLUS, 0,        /* NOTPLUS, NOTMINPLUS */
699   OP_NOTPOSQUERY, 0,       /* NOTQUERY, NOTMINQUERY */
700   OP_NOTPOSUPTO, 0,        /* NOTUPTO, NOTMINUPTO */
701   0,                       /* NOTEXACT */
702   0, 0, 0, 0,              /* NOTPOS{STAR,PLUS,QUERY,UPTO} */
703 
704   OP_NOTPOSSTARI, 0,       /* NOTSTARI, NOTMINSTARI */
705   OP_NOTPOSPLUSI, 0,       /* NOTPLUSI, NOTMINPLUSI */
706   OP_NOTPOSQUERYI, 0,      /* NOTQUERYI, NOTMINQUERYI */
707   OP_NOTPOSUPTOI, 0,       /* NOTUPTOI, NOTMINUPTOI */
708   0,                       /* NOTEXACTI */
709   0, 0, 0, 0,              /* NOTPOS{STARI,PLUSI,QUERYI,UPTOI} */
710 
711   OP_TYPEPOSSTAR, 0,       /* TYPESTAR, TYPEMINSTAR */
712   OP_TYPEPOSPLUS, 0,       /* TYPEPLUS, TYPEMINPLUS */
713   OP_TYPEPOSQUERY, 0,      /* TYPEQUERY, TYPEMINQUERY */
714   OP_TYPEPOSUPTO, 0,       /* TYPEUPTO, TYPEMINUPTO */
715   0,                       /* TYPEEXACT */
716   0, 0, 0, 0,              /* TYPEPOS{STAR,PLUS,QUERY,UPTO} */
717 
718   OP_CRPOSSTAR, 0,         /* CRSTAR, CRMINSTAR */
719   OP_CRPOSPLUS, 0,         /* CRPLUS, CRMINPLUS */
720   OP_CRPOSQUERY, 0,        /* CRQUERY, CRMINQUERY */
721   OP_CRPOSRANGE, 0,        /* CRRANGE, CRMINRANGE */
722   0, 0, 0, 0,              /* CRPOS{STAR,PLUS,QUERY,RANGE} */
723 
724   0, 0, 0,                 /* CLASS, NCLASS, XCLASS */
725   0, 0,                    /* REF, REFI */
726   0, 0,                    /* DNREF, DNREFI */
727   0, 0                     /* RECURSE, CALLOUT */
728 };
729 
730 
731 
732 /*************************************************
733 *               Copy compiled code               *
734 *************************************************/
735 
736 /* Compiled JIT code cannot be copied, so the new compiled block has no
737 associated JIT data. */
738 
739 PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
pcre2_code_copy(const pcre2_code * code)740 pcre2_code_copy(const pcre2_code *code)
741 {
742 PCRE2_SIZE* ref_count;
743 pcre2_code *newcode;
744 
745 if (code == NULL) return NULL;
746 newcode = code->memctl.malloc(code->blocksize, code->memctl.memory_data);
747 if (newcode == NULL) return NULL;
748 memcpy(newcode, code, code->blocksize);
749 newcode->executable_jit = NULL;
750 
751 /* If the code is one that has been deserialized, increment the reference count
752 in the decoded tables. */
753 
754 if ((code->flags & PCRE2_DEREF_TABLES) != 0)
755   {
756   ref_count = (PCRE2_SIZE *)(code->tables + tables_length);
757   (*ref_count)++;
758   }
759 
760 return newcode;
761 }
762 
763 
764 
765 /*************************************************
766 *               Free compiled code               *
767 *************************************************/
768 
769 PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION
pcre2_code_free(pcre2_code * code)770 pcre2_code_free(pcre2_code *code)
771 {
772 PCRE2_SIZE* ref_count;
773 
774 if (code != NULL)
775   {
776   if (code->executable_jit != NULL)
777     PRIV(jit_free)(code->executable_jit, &code->memctl);
778 
779   if ((code->flags & PCRE2_DEREF_TABLES) != 0)
780     {
781     /* Decoded tables belong to the codes after deserialization, and they must
782     be freed when there are no more reference to them. The *ref_count should
783     always be > 0. */
784 
785     ref_count = (PCRE2_SIZE *)(code->tables + tables_length);
786     if (*ref_count > 0)
787       {
788       (*ref_count)--;
789       if (*ref_count == 0)
790         code->memctl.free((void *)code->tables, code->memctl.memory_data);
791       }
792     }
793 
794   code->memctl.free(code, code->memctl.memory_data);
795   }
796 }
797 
798 
799 
800 /*************************************************
801 *        Insert an automatic callout point       *
802 *************************************************/
803 
804 /* This function is called when the PCRE2_AUTO_CALLOUT option is set, to insert
805 callout points before each pattern item.
806 
807 Arguments:
808   code           current code pointer
809   ptr            current pattern pointer
810   cb             general compile-time data
811 
812 Returns:         new code pointer
813 */
814 
815 static PCRE2_UCHAR *
auto_callout(PCRE2_UCHAR * code,PCRE2_SPTR ptr,compile_block * cb)816 auto_callout(PCRE2_UCHAR *code, PCRE2_SPTR ptr, compile_block *cb)
817 {
818 code[0] = OP_CALLOUT;
819 PUT(code, 1, ptr - cb->start_pattern);  /* Pattern offset */
820 PUT(code, 1 + LINK_SIZE, 0);            /* Default length */
821 code[1 + 2*LINK_SIZE] = 255;
822 return code + PRIV(OP_lengths)[OP_CALLOUT];
823 }
824 
825 
826 
827 /*************************************************
828 *         Complete a callout item                *
829 *************************************************/
830 
831 /* A callout item contains the length of the next item in the pattern, which
832 we can't fill in till after we have reached the relevant point. This is used
833 for both automatic and manual callouts.
834 
835 Arguments:
836   previous_callout   points to previous callout item
837   ptr                current pattern pointer
838   cb                 general compile-time data
839 
840 Returns:             nothing
841 */
842 
843 static void
complete_callout(PCRE2_UCHAR * previous_callout,PCRE2_SPTR ptr,compile_block * cb)844 complete_callout(PCRE2_UCHAR *previous_callout, PCRE2_SPTR ptr,
845   compile_block *cb)
846 {
847 size_t length = (size_t)(ptr - cb->start_pattern - GET(previous_callout, 1));
848 PUT(previous_callout, 1 + LINK_SIZE, length);
849 }
850 
851 
852 
853 /*************************************************
854 *        Find the fixed length of a branch       *
855 *************************************************/
856 
857 /* Scan a branch and compute the fixed length of subject that will match it, if
858 the length is fixed. This is needed for dealing with lookbehind assertions. In
859 UTF mode, the result is in code units rather than bytes. The branch is
860 temporarily terminated with OP_END when this function is called.
861 
862 This function is called when a lookbehind assertion is encountered, so that if
863 it fails, the error message can point to the correct place in the pattern.
864 However, we cannot do this when the assertion contains subroutine calls,
865 because they can be forward references. We solve this by remembering this case
866 and doing the check at the end; a flag specifies which mode we are running in.
867 
868 Lookbehind lengths are held in 16-bit fields and the maximum value is defined
869 as LOOKBEHIND_MAX.
870 
871 Arguments:
872   code        points to the start of the pattern (the bracket)
873   utf         TRUE in UTF mode
874   atend       TRUE if called when the pattern is complete
875   cb          the "compile data" structure
876   recurses    chain of recurse_check to catch mutual recursion
877   countptr    pointer to counter, to catch over-complexity
878 
879 Returns:   if non-negative, the fixed length,
880              or -1 if an OP_RECURSE item was encountered and atend is FALSE
881              or -2 if there is no fixed length,
882              or -3 if \C was encountered (in UTF mode only)
883              or -4 if length is too long
884              or -5 if regex is too complicated
885              or -6 if an unknown opcode was encountered (internal error)
886 */
887 
888 #define FFL_LATER           (-1)   /* OP_RECURSE met and atend is FALSE */
889 #define FFL_NOTFIXED        (-2)   /* No fixed length */
890 #define FFL_BACKSLASHC      (-3)   /* \C met (UTF mode only) */
891 #define FFL_TOOLONG         (-4)   /* Length exceeds LOOKBEHIND_MAX */
892 #define FFL_TOOCOMPLICATED  (-5)   /* Gave up: too many nested scans */
893 #define FFL_UNKNOWNOP       (-6)   /* Unrecognized opcode (internal error) */
894 
/* Work out the fixed match length of the group that starts at "code". Every
alternative is scanned, nested groups are handled by recursion, and subroutine
calls are followed when atend is TRUE. The result is the common length of all
the alternatives, or one of the negative FFL_xxx values. For a numbered group
the outcome is also recorded in cb->groupinfo[]. */
895 static int
find_fixedlength(PCRE2_UCHAR * code,BOOL utf,BOOL atend,compile_block * cb,recurse_check * recurses,int * countptr)896 find_fixedlength(PCRE2_UCHAR *code, BOOL utf, BOOL atend, compile_block *cb,
897   recurse_check *recurses, int *countptr)
898 {
899 uint32_t length = 0xffffffffu;   /* Unset */
900 uint32_t group = 0;
901 uint32_t groupinfo = 0;
902 recurse_check this_recurse;      /* Chain entry used when following OP_RECURSE */
903 register uint32_t branchlength = 0;
904 register PCRE2_UCHAR *cc = code + 1 + LINK_SIZE;
905 
906 /* If this is a capturing group, we may have the answer cached, but we can only
907 use this information if there are no (?| groups in the pattern, because
908 otherwise group numbers are not unique. */
909 
910 if (*code == OP_CBRA || *code == OP_CBRAPOS || *code == OP_SCBRA ||
911     *code == OP_SCBRAPOS)
912   {
913   group = GET2(cc, 0);
914   cc += IMM2_SIZE;
915   groupinfo = cb->groupinfo[group];
916   if ((cb->external_flags & PCRE2_DUPCAPUSED) == 0)
917     {
918     if ((groupinfo & GI_NOT_FIXED_LENGTH) != 0) return FFL_NOTFIXED;
919     if ((groupinfo & GI_SET_FIXED_LENGTH) != 0)
920       return groupinfo & GI_FIXED_LENGTH_MASK;
921     }
922   }
923 
924 /* A large and/or complex regex can take too long to process. This can happen
925 more often when (?| groups are present in the pattern. */
926 
927 if ((*countptr)++ > 2000) return FFL_TOOCOMPLICATED;
928 
929 /* Scan along the opcodes for this branch. If we get to the end of the
930 branch, check the length against that of the other branches. */
931 
932 for (;;)
933   {
934   int d;
935   PCRE2_UCHAR *ce, *cs;
936   register PCRE2_UCHAR op = *cc;
937 
      /* This test runs before every opcode, so the running total is checked
      each time it has grown. */
938   if (branchlength > LOOKBEHIND_MAX) return FFL_TOOLONG;
939 
940   switch (op)
941     {
942     /* We only need to continue for OP_CBRA (normal capturing bracket) and
943     OP_BRA (normal non-capturing bracket) because the other variants of these
944     opcodes are all concerned with unlimited repeated groups, which of course
945     are not of fixed length. */
946 
947     case OP_CBRA:
948     case OP_BRA:
949     case OP_ONCE:
950     case OP_ONCE_NC:
951     case OP_COND:
952     d = find_fixedlength(cc, utf, atend, cb, recurses, countptr);
953     if (d < 0) return d;
954     branchlength += (uint32_t)d;
955     do cc += GET(cc, 1); while (*cc == OP_ALT);   /* Past all alternatives */
956     cc += 1 + LINK_SIZE;                          /* Past the group's end item */
957     break;
958 
959     /* Reached end of a branch; if it's a ket it is the end of a nested call.
960     If it's ALT it is an alternation in a nested call. An ACCEPT is effectively
961     an ALT. If it is END it's the end of the outer call. All can be handled by
962     the same code. Note that we must not include the OP_KETRxxx opcodes here,
963     because they all imply an unlimited repeat. */
964 
965     case OP_ALT:
966     case OP_KET:
967     case OP_END:
968     case OP_ACCEPT:
969     case OP_ASSERT_ACCEPT:
        /* The first branch to finish sets the reference length; every later
        branch must match it exactly. */
970     if (length == 0xffffffffu) length = branchlength;
971       else if (length != branchlength) goto ISNOTFIXED;
972     if (*cc != OP_ALT)
973       {
974       if (group > 0)
975         {
976         groupinfo |= (uint32_t)(GI_SET_FIXED_LENGTH | length);
977         cb->groupinfo[group] = groupinfo;
978         }
979       return (int)length;
980       }
981     cc += 1 + LINK_SIZE;
982     branchlength = 0;
983     break;
984 
985     /* A true recursion implies not fixed length, but a subroutine call may
986     be OK. If the subroutine is a forward reference, we can't deal with
987     it until the end of the pattern, so return FFL_LATER. */
988 
989     case OP_RECURSE:
990     if (!atend) return FFL_LATER;
991     cs = ce = (PCRE2_UCHAR *)cb->start_code + GET(cc, 1); /* Start subpattern */
992     do ce += GET(ce, 1); while (*ce == OP_ALT);           /* End subpattern */
993     if (cc > cs && cc < ce) goto ISNOTFIXED;          /* Recursion */
994     else   /* Check for mutual recursion */
995       {
996       recurse_check *r = recurses;
997       for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
998       if (r != NULL) goto ISNOTFIXED;   /* Mutual recursion */
999       }
        /* Record the called group on the chain before descending into it, so
        that a nested call back to it is caught by the search above. */
1000     this_recurse.prev = recurses;
1001     this_recurse.group = cs;
1002     d = find_fixedlength(cs, utf, atend, cb, &this_recurse, countptr);
1003     if (d < 0) return d;
1004     branchlength += (uint32_t)d;
1005     cc += 1 + LINK_SIZE;
1006     break;
1007 
1008     /* Skip over assertive subpatterns. Note that we must increment cc by
1009     1 + LINK_SIZE at the end, not by OP_length[*cc] because in a recursive
1010     situation this assertion may be the one that is ultimately being checked
1011     for having a fixed length, in which case its terminating OP_KET will have
1012     been temporarily replaced by OP_END. */
1013 
1014     case OP_ASSERT:
1015     case OP_ASSERT_NOT:
1016     case OP_ASSERTBACK:
1017     case OP_ASSERTBACK_NOT:
1018     do cc += GET(cc, 1); while (*cc == OP_ALT);
1019     cc += 1 + LINK_SIZE;
1020     break;
1021 
1022     /* Skip over things that don't match chars */
1023 
1024     case OP_MARK:
1025     case OP_PRUNE_ARG:
1026     case OP_SKIP_ARG:
1027     case OP_THEN_ARG:
         /* Step over the fixed part from the length table plus the count held
         in cc[1] (presumably the length of the argument name). */
1028     cc += cc[1] + PRIV(OP_lengths)[*cc];
1029     break;
1030 
1031     case OP_CALLOUT:
1032     case OP_CIRC:
1033     case OP_CIRCM:
1034     case OP_CLOSE:
1035     case OP_COMMIT:
1036     case OP_CREF:
1037     case OP_FALSE:
1038     case OP_TRUE:
1039     case OP_DNCREF:
1040     case OP_DNRREF:
1041     case OP_DOLL:
1042     case OP_DOLLM:
1043     case OP_EOD:
1044     case OP_EODN:
1045     case OP_FAIL:
1046     case OP_NOT_WORD_BOUNDARY:
1047     case OP_PRUNE:
1048     case OP_REVERSE:
1049     case OP_RREF:
1050     case OP_SET_SOM:
1051     case OP_SKIP:
1052     case OP_SOD:
1053     case OP_SOM:
1054     case OP_THEN:
1055     case OP_WORD_BOUNDARY:
1056     cc += PRIV(OP_lengths)[*cc];
1057     break;
1058 
1059     case OP_CALLOUT_STR:
         /* Advance by the value stored in the item at offset 1 + 2*LINK_SIZE
         rather than by a length-table entry. */
1060     cc += GET(cc, 1 + 2*LINK_SIZE);
1061     break;
1062 
1063     /* Handle literal characters */
1064 
1065     case OP_CHAR:
1066     case OP_CHARI:
1067     case OP_NOT:
1068     case OP_NOTI:
1069     branchlength++;
1070     cc += 2;
1071 #ifdef SUPPORT_UNICODE
1072     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
1073 #endif
1074     break;
1075 
1076     /* Handle exact repetitions. The count is already in characters, but we
1077     need to skip over a multibyte character in UTF8 mode.  */
1078 
1079     case OP_EXACT:
1080     case OP_EXACTI:
1081     case OP_NOTEXACT:
1082     case OP_NOTEXACTI:
1083     branchlength += GET2(cc,1);
1084     cc += 2 + IMM2_SIZE;
1085 #ifdef SUPPORT_UNICODE
1086     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
1087 #endif
1088     break;
1089 
1090     case OP_TYPEEXACT:
1091     branchlength += GET2(cc,1);
         /* A repeated \p or \P type is followed by two extra data items, as in
         the OP_PROP case below. */
1092     if (cc[1 + IMM2_SIZE] == OP_PROP || cc[1 + IMM2_SIZE] == OP_NOTPROP)
1093       cc += 2;
1094     cc += 1 + IMM2_SIZE + 1;
1095     break;
1096 
1097     /* Handle single-char matchers */
1098 
1099     case OP_PROP:
1100     case OP_NOTPROP:
1101     cc += 2;
1102     /* Fall through */
1103 
1104     case OP_HSPACE:
1105     case OP_VSPACE:
1106     case OP_NOT_HSPACE:
1107     case OP_NOT_VSPACE:
1108     case OP_NOT_DIGIT:
1109     case OP_DIGIT:
1110     case OP_NOT_WHITESPACE:
1111     case OP_WHITESPACE:
1112     case OP_NOT_WORDCHAR:
1113     case OP_WORDCHAR:
1114     case OP_ANY:
1115     case OP_ALLANY:
1116     branchlength++;
1117     cc++;
1118     break;
1119 
1120     /* The single-byte matcher isn't allowed. This only happens in UTF-8 or
1121     UTF-16 mode; otherwise \C is coded as OP_ALLANY. */
1122 
1123     case OP_ANYBYTE:
1124     return FFL_BACKSLASHC;
1125 
1126     /* Check a class for variable quantification */
1127 
1128     case OP_CLASS:
1129     case OP_NCLASS:
1130 #ifdef SUPPORT_WIDE_CHARS
1131     case OP_XCLASS:
1132     /* The original code caused an unsigned overflow in 64 bit systems,
1133     so now we use a conditional statement. */
1134     if (op == OP_XCLASS)
1135       cc += GET(cc, 1);
1136     else
1137       cc += PRIV(OP_lengths)[OP_CLASS];
1138 #else
1139     cc += PRIV(OP_lengths)[OP_CLASS];
1140 #endif
1141 
         /* cc now points at whatever follows the class, which may be a repeat. */
1142     switch (*cc)
1143       {
1144       case OP_CRSTAR:
1145       case OP_CRMINSTAR:
1146       case OP_CRPLUS:
1147       case OP_CRMINPLUS:
1148       case OP_CRQUERY:
1149       case OP_CRMINQUERY:
1150       case OP_CRPOSSTAR:
1151       case OP_CRPOSPLUS:
1152       case OP_CRPOSQUERY:
1153       goto ISNOTFIXED;
1154 
1155       case OP_CRRANGE:
1156       case OP_CRMINRANGE:
1157       case OP_CRPOSRANGE:
           /* A range is fixed only when its two stored counts are equal. */
1158       if (GET2(cc,1) != GET2(cc,1+IMM2_SIZE)) goto ISNOTFIXED;
1159       branchlength += GET2(cc,1);
1160       cc += 1 + 2 * IMM2_SIZE;
1161       break;
1162 
1163       default:
           /* No class repeat follows, so count the class once. cc is left
           alone, and the next item is handled by the outer loop. */
1164       branchlength++;
1165       }
1166     break;
1167 
1168     /* Anything else is variable length */
1169 
1170     case OP_ANYNL:
1171     case OP_BRAMINZERO:
1172     case OP_BRAPOS:
1173     case OP_BRAPOSZERO:
1174     case OP_BRAZERO:
1175     case OP_CBRAPOS:
1176     case OP_EXTUNI:
1177     case OP_KETRMAX:
1178     case OP_KETRMIN:
1179     case OP_KETRPOS:
1180     case OP_MINPLUS:
1181     case OP_MINPLUSI:
1182     case OP_MINQUERY:
1183     case OP_MINQUERYI:
1184     case OP_MINSTAR:
1185     case OP_MINSTARI:
1186     case OP_MINUPTO:
1187     case OP_MINUPTOI:
1188     case OP_NOTMINPLUS:
1189     case OP_NOTMINPLUSI:
1190     case OP_NOTMINQUERY:
1191     case OP_NOTMINQUERYI:
1192     case OP_NOTMINSTAR:
1193     case OP_NOTMINSTARI:
1194     case OP_NOTMINUPTO:
1195     case OP_NOTMINUPTOI:
1196     case OP_NOTPLUS:
1197     case OP_NOTPLUSI:
1198     case OP_NOTPOSPLUS:
1199     case OP_NOTPOSPLUSI:
1200     case OP_NOTPOSQUERY:
1201     case OP_NOTPOSQUERYI:
1202     case OP_NOTPOSSTAR:
1203     case OP_NOTPOSSTARI:
1204     case OP_NOTPOSUPTO:
1205     case OP_NOTPOSUPTOI:
1206     case OP_NOTQUERY:
1207     case OP_NOTQUERYI:
1208     case OP_NOTSTAR:
1209     case OP_NOTSTARI:
1210     case OP_NOTUPTO:
1211     case OP_NOTUPTOI:
1212     case OP_PLUS:
1213     case OP_PLUSI:
1214     case OP_POSPLUS:
1215     case OP_POSPLUSI:
1216     case OP_POSQUERY:
1217     case OP_POSQUERYI:
1218     case OP_POSSTAR:
1219     case OP_POSSTARI:
1220     case OP_POSUPTO:
1221     case OP_POSUPTOI:
1222     case OP_QUERY:
1223     case OP_QUERYI:
1224     case OP_REF:
1225     case OP_REFI:
1226     case OP_DNREF:
1227     case OP_DNREFI:
1228     case OP_SBRA:
1229     case OP_SBRAPOS:
1230     case OP_SCBRA:
1231     case OP_SCBRAPOS:
1232     case OP_SCOND:
1233     case OP_SKIPZERO:
1234     case OP_STAR:
1235     case OP_STARI:
1236     case OP_TYPEMINPLUS:
1237     case OP_TYPEMINQUERY:
1238     case OP_TYPEMINSTAR:
1239     case OP_TYPEMINUPTO:
1240     case OP_TYPEPLUS:
1241     case OP_TYPEPOSPLUS:
1242     case OP_TYPEPOSQUERY:
1243     case OP_TYPEPOSSTAR:
1244     case OP_TYPEPOSUPTO:
1245     case OP_TYPEQUERY:
1246     case OP_TYPESTAR:
1247     case OP_TYPEUPTO:
1248     case OP_UPTO:
1249     case OP_UPTOI:
1250     goto ISNOTFIXED;
1251 
1252     /* Catch unrecognized opcodes so that when new ones are added they
1253     are not forgotten, as has happened in the past. */
1254 
1255     default:
1256     return FFL_UNKNOWNOP;
1257     }
1258   }
1259 /* Control never gets here except by goto. */
1260 
     /* Remember the negative result for a numbered group so that a later call
     can return early from the cache test at the top. */
1261 ISNOTFIXED:
1262 if (group > 0)
1263   {
1264   groupinfo |= GI_NOT_FIXED_LENGTH;
1265   cb->groupinfo[group] = groupinfo;
1266   }
1267 return FFL_NOTFIXED;
1268 }
1269 
1270 
1271 
1272 /*************************************************
1273 *      Find first significant op code            *
1274 *************************************************/
1275 
1276 /* This is called by several functions that scan a compiled expression looking
1277 for a fixed first character, or an anchoring op code etc. It skips over things
1278 that do not influence this. For some calls, it makes sense to skip negative
1279 forward and all backward assertions, and also the \b assertion; for others it
1280 does not.
1281 
1282 Arguments:
1283   code         pointer to the start of the group
1284   skipassert   TRUE if certain assertions are to be skipped
1285 
1286 Returns:       pointer to the first significant opcode
1287 */
1288 
1289 static const PCRE2_UCHAR*
first_significant_code(PCRE2_SPTR code,BOOL skipassert)1290 first_significant_code(PCRE2_SPTR code, BOOL skipassert)
1291 {
1292 for (;;)
1293   {
1294   switch ((int)*code)
1295     {
1296     case OP_ASSERT_NOT:
1297     case OP_ASSERTBACK:
1298     case OP_ASSERTBACK_NOT:
1299     if (!skipassert) return code;
1300     do code += GET(code, 1); while (*code == OP_ALT);
1301     code += PRIV(OP_lengths)[*code];
1302     break;
1303 
1304     case OP_WORD_BOUNDARY:
1305     case OP_NOT_WORD_BOUNDARY:
1306     if (!skipassert) return code;
1307     /* Fall through */
1308 
1309     case OP_CALLOUT:
1310     case OP_CREF:
1311     case OP_DNCREF:
1312     case OP_RREF:
1313     case OP_DNRREF:
1314     case OP_FALSE:
1315     case OP_TRUE:
1316     code += PRIV(OP_lengths)[*code];
1317     break;
1318 
1319     case OP_CALLOUT_STR:
1320     code += GET(code, 1 + 2*LINK_SIZE);
1321     break;
1322 
1323     default:
1324     return code;
1325     }
1326   }
1327 /* Control never reaches here */
1328 }
1329 
1330 
1331 
1332 /*************************************************
1333 *    Scan compiled branch for non-emptiness      *
1334 *************************************************/
1335 
1336 /* This function scans through a branch of a compiled pattern to see whether it
1337 can match the empty string. It is called at the end of compiling to check the
1338 entire pattern, and from compile_branch() when checking for an unlimited repeat
1339 of a group that can match nothing. In the latter case it is called only when
1340 doing the real compile, not during the pre-compile that measures the size of
1341 the compiled pattern.
1342 
1343 Note that first_significant_code() skips over backward and negative forward
1344 assertions when its final argument is TRUE. If we hit an unclosed bracket, we
1345 return "empty" - this means we've struck an inner bracket whose current branch
1346 will already have been scanned.
1347 
1348 Arguments:
1349   code        points to start of search
1350   endcode     points to where to stop
1351   utf         TRUE if in UTF mode
1352   cb          compile data
1353   atend       TRUE if being called to check an entire pattern
1354   recurses    chain of recurse_check to catch mutual recursion
1355   countptr    pointer to count to catch over-complicated pattern
1356 
1357 Returns:      0 if what is matched cannot be empty
1358               1 if what is matched could be empty
1359              -1 if the pattern is too complicated
1360 */
1361 
1362 #define CBE_NOTEMPTY          0    /* What is matched cannot be empty */
1363 #define CBE_EMPTY             1    /* What is matched could be empty */
1364 #define CBE_TOOCOMPLICATED  (-1)   /* The pattern is too complicated */
1365 
1366 
1367 static int
could_be_empty_branch(PCRE2_SPTR code,PCRE2_SPTR endcode,BOOL utf,compile_block * cb,BOOL atend,recurse_check * recurses,int * countptr)1368 could_be_empty_branch(PCRE2_SPTR code, PCRE2_SPTR endcode, BOOL utf,
1369   compile_block *cb, BOOL atend, recurse_check *recurses, int *countptr)
1370 {
1371 uint32_t group = 0;
1372 uint32_t groupinfo = 0;
1373 register PCRE2_UCHAR c;
1374 recurse_check this_recurse;
1375 
1376 /* If what we are checking has already been set as "could be empty", we know
1377 the answer. */
1378 
1379 if (*code >= OP_SBRA && *code <= OP_SCOND) return CBE_EMPTY;
1380 
1381 /* If this is a capturing group, we may have the answer cached, but we can only
1382 use this information if there are no (?| groups in the pattern, because
1383 otherwise group numbers are not unique. */
1384 
1385 if ((cb->external_flags & PCRE2_DUPCAPUSED) == 0 &&
1386     (*code == OP_CBRA || *code == OP_CBRAPOS))
1387   {
1388   group = GET2(code, 1 + LINK_SIZE);
1389   groupinfo = cb->groupinfo[group];
1390   if ((groupinfo & GI_SET_COULD_BE_EMPTY) != 0)
1391     return ((groupinfo & GI_COULD_BE_EMPTY) != 0)? CBE_EMPTY : CBE_NOTEMPTY;
1392   }
1393 
1394 /* A large and/or complex regex can take too long to process. We have to assume
1395 it can match an empty string. This can happen more often when (?| groups are
1396 present in the pattern and the caching is disabled. Setting the cap at 1100
1397 allows the test for more than 1023 capturing patterns to work. */
1398 
1399 if ((*countptr)++ > 1100) return CBE_TOOCOMPLICATED;
1400 
1401 /* Scan the opcodes for this branch. */
1402 
1403 for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
1404      code < endcode;
1405      code = first_significant_code(code + PRIV(OP_lengths)[c], TRUE))
1406   {
1407   PCRE2_SPTR ccode;
1408 
1409   c = *code;
1410 
1411   /* Skip over forward assertions; the other assertions are skipped by
1412   first_significant_code() with a TRUE final argument. */
1413 
1414   if (c == OP_ASSERT)
1415     {
1416     do code += GET(code, 1); while (*code == OP_ALT);
1417     c = *code;
1418     continue;
1419     }
1420 
1421   /* For a recursion/subroutine call we can scan the recursion when this
1422   function is called at the end, to check a complete pattern. Before then,
1423   recursions just have the group number as their argument and in any case may
1424   be forward references. In that situation, we return CBE_EMPTY, just in case.
1425   It means that unlimited repeats of groups that contain recursions are always
1426   treated as "could be empty" - which just adds a bit more processing time
1427   because of the runtime check. */
1428 
1429   if (c == OP_RECURSE)
1430     {
1431     PCRE2_SPTR scode, endgroup;
1432     BOOL empty_branch;
1433 
1434     if (!atend) goto ISTRUE;
1435     scode = cb->start_code + GET(code, 1);
1436     endgroup = scode;
1437 
1438     /* We need to detect whether this is a recursive call, as otherwise there
1439     will be an infinite loop. If it is a recursion, just skip over it. Simple
1440     recursions are easily detected. For mutual recursions we keep a chain on
1441     the stack. */
1442 
1443     do endgroup += GET(endgroup, 1); while (*endgroup == OP_ALT);
1444     if (code >= scode && code <= endgroup) continue;  /* Simple recursion */
1445     else
1446       {
1447       recurse_check *r = recurses;
1448       for (r = recurses; r != NULL; r = r->prev)
1449         if (r->group == scode) break;
1450       if (r != NULL) continue;   /* Mutual recursion */
1451       }
1452 
1453     /* Scan the referenced group, remembering it on the stack chain to detect
1454     mutual recursions. */
1455 
1456     empty_branch = FALSE;
1457     this_recurse.prev = recurses;
1458     this_recurse.group = scode;
1459 
1460     do
1461       {
1462       int rc = could_be_empty_branch(scode, endcode, utf, cb, atend,
1463         &this_recurse, countptr);
1464       if (rc < 0) return rc;
1465       if (rc > 0)
1466         {
1467         empty_branch = TRUE;
1468         break;
1469         }
1470       scode += GET(scode, 1);
1471       }
1472     while (*scode == OP_ALT);
1473 
1474     if (!empty_branch) goto ISFALSE;  /* All branches are non-empty */
1475     continue;
1476     }
1477 
1478   /* Groups with zero repeats can of course be empty; skip them. */
1479 
1480   if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO ||
1481       c == OP_BRAPOSZERO)
1482     {
1483     code += PRIV(OP_lengths)[c];
1484     do code += GET(code, 1); while (*code == OP_ALT);
1485     c = *code;
1486     continue;
1487     }
1488 
1489   /* A nested group that is already marked as "could be empty" can just be
1490   skipped. */
1491 
1492   if (c == OP_SBRA  || c == OP_SBRAPOS ||
1493       c == OP_SCBRA || c == OP_SCBRAPOS)
1494     {
1495     do code += GET(code, 1); while (*code == OP_ALT);
1496     c = *code;
1497     continue;
1498     }
1499 
1500   /* For other groups, scan the branches. */
1501 
1502   if (c == OP_BRA  || c == OP_BRAPOS ||
1503       c == OP_CBRA || c == OP_CBRAPOS ||
1504       c == OP_ONCE || c == OP_ONCE_NC ||
1505       c == OP_COND || c == OP_SCOND)
1506     {
1507     BOOL empty_branch;
1508     if (GET(code, 1) == 0) goto ISTRUE;    /* Hit unclosed bracket */
1509 
1510     /* If a conditional group has only one branch, there is a second, implied,
1511     empty branch, so just skip over the conditional, because it could be empty.
1512     Otherwise, scan the individual branches of the group. */
1513 
1514     if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
1515       code += GET(code, 1);
1516     else
1517       {
1518       empty_branch = FALSE;
1519       do
1520         {
1521         if (!empty_branch)
1522           {
1523           int rc = could_be_empty_branch(code, endcode, utf, cb, atend,
1524             recurses, countptr);
1525           if (rc < 0) return rc;
1526           if (rc > 0) empty_branch = TRUE;
1527           }
1528         code += GET(code, 1);
1529         }
1530       while (*code == OP_ALT);
1531       if (!empty_branch) goto ISFALSE;   /* All branches are non-empty */
1532       }
1533 
1534     c = *code;
1535     continue;
1536     }
1537 
1538   /* Handle the other opcodes */
1539 
1540   switch (c)
1541     {
1542     /* Check for quantifiers after a class. XCLASS is used for classes that
1543     cannot be represented just by a bit map. This includes negated single
1544     high-valued characters. The length in PRIV(OP_lengths)[] is zero; the
1545     actual length is stored in the compiled code, so we must update "code"
1546     here. */
1547 
1548 #if defined SUPPORT_UNICODE || PCRE2_CODE_UNIT_WIDTH != 8
1549     case OP_XCLASS:
1550     ccode = code += GET(code, 1);
1551     goto CHECK_CLASS_REPEAT;
1552 #endif
1553 
1554     case OP_CLASS:
1555     case OP_NCLASS:
1556     ccode = code + PRIV(OP_lengths)[OP_CLASS];
1557 
1558 #if defined SUPPORT_UNICODE || PCRE2_CODE_UNIT_WIDTH != 8
1559     CHECK_CLASS_REPEAT:
1560 #endif
1561 
1562     switch (*ccode)
1563       {
1564       case OP_CRSTAR:            /* These could be empty; continue */
1565       case OP_CRMINSTAR:
1566       case OP_CRQUERY:
1567       case OP_CRMINQUERY:
1568       case OP_CRPOSSTAR:
1569       case OP_CRPOSQUERY:
1570       break;
1571 
1572       default:                   /* Non-repeat => class must match */
1573       case OP_CRPLUS:            /* These repeats aren't empty */
1574       case OP_CRMINPLUS:
1575       case OP_CRPOSPLUS:
1576       goto ISFALSE;
1577 
1578       case OP_CRRANGE:
1579       case OP_CRMINRANGE:
1580       case OP_CRPOSRANGE:
1581       if (GET2(ccode, 1) > 0) goto ISFALSE;  /* Minimum > 0 */
1582       break;
1583       }
1584     break;
1585 
1586     /* Opcodes that must match a character */
1587 
1588     case OP_ANY:
1589     case OP_ALLANY:
1590     case OP_ANYBYTE:
1591 
1592     case OP_PROP:
1593     case OP_NOTPROP:
1594     case OP_ANYNL:
1595 
1596     case OP_NOT_HSPACE:
1597     case OP_HSPACE:
1598     case OP_NOT_VSPACE:
1599     case OP_VSPACE:
1600     case OP_EXTUNI:
1601 
1602     case OP_NOT_DIGIT:
1603     case OP_DIGIT:
1604     case OP_NOT_WHITESPACE:
1605     case OP_WHITESPACE:
1606     case OP_NOT_WORDCHAR:
1607     case OP_WORDCHAR:
1608 
1609     case OP_CHAR:
1610     case OP_CHARI:
1611     case OP_NOT:
1612     case OP_NOTI:
1613 
1614     case OP_PLUS:
1615     case OP_PLUSI:
1616     case OP_MINPLUS:
1617     case OP_MINPLUSI:
1618 
1619     case OP_NOTPLUS:
1620     case OP_NOTPLUSI:
1621     case OP_NOTMINPLUS:
1622     case OP_NOTMINPLUSI:
1623 
1624     case OP_POSPLUS:
1625     case OP_POSPLUSI:
1626     case OP_NOTPOSPLUS:
1627     case OP_NOTPOSPLUSI:
1628 
1629     case OP_EXACT:
1630     case OP_EXACTI:
1631     case OP_NOTEXACT:
1632     case OP_NOTEXACTI:
1633 
1634     case OP_TYPEPLUS:
1635     case OP_TYPEMINPLUS:
1636     case OP_TYPEPOSPLUS:
1637     case OP_TYPEEXACT:
1638     goto ISFALSE;
1639 
1640     /* These are going to continue, as they may be empty, but we have to
1641     fudge the length for the \p and \P cases. */
1642 
1643     case OP_TYPESTAR:
1644     case OP_TYPEMINSTAR:
1645     case OP_TYPEPOSSTAR:
1646     case OP_TYPEQUERY:
1647     case OP_TYPEMINQUERY:
1648     case OP_TYPEPOSQUERY:
1649     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1650     break;
1651 
1652     /* Same for these */
1653 
1654     case OP_TYPEUPTO:
1655     case OP_TYPEMINUPTO:
1656     case OP_TYPEPOSUPTO:
1657     if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
1658       code += 2;
1659     break;
1660 
1661     /* End of branch */
1662 
1663     case OP_KET:
1664     case OP_KETRMAX:
1665     case OP_KETRMIN:
1666     case OP_KETRPOS:
1667     case OP_ALT:
1668     goto ISTRUE;
1669 
1670     /* In UTF-8 or UTF-16 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY,
1671     POSQUERY, UPTO, MINUPTO, and POSUPTO and their caseless and negative
1672     versions may be followed by a multibyte character. */
1673 
1674 #ifdef MAYBE_UTF_MULTI
1675     case OP_STAR:
1676     case OP_STARI:
1677     case OP_NOTSTAR:
1678     case OP_NOTSTARI:
1679 
1680     case OP_MINSTAR:
1681     case OP_MINSTARI:
1682     case OP_NOTMINSTAR:
1683     case OP_NOTMINSTARI:
1684 
1685     case OP_POSSTAR:
1686     case OP_POSSTARI:
1687     case OP_NOTPOSSTAR:
1688     case OP_NOTPOSSTARI:
1689 
1690     case OP_QUERY:
1691     case OP_QUERYI:
1692     case OP_NOTQUERY:
1693     case OP_NOTQUERYI:
1694 
1695     case OP_MINQUERY:
1696     case OP_MINQUERYI:
1697     case OP_NOTMINQUERY:
1698     case OP_NOTMINQUERYI:
1699 
1700     case OP_POSQUERY:
1701     case OP_POSQUERYI:
1702     case OP_NOTPOSQUERY:
1703     case OP_NOTPOSQUERYI:
1704     if (utf && HAS_EXTRALEN(code[1])) code += GET_EXTRALEN(code[1]);
1705     break;
1706 
1707     case OP_UPTO:
1708     case OP_UPTOI:
1709     case OP_NOTUPTO:
1710     case OP_NOTUPTOI:
1711 
1712     case OP_MINUPTO:
1713     case OP_MINUPTOI:
1714     case OP_NOTMINUPTO:
1715     case OP_NOTMINUPTOI:
1716 
1717     case OP_POSUPTO:
1718     case OP_POSUPTOI:
1719     case OP_NOTPOSUPTO:
1720     case OP_NOTPOSUPTOI:
1721     if (utf && HAS_EXTRALEN(code[1 + IMM2_SIZE])) code += GET_EXTRALEN(code[1 + IMM2_SIZE]);
1722     break;
1723 #endif  /* MAYBE_UTF_MULTI */
1724 
1725     /* MARK, and PRUNE/SKIP/THEN with an argument must skip over the argument
1726     string. */
1727 
1728     case OP_MARK:
1729     case OP_PRUNE_ARG:
1730     case OP_SKIP_ARG:
1731     case OP_THEN_ARG:
1732     code += code[1];
1733     break;
1734 
1735     /* None of the remaining opcodes are required to match a character. */
1736 
1737     default:
1738     break;
1739     }
1740   }
1741 
1742 ISTRUE:
1743 groupinfo |= GI_COULD_BE_EMPTY;
1744 
1745 ISFALSE:
1746 if (group > 0) cb->groupinfo[group] = groupinfo | GI_SET_COULD_BE_EMPTY;
1747 
1748 return ((groupinfo & GI_COULD_BE_EMPTY) != 0)? CBE_EMPTY : CBE_NOTEMPTY;
1749 }
1750 
1751 
1752 
1753 /*************************************************
1754 *            Check for counted repeat            *
1755 *************************************************/
1756 
1757 /* This function is called when a '{' is encountered in a place where it might
1758 start a quantifier. It looks ahead to see if it really is a quantifier, that
1759 is, one of the forms {ddd} {ddd,} or {ddd,ddd} where the ddds are digits.
1760 
1761 Argument:   pointer to the first char after '{'
1762 Returns:    TRUE or FALSE
1763 */
1764 
1765 static BOOL
is_counted_repeat(PCRE2_SPTR p)1766 is_counted_repeat(PCRE2_SPTR p)
1767 {
1768 if (!IS_DIGIT(*p)) return FALSE;
1769 p++;
1770 while (IS_DIGIT(*p)) p++;
1771 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
1772 
1773 if (*p++ != CHAR_COMMA) return FALSE;
1774 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
1775 
1776 if (!IS_DIGIT(*p)) return FALSE;
1777 p++;
1778 while (IS_DIGIT(*p)) p++;
1779 
1780 return (*p == CHAR_RIGHT_CURLY_BRACKET);
1781 }
1782 
1783 
1784 
1785 /*************************************************
1786 *            Handle escapes                      *
1787 *************************************************/
1788 
1789 /* This function is called when a \ has been encountered. It either returns a
1790 positive value for a simple escape such as \d, or 0 for a data character, which
1791 is placed in chptr. A backreference to group n is returned as negative n. On
1792 entry, ptr is pointing at the \. On exit, it points the final code unit of the
1793 escape sequence.
1794 
1795 This function is also called from pcre2_substitute() to handle escape sequences
1796 in replacement strings. In this case, the cb argument is NULL, and only
1797 sequences that define a data character are recognised. The isclass argument is
1798 not relevant, but the options argument is the final value of the compiled
1799 pattern's options.
1800 
1801 There is one "trick" case: when a sequence such as [[:>:]] or \s in UCP mode is
1802 processed, it is replaced by a nested alternative sequence. If this contains a
1803 backslash (which is usually does), ptrend does not point to its end - it still
1804 points to the end of the whole pattern. However, we can detect this case
1805 because cb->nestptr[0] will be non-NULL. The nested sequences are all zero-
1806 terminated and there are only ever two levels of nesting.
1807 
1808 Arguments:
1809   ptrptr         points to the input position pointer
1810   ptrend         points to the end of the input
1811   chptr          points to a returned data character
1812   errorcodeptr   points to the errorcode variable (containing zero)
1813   options        the current options bits
1814   isclass        TRUE if inside a character class
1815   cb             compile data block
1816 
1817 Returns:         zero => a data character
1818                  positive => a special escape sequence
1819                  negative => a back reference
1820                  on error, errorcodeptr is set non-zero
1821 */
1822 
1823 int
PRIV(check_escape)1824 PRIV(check_escape)(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *chptr,
1825   int *errorcodeptr, uint32_t options, BOOL isclass, compile_block *cb)
1826 {
1827 BOOL utf = (options & PCRE2_UTF) != 0;
1828 PCRE2_SPTR ptr = *ptrptr + 1;
1829 register uint32_t c, cc;
1830 int escape = 0;
1831 int i;
1832 
1833 /* Find the end of a nested insert. */
1834 
1835 if (cb != NULL && cb->nestptr[0] != NULL)
1836   ptrend = ptr + PRIV(strlen)(ptr);
1837 
1838 /* If backslash is at the end of the string, it's an error. */
1839 
1840 if (ptr >= ptrend)
1841   {
1842   *errorcodeptr = ERR1;
1843   return 0;
1844   }
1845 
1846 GETCHARINCTEST(c, ptr);         /* Get character value, increment pointer */
1847 ptr--;                          /* Set pointer back to the last code unit */
1848 
1849 /* Non-alphanumerics are literals, so we just leave the value in c. An initial
1850 value test saves a memory lookup for code points outside the alphanumeric
1851 range. Otherwise, do a table lookup. A non-zero result is something that can be
1852 returned immediately. Otherwise further processing is required. */
1853 
1854 if (c < ESCAPES_FIRST || c > ESCAPES_LAST) {}  /* Definitely literal */
1855 
1856 else if ((i = escapes[c - ESCAPES_FIRST]) != 0)
1857   {
1858   if (i > 0) c = (uint32_t)i; else  /* Positive is a data character */
1859     {
1860     escape = -i;                    /* Else return a special escape */
1861     if (escape == ESC_P || escape == ESC_p || escape == ESC_X)
1862       cb->external_flags |= PCRE2_HASBKPORX;   /* Note \P, \p, or \X */
1863     }
1864   }
1865 
1866 /* Escapes that need further processing, including those that are unknown.
1867 When called from pcre2_substitute(), only \c, \o, and \x are recognized (and \u
1868 when BSUX is set). */
1869 
1870 else
1871   {
1872   PCRE2_SPTR oldptr;
1873   BOOL braced, negated, overflow;
1874   unsigned int s;
1875 
1876   /* Filter calls from pcre2_substitute(). */
1877 
1878   if (cb == NULL && c != CHAR_c && c != CHAR_o && c != CHAR_x &&
1879       (c != CHAR_u || (options & PCRE2_ALT_BSUX) != 0))
1880     {
1881     *errorcodeptr = ERR3;
1882     return 0;
1883     }
1884 
1885   switch (c)
1886     {
1887     /* A number of Perl escapes are not handled by PCRE. We give an explicit
1888     error. */
1889 
1890     case CHAR_l:
1891     case CHAR_L:
1892     *errorcodeptr = ERR37;
1893     break;
1894 
1895     /* \u is unrecognized when PCRE2_ALT_BSUX is not set. When it is treated
1896     specially, \u must be followed by four hex digits. Otherwise it is a
1897     lowercase u letter. */
1898 
1899     case CHAR_u:
1900     if ((options & PCRE2_ALT_BSUX) == 0) *errorcodeptr = ERR37; else
1901       {
1902       uint32_t xc;
1903       if ((cc = XDIGIT(ptr[1])) == 0xff) break;  /* Not a hex digit */
1904       if ((xc = XDIGIT(ptr[2])) == 0xff) break;  /* Not a hex digit */
1905       cc = (cc << 4) | xc;
1906       if ((xc = XDIGIT(ptr[3])) == 0xff) break;  /* Not a hex digit */
1907       cc = (cc << 4) | xc;
1908       if ((xc = XDIGIT(ptr[4])) == 0xff) break;  /* Not a hex digit */
1909       c = (cc << 4) | xc;
1910       ptr += 4;
1911       if (utf)
1912         {
1913         if (c > 0x10ffffU) *errorcodeptr = ERR77;
1914           else if (c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1915         }
1916       else if (c > MAX_NON_UTF_CHAR) *errorcodeptr = ERR77;
1917       }
1918     break;
1919 
1920     case CHAR_U:
1921     /* \U is unrecognized unless PCRE2_ALT_BSUX is set, in which case it is an
1922     upper case letter. */
1923     if ((options & PCRE2_ALT_BSUX) == 0) *errorcodeptr = ERR37;
1924     break;
1925 
1926     /* In a character class, \g is just a literal "g". Outside a character
1927     class, \g must be followed by one of a number of specific things:
1928 
1929     (1) A number, either plain or braced. If positive, it is an absolute
1930     backreference. If negative, it is a relative backreference. This is a Perl
1931     5.10 feature.
1932 
1933     (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
1934     is part of Perl's movement towards a unified syntax for back references. As
1935     this is synonymous with \k{name}, we fudge it up by pretending it really
1936     was \k.
1937 
1938     (3) For Oniguruma compatibility we also support \g followed by a name or a
1939     number either in angle brackets or in single quotes. However, these are
1940     (possibly recursive) subroutine calls, _not_ backreferences. Just return
1941     the ESC_g code (cf \k). */
1942 
1943     case CHAR_g:
1944     if (isclass) break;
1945     if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
1946       {
1947       escape = ESC_g;
1948       break;
1949       }
1950 
1951     /* Handle the Perl-compatible cases */
1952 
1953     if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
1954       {
1955       PCRE2_SPTR p;
1956       for (p = ptr+2; *p != CHAR_NULL && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
1957         if (*p != CHAR_MINUS && !IS_DIGIT(*p)) break;
1958       if (*p != CHAR_NULL && *p != CHAR_RIGHT_CURLY_BRACKET)
1959         {
1960         escape = ESC_k;
1961         break;
1962         }
1963       braced = TRUE;
1964       ptr++;
1965       }
1966     else braced = FALSE;
1967 
1968     if (ptr[1] == CHAR_MINUS)
1969       {
1970       negated = TRUE;
1971       ptr++;
1972       }
1973     else negated = FALSE;
1974 
1975     /* The integer range is limited by the machine's int representation. */
1976     s = 0;
1977     overflow = FALSE;
1978     while (IS_DIGIT(ptr[1]))
1979       {
1980       if (s > INT_MAX / 10 - 1) /* Integer overflow */
1981         {
1982         overflow = TRUE;
1983         break;
1984         }
1985       s = s * 10 + (unsigned int)(*(++ptr) - CHAR_0);
1986       }
1987     if (overflow) /* Integer overflow */
1988       {
1989       while (IS_DIGIT(ptr[1])) ptr++;
1990       *errorcodeptr = ERR61;
1991       break;
1992       }
1993 
1994     if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
1995       {
1996       *errorcodeptr = ERR57;
1997       break;
1998       }
1999 
2000     if (s == 0)
2001       {
2002       *errorcodeptr = ERR58;
2003       break;
2004       }
2005 
2006     if (negated)
2007       {
2008       if (s > cb->bracount)
2009         {
2010         *errorcodeptr = ERR15;
2011         break;
2012         }
2013       s = cb->bracount - (s - 1);
2014       }
2015 
2016     escape = -(int)s;
2017     break;
2018 
2019     /* The handling of escape sequences consisting of a string of digits
2020     starting with one that is not zero is not straightforward. Perl has changed
2021     over the years. Nowadays \g{} for backreferences and \o{} for octal are
2022     recommended to avoid the ambiguities in the old syntax.
2023 
2024     Outside a character class, the digits are read as a decimal number. If the
2025     number is less than 10, or if there are that many previous extracting left
2026     brackets, it is a back reference. Otherwise, up to three octal digits are
2027     read to form an escaped character code. Thus \123 is likely to be octal 123
2028     (cf \0123, which is octal 012 followed by the literal 3).
2029 
2030     Inside a character class, \ followed by a digit is always either a literal
2031     8 or 9 or an octal number. */
2032 
2033     case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
2034     case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
2035 
2036     if (!isclass)
2037       {
2038       oldptr = ptr;
2039       /* The integer range is limited by the machine's int representation. */
2040       s = c - CHAR_0;
2041       overflow = FALSE;
2042       while (IS_DIGIT(ptr[1]))
2043         {
2044         if (s > INT_MAX / 10 - 1) /* Integer overflow */
2045           {
2046           overflow = TRUE;
2047           break;
2048           }
2049         s = s * 10 + (unsigned int)(*(++ptr) - CHAR_0);
2050         }
2051       if (overflow) /* Integer overflow */
2052         {
2053         while (IS_DIGIT(ptr[1])) ptr++;
2054         *errorcodeptr = ERR61;
2055         break;
2056         }
2057 
2058       /* \1 to \9 are always back references. \8x and \9x are too; \1x to \7x
2059       are octal escapes if there are not that many previous captures. */
2060 
2061       if (s < 10 || *oldptr >= CHAR_8 || s <= cb->bracount)
2062         {
2063         escape = -(int)s;     /* Indicates a back reference */
2064         break;
2065         }
2066       ptr = oldptr;      /* Put the pointer back and fall through */
2067       }
2068 
2069     /* Handle a digit following \ when the number is not a back reference, or
2070     we are within a character class. If the first digit is 8 or 9, Perl used to
2071     generate a binary zero byte and then treat the digit as a following
2072     literal. At least by Perl 5.18 this changed so as not to insert the binary
2073     zero. */
2074 
2075     if ((c = *ptr) >= CHAR_8) break;
2076 
2077     /* Fall through with a digit less than 8 */
2078 
2079     /* \0 always starts an octal number, but we may drop through to here with a
2080     larger first octal digit. The original code used just to take the least
2081     significant 8 bits of octal numbers (I think this is what early Perls used
2082     to do). Nowadays we allow for larger numbers in UTF-8 mode and 16-bit mode,
2083     but no more than 3 octal digits. */
2084 
2085     case CHAR_0:
2086     c -= CHAR_0;
2087     while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
2088         c = c * 8 + *(++ptr) - CHAR_0;
2089 #if PCRE2_CODE_UNIT_WIDTH == 8
2090     if (!utf && c > 0xff) *errorcodeptr = ERR51;
2091 #endif
2092     break;
2093 
2094     /* \o is a relatively new Perl feature, supporting a more general way of
2095     specifying character codes in octal. The only supported form is \o{ddd}. */
2096 
2097     case CHAR_o:
2098     if (ptr[1] != CHAR_LEFT_CURLY_BRACKET) *errorcodeptr = ERR55; else
2099     if (ptr[2] == CHAR_RIGHT_CURLY_BRACKET) *errorcodeptr = ERR78; else
2100       {
2101       ptr += 2;
2102       c = 0;
2103       overflow = FALSE;
2104       while (*ptr >= CHAR_0 && *ptr <= CHAR_7)
2105         {
2106         cc = *ptr++;
2107         if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
2108 #if PCRE2_CODE_UNIT_WIDTH == 32
2109         if (c >= 0x20000000l) { overflow = TRUE; break; }
2110 #endif
2111         c = (c << 3) + (cc - CHAR_0);
2112 #if PCRE2_CODE_UNIT_WIDTH == 8
2113         if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
2114 #elif PCRE2_CODE_UNIT_WIDTH == 16
2115         if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
2116 #elif PCRE2_CODE_UNIT_WIDTH == 32
2117         if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
2118 #endif
2119         }
2120       if (overflow)
2121         {
2122         while (*ptr >= CHAR_0 && *ptr <= CHAR_7) ptr++;
2123         *errorcodeptr = ERR34;
2124         }
2125       else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
2126         {
2127         if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
2128         }
2129       else *errorcodeptr = ERR64;
2130       }
2131     break;
2132 
2133     /* \x is complicated. When PCRE2_ALT_BSUX is set, \x must be followed by
2134     two hexadecimal digits. Otherwise it is a lowercase x letter. */
2135 
2136     case CHAR_x:
2137     if ((options & PCRE2_ALT_BSUX) != 0)
2138       {
2139       uint32_t xc;
2140       if ((cc = XDIGIT(ptr[1])) == 0xff) break;  /* Not a hex digit */
2141       if ((xc = XDIGIT(ptr[2])) == 0xff) break;  /* Not a hex digit */
2142       c = (cc << 4) | xc;
2143       ptr += 2;
2144       }    /* End PCRE2_ALT_BSUX handling */
2145 
2146     /* Handle \x in Perl's style. \x{ddd} is a character number which can be
2147     greater than 0xff in UTF-8 or non-8bit mode, but only if the ddd are hex
2148     digits. If not, { used to be treated as a data character. However, Perl
2149     seems to read hex digits up to the first non-such, and ignore the rest, so
2150     that, for example \x{zz} matches a binary zero. This seems crazy, so PCRE
2151     now gives an error. */
2152 
2153     else
2154       {
2155       if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
2156         {
2157         ptr += 2;
2158         if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
2159           {
2160           *errorcodeptr = ERR78;
2161           break;
2162           }
2163         c = 0;
2164         overflow = FALSE;
2165 
2166         while ((cc = XDIGIT(*ptr)) != 0xff)
2167           {
2168           ptr++;
2169           if (c == 0 && cc == 0) continue;   /* Leading zeroes */
2170 #if PCRE2_CODE_UNIT_WIDTH == 32
2171           if (c >= 0x10000000l) { overflow = TRUE; break; }
2172 #endif
2173           c = (c << 4) | cc;
2174           if ((utf && c > 0x10ffffU) || (!utf && c > MAX_NON_UTF_CHAR))
2175             {
2176             overflow = TRUE;
2177             break;
2178             }
2179           }
2180 
2181         if (overflow)
2182           {
2183           while (XDIGIT(*ptr) != 0xff) ptr++;
2184           *errorcodeptr = ERR34;
2185           }
2186         else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
2187           {
2188           if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
2189           }
2190 
2191         /* If the sequence of hex digits does not end with '}', give an error.
2192         We used just to recognize this construct and fall through to the normal
2193         \x handling, but nowadays Perl gives an error, which seems much more
2194         sensible, so we do too. */
2195 
2196         else *errorcodeptr = ERR67;
2197         }   /* End of \x{} processing */
2198 
2199       /* Read a single-byte hex-defined char (up to two hex digits after \x) */
2200 
2201       else
2202         {
2203         c = 0;
2204         if ((cc = XDIGIT(ptr[1])) == 0xff) break;  /* Not a hex digit */
2205         ptr++;
2206         c = cc;
2207         if ((cc = XDIGIT(ptr[1])) == 0xff) break;  /* Not a hex digit */
2208         ptr++;
2209         c = (c << 4) | cc;
2210         }     /* End of \xdd handling */
2211       }       /* End of Perl-style \x handling */
2212     break;
2213 
2214     /* The handling of \c is different in ASCII and EBCDIC environments. In an
2215     ASCII (or Unicode) environment, an error is given if the character
2216     following \c is not a printable ASCII character. Otherwise, the following
2217     character is upper-cased if it is a letter, and after that the 0x40 bit is
2218     flipped. The result is the value of the escape.
2219 
2220     In an EBCDIC environment the handling of \c is compatible with the
2221     specification in the perlebcdic document. The following character must be
2222     a letter or one of small number of special characters. These provide a
2223     means of defining the character values 0-31.
2224 
2225     For testing the EBCDIC handling of \c in an ASCII environment, recognize
2226     the EBCDIC value of 'c' explicitly. */
2227 
2228 #if defined EBCDIC && 'a' != 0x81
2229     case 0x83:
2230 #else
2231     case CHAR_c:
2232 #endif
2233 
2234     c = *(++ptr);
2235     if (c >= CHAR_a && c <= CHAR_z) c = UPPER_CASE(c);
2236     if (c == CHAR_NULL && ptr >= ptrend)
2237       {
2238       *errorcodeptr = ERR2;
2239       break;
2240       }
2241 
2242     /* Handle \c in an ASCII/Unicode environment. */
2243 
2244 #ifndef EBCDIC    /* ASCII/UTF-8 coding */
2245     if (c < 32 || c > 126)  /* Excludes all non-printable ASCII */
2246       {
2247       *errorcodeptr = ERR68;
2248       break;
2249       }
2250     c ^= 0x40;
2251 
2252     /* Handle \c in an EBCDIC environment. The special case \c? is converted to
2253     255 (0xff) or 95 (0x5f) if other character suggest we are using th POSIX-BC
2254     encoding. (This is the way Perl indicates that it handles \c?.) The other
2255     valid sequences correspond to a list of specific characters. */
2256 
2257 #else
2258     if (c == CHAR_QUESTION_MARK)
2259       c = ('\\' == 188 && '`' == 74)? 0x5f : 0xff;
2260     else
2261       {
2262       for (i = 0; i < 32; i++)
2263         {
2264         if (c == ebcdic_escape_c[i]) break;
2265         }
2266       if (i < 32) c = i; else *errorcodeptr = ERR68;
2267       }
2268 #endif  /* EBCDIC */
2269 
2270     break;
2271 
2272     /* Any other alphanumeric following \ is an error. Perl gives an error only
2273     if in warning mode, but PCRE doesn't have a warning mode. */
2274 
2275     default:
2276     *errorcodeptr = ERR3;
2277     break;
2278     }
2279   }
2280 
2281 /* Perl supports \N{name} for character names, as well as plain \N for "not
2282 newline". PCRE does not support \N{name}. However, it does support
2283 quantification such as \N{2,3}. */
2284 
2285 if (escape == ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET &&
2286      !is_counted_repeat(ptr+2))
2287   *errorcodeptr = ERR37;
2288 
2289 /* If PCRE2_UCP is set, we change the values for \d etc. */
2290 
2291 if ((options & PCRE2_UCP) != 0 && escape >= ESC_D && escape <= ESC_w)
2292   escape += (ESC_DU - ESC_D);
2293 
2294 /* Set the pointer to the final character before returning. */
2295 
2296 *ptrptr = ptr;
2297 *chptr = c;
2298 return escape;
2299 }
2300 
2301 
2302 
2303 #ifdef SUPPORT_UNICODE
2304 /*************************************************
2305 *               Handle \P and \p                 *
2306 *************************************************/
2307 
2308 /* This function is called after \P or \p has been encountered, provided that
2309 PCRE2 is compiled with support for UTF and Unicode properties. On entry, the
2310 contents of ptrptr are pointing at the P or p. On exit, it is left pointing at
2311 the final code unit of the escape sequence.
2312 
2313 Arguments:
2314   ptrptr         the pattern position pointer
2315   negptr         a boolean that is set TRUE for negation else FALSE
2316   ptypeptr       an unsigned int that is set to the type value
2317   pdataptr       an unsigned int that is set to the detailed property value
2318   errorcodeptr   the error code variable
2319   cb             the compile data
2320 
2321 Returns:         TRUE if the type value was found, or FALSE for an invalid type
2322 */
2323 
2324 static BOOL
get_ucp(PCRE2_SPTR * ptrptr,BOOL * negptr,unsigned int * ptypeptr,unsigned int * pdataptr,int * errorcodeptr,compile_block * cb)2325 get_ucp(PCRE2_SPTR *ptrptr, BOOL *negptr, unsigned int *ptypeptr,
2326   unsigned int *pdataptr, int *errorcodeptr, compile_block *cb)
2327 {
2328 register PCRE2_UCHAR c;
2329 size_t i, bot, top;
2330 PCRE2_SPTR ptr = *ptrptr;
2331 PCRE2_UCHAR name[32];
2332 
2333 *negptr = FALSE;
2334 c = *(++ptr);
2335 
2336 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
2337 negation. */
2338 
2339 if (c == CHAR_LEFT_CURLY_BRACKET)
2340   {
2341   if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
2342     {
2343     *negptr = TRUE;
2344     ptr++;
2345     }
2346   for (i = 0; i < (int)(sizeof(name) / sizeof(PCRE2_UCHAR)) - 1; i++)
2347     {
2348     c = *(++ptr);
2349     if (c == CHAR_NULL) goto ERROR_RETURN;
2350     if (c == CHAR_RIGHT_CURLY_BRACKET) break;
2351     name[i] = c;
2352     }
2353   if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
2354   name[i] = 0;
2355   }
2356 
2357 /* Otherwise there is just one following character, which must be an ASCII
2358 letter. */
2359 
2360 else if (MAX_255(c) && (cb->ctypes[c] & ctype_letter) != 0)
2361   {
2362   name[0] = c;
2363   name[1] = 0;
2364   }
2365 else goto ERROR_RETURN;
2366 
2367 *ptrptr = ptr;
2368 
2369 /* Search for a recognized property name using binary chop. */
2370 
2371 bot = 0;
2372 top = PRIV(utt_size);
2373 
2374 while (bot < top)
2375   {
2376   int r;
2377   i = (bot + top) >> 1;
2378   r = PRIV(strcmp_c8)(name, PRIV(utt_names) + PRIV(utt)[i].name_offset);
2379   if (r == 0)
2380     {
2381     *ptypeptr = PRIV(utt)[i].type;
2382     *pdataptr = PRIV(utt)[i].value;
2383     return TRUE;
2384     }
2385   if (r > 0) bot = i + 1; else top = i;
2386   }
2387 *errorcodeptr = ERR47;   /* Unrecognized name */
2388 return FALSE;
2389 
2390 ERROR_RETURN:            /* Malformed \P or \p */
2391 *errorcodeptr = ERR46;
2392 *ptrptr = ptr;
2393 return FALSE;
2394 }
2395 #endif
2396 
2397 
2398 
2399 /*************************************************
2400 *         Read repeat counts                     *
2401 *************************************************/
2402 
2403 /* Read an item of the form {n,m} and return the values. This is called only
2404 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
2405 so the syntax is guaranteed to be correct, but we need to check the values.
2406 
2407 Arguments:
2408   p              pointer to first char after '{'
2409   minp           pointer to int for min
2410   maxp           pointer to int for max
2411                  returned as -1 if no max
2412   errorcodeptr   points to error code variable
2413 
2414 Returns:         pointer to '}' on success;
2415                  current ptr on error, with errorcodeptr set non-zero
2416 */
2417 
2418 static PCRE2_SPTR
read_repeat_counts(PCRE2_SPTR p,int * minp,int * maxp,int * errorcodeptr)2419 read_repeat_counts(PCRE2_SPTR p, int *minp, int *maxp, int *errorcodeptr)
2420 {
2421 int min = 0;
2422 int max = -1;
2423 
2424 while (IS_DIGIT(*p))
2425   {
2426   min = min * 10 + (int)(*p++ - CHAR_0);
2427   if (min > 65535)
2428     {
2429     *errorcodeptr = ERR5;
2430     return p;
2431     }
2432   }
2433 
2434 if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
2435   {
2436   if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
2437     {
2438     max = 0;
2439     while(IS_DIGIT(*p))
2440       {
2441       max = max * 10 + (int)(*p++ - CHAR_0);
2442       if (max > 65535)
2443         {
2444         *errorcodeptr = ERR5;
2445         return p;
2446         }
2447       }
2448     if (max < min)
2449       {
2450       *errorcodeptr = ERR4;
2451       return p;
2452       }
2453     }
2454   }
2455 
2456 *minp = min;
2457 *maxp = max;
2458 return p;
2459 }
2460 
2461 
2462 
2463 /*************************************************
2464 *   Scan compiled regex for recursion reference  *
2465 *************************************************/
2466 
2467 /* This function scans through a compiled pattern until it finds an instance of
2468 OP_RECURSE.
2469 
2470 Arguments:
2471   code        points to start of expression
2472   utf         TRUE in UTF mode
2473 
2474 Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found
2475 */
2476 
2477 static PCRE2_SPTR
find_recurse(PCRE2_SPTR code,BOOL utf)2478 find_recurse(PCRE2_SPTR code, BOOL utf)
2479 {
2480 for (;;)
2481   {
2482   register PCRE2_UCHAR c = *code;
2483   if (c == OP_END) return NULL;
2484   if (c == OP_RECURSE) return code;
2485 
2486   /* XCLASS is used for classes that cannot be represented just by a bit map.
2487   This includes negated single high-valued characters. CALLOUT_STR is used for
2488   callouts with string arguments. In both cases the length in the table is
2489   zero; the actual length is stored in the compiled code. */
2490 
2491   if (c == OP_XCLASS) code += GET(code, 1);
2492     else if (c == OP_CALLOUT_STR) code += GET(code, 1 + 2*LINK_SIZE);
2493 
2494   /* Otherwise, we can get the item's length from the table, except that for
2495   repeated character types, we have to test for \p and \P, which have an extra
2496   two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
2497   must add in its length. */
2498 
2499   else
2500     {
2501     switch(c)
2502       {
2503       case OP_TYPESTAR:
2504       case OP_TYPEMINSTAR:
2505       case OP_TYPEPLUS:
2506       case OP_TYPEMINPLUS:
2507       case OP_TYPEQUERY:
2508       case OP_TYPEMINQUERY:
2509       case OP_TYPEPOSSTAR:
2510       case OP_TYPEPOSPLUS:
2511       case OP_TYPEPOSQUERY:
2512       if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2513       break;
2514 
2515       case OP_TYPEPOSUPTO:
2516       case OP_TYPEUPTO:
2517       case OP_TYPEMINUPTO:
2518       case OP_TYPEEXACT:
2519       if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
2520         code += 2;
2521       break;
2522 
2523       case OP_MARK:
2524       case OP_PRUNE_ARG:
2525       case OP_SKIP_ARG:
2526       case OP_THEN_ARG:
2527       code += code[1];
2528       break;
2529       }
2530 
2531     /* Add in the fixed length from the table */
2532 
2533     code += PRIV(OP_lengths)[c];
2534 
2535     /* In UTF-8 and UTF-16 modes, opcodes that are followed by a character may
2536     be followed by a multi-unit character. The length in the table is a
2537     minimum, so we have to arrange to skip the extra units. */
2538 
2539 #ifdef MAYBE_UTF_MULTI
2540     if (utf) switch(c)
2541       {
2542       case OP_CHAR:
2543       case OP_CHARI:
2544       case OP_NOT:
2545       case OP_NOTI:
2546       case OP_EXACT:
2547       case OP_EXACTI:
2548       case OP_NOTEXACT:
2549       case OP_NOTEXACTI:
2550       case OP_UPTO:
2551       case OP_UPTOI:
2552       case OP_NOTUPTO:
2553       case OP_NOTUPTOI:
2554       case OP_MINUPTO:
2555       case OP_MINUPTOI:
2556       case OP_NOTMINUPTO:
2557       case OP_NOTMINUPTOI:
2558       case OP_POSUPTO:
2559       case OP_POSUPTOI:
2560       case OP_NOTPOSUPTO:
2561       case OP_NOTPOSUPTOI:
2562       case OP_STAR:
2563       case OP_STARI:
2564       case OP_NOTSTAR:
2565       case OP_NOTSTARI:
2566       case OP_MINSTAR:
2567       case OP_MINSTARI:
2568       case OP_NOTMINSTAR:
2569       case OP_NOTMINSTARI:
2570       case OP_POSSTAR:
2571       case OP_POSSTARI:
2572       case OP_NOTPOSSTAR:
2573       case OP_NOTPOSSTARI:
2574       case OP_PLUS:
2575       case OP_PLUSI:
2576       case OP_NOTPLUS:
2577       case OP_NOTPLUSI:
2578       case OP_MINPLUS:
2579       case OP_MINPLUSI:
2580       case OP_NOTMINPLUS:
2581       case OP_NOTMINPLUSI:
2582       case OP_POSPLUS:
2583       case OP_POSPLUSI:
2584       case OP_NOTPOSPLUS:
2585       case OP_NOTPOSPLUSI:
2586       case OP_QUERY:
2587       case OP_QUERYI:
2588       case OP_NOTQUERY:
2589       case OP_NOTQUERYI:
2590       case OP_MINQUERY:
2591       case OP_MINQUERYI:
2592       case OP_NOTMINQUERY:
2593       case OP_NOTMINQUERYI:
2594       case OP_POSQUERY:
2595       case OP_POSQUERYI:
2596       case OP_NOTPOSQUERY:
2597       case OP_NOTPOSQUERYI:
2598       if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
2599       break;
2600       }
2601 #else
2602     (void)(utf);  /* Keep compiler happy by referencing function argument */
2603 #endif  /* MAYBE_UTF_MULTI */
2604     }
2605   }
2606 }
2607 
2608 
2609 
2610 /*************************************************
2611 *           Check for POSIX class syntax         *
2612 *************************************************/
2613 
2614 /* This function is called when the sequence "[:" or "[." or "[=" is
2615 encountered in a character class. It checks whether this is followed by a
2616 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
2617 reach an unescaped ']' without the special preceding character, return FALSE.
2618 
2619 Originally, this function only recognized a sequence of letters between the
2620 terminators, but it seems that Perl recognizes any sequence of characters,
2621 though of course unknown POSIX names are subsequently rejected. Perl gives an
2622 "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
2623 didn't consider this to be a POSIX class. Likewise for [:1234:].
2624 
2625 The problem in trying to be exactly like Perl is in the handling of escapes. We
2626 have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
2627 class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
2628 below handles the special cases \\ and \], but does not try to do any other
2629 escape processing. This makes it different from Perl for cases such as
2630 [:l\ower:] where Perl recognizes it as the POSIX class "lower" but PCRE does
2631 not recognize "l\ower". This is a lesser evil than not diagnosing bad classes
2632 when Perl does, I think.
2633 
2634 A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
2635 It seems that the appearance of a nested POSIX class supersedes an apparent
2636 external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
2637 a digit. This is handled by returning FALSE if the start of a new group with
2638 the same terminator is encountered, since the next closing sequence must close
2639 the nested group, not the outer one.
2640 
2641 In Perl, unescaped square brackets may also appear as part of class names. For
2642 example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
2643 [:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
2644 seem right at all. PCRE does not allow closing square brackets in POSIX class
2645 names.
2646 
2647 Arguments:
2648   ptr      pointer to the initial [
2649   endptr   where to return a pointer to the terminating ':', '.', or '='
2650 
2651 Returns:   TRUE or FALSE
2652 */
2653 
2654 static BOOL
check_posix_syntax(PCRE2_SPTR ptr,PCRE2_SPTR * endptr)2655 check_posix_syntax(PCRE2_SPTR ptr, PCRE2_SPTR *endptr)
2656 {
2657 PCRE2_UCHAR terminator;  /* Don't combine these lines; the Solaris cc */
2658 terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
2659 
2660 for (++ptr; *ptr != CHAR_NULL; ptr++)
2661   {
2662   if (*ptr == CHAR_BACKSLASH &&
2663       (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET || ptr[1] == CHAR_BACKSLASH))
2664     ptr++;
2665   else if ((*ptr == CHAR_LEFT_SQUARE_BRACKET && ptr[1] == terminator) ||
2666             *ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
2667   else if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2668     {
2669     *endptr = ptr;
2670     return TRUE;
2671     }
2672   }
2673 
2674 return FALSE;
2675 }
2676 
2677 
2678 
2679 /*************************************************
2680 *          Check POSIX class name                *
2681 *************************************************/
2682 
2683 /* This function is called to check the name given in a POSIX-style class entry
2684 such as [:alnum:].
2685 
2686 Arguments:
2687   ptr        points to the first letter
2688   len        the length of the name
2689 
2690 Returns:     a value representing the name, or -1 if unknown
2691 */
2692 
2693 static int
check_posix_name(PCRE2_SPTR ptr,int len)2694 check_posix_name(PCRE2_SPTR ptr, int len)
2695 {
2696 const char *pn = posix_names;
2697 register int yield = 0;
2698 while (posix_name_lengths[yield] != 0)
2699   {
2700   if (len == posix_name_lengths[yield] &&
2701     PRIV(strncmp_c8)(ptr, pn, (unsigned int)len) == 0) return yield;
2702   pn += posix_name_lengths[yield] + 1;
2703   yield++;
2704   }
2705 return -1;
2706 }
2707 
2708 
2709 
2710 #ifdef SUPPORT_UNICODE
2711 /*************************************************
2712 *           Get othercase range                  *
2713 *************************************************/
2714 
/* This function is passed the start and end of a class range in UTF mode. It
searches up the characters, looking for ranges of characters in the "other"
case. Each call returns the next one, updating the start address. A character
with multiple other cases is returned on its own with a special return value.
2719 
2720 Arguments:
2721   cptr        points to starting character value; updated
2722   d           end value
2723   ocptr       where to put start of othercase range
2724   odptr       where to put end of othercase range
2725 
2726 Yield:        -1 when no more
2727                0 when a range is returned
2728               >0 the CASESET offset for char with multiple other cases
2729                 in this case, ocptr contains the original
2730 */
2731 
2732 static int
get_othercase_range(uint32_t * cptr,uint32_t d,uint32_t * ocptr,uint32_t * odptr)2733 get_othercase_range(uint32_t *cptr, uint32_t d, uint32_t *ocptr,
2734   uint32_t *odptr)
2735 {
2736 uint32_t c, othercase, next;
2737 unsigned int co;
2738 
2739 /* Find the first character that has an other case. If it has multiple other
2740 cases, return its case offset value. */
2741 
2742 for (c = *cptr; c <= d; c++)
2743   {
2744   if ((co = UCD_CASESET(c)) != 0)
2745     {
2746     *ocptr = c++;   /* Character that has the set */
2747     *cptr = c;      /* Rest of input range */
2748     return (int)co;
2749     }
2750   if ((othercase = UCD_OTHERCASE(c)) != c) break;
2751   }
2752 
2753 if (c > d) return -1;  /* Reached end of range */
2754 
2755 /* Found a character that has a single other case. Search for the end of the
2756 range, which is either the end of the input range, or a character that has zero
2757 or more than one other cases. */
2758 
2759 *ocptr = othercase;
2760 next = othercase + 1;
2761 
2762 for (++c; c <= d; c++)
2763   {
2764   if ((co = UCD_CASESET(c)) != 0 || UCD_OTHERCASE(c) != next) break;
2765   next++;
2766   }
2767 
2768 *odptr = next - 1;     /* End of othercase range */
2769 *cptr = c;             /* Rest of input range */
2770 return 0;
2771 }
2772 #endif  /* SUPPORT_UNICODE */
2773 
2774 
2775 
2776 /*************************************************
2777 *        Add a character or range to a class     *
2778 *************************************************/
2779 
2780 /* This function packages up the logic of adding a character or range of
2781 characters to a class. The character values in the arguments will be within the
2782 valid values for the current mode (8-bit, 16-bit, UTF, etc). This function is
2783 mutually recursive with the function immediately below.
2784 
2785 Arguments:
2786   classbits     the bit map for characters < 256
2787   uchardptr     points to the pointer for extra data
2788   options       the options word
2789   cb            compile data
2790   start         start of range character
2791   end           end of range character
2792 
2793 Returns:        the number of < 256 characters added
2794                 the pointer to extra data is updated
2795 */
2796 
2797 static unsigned int
add_to_class(uint8_t * classbits,PCRE2_UCHAR ** uchardptr,uint32_t options,compile_block * cb,uint32_t start,uint32_t end)2798 add_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, uint32_t options,
2799   compile_block *cb, uint32_t start, uint32_t end)
2800 {
2801 uint32_t c;
2802 uint32_t classbits_end = (end <= 0xff ? end : 0xff);
2803 unsigned int n8 = 0;
2804 
2805 /* If caseless matching is required, scan the range and process alternate
2806 cases. In Unicode, there are 8-bit characters that have alternate cases that
2807 are greater than 255 and vice-versa. Sometimes we can just extend the original
2808 range. */
2809 
2810 if ((options & PCRE2_CASELESS) != 0)
2811   {
2812 #ifdef SUPPORT_UNICODE
2813   if ((options & PCRE2_UTF) != 0)
2814     {
2815     int rc;
2816     uint32_t oc, od;
2817 
2818     options &= ~PCRE2_CASELESS;   /* Remove for recursive calls */
2819     c = start;
2820 
2821     while ((rc = get_othercase_range(&c, end, &oc, &od)) >= 0)
2822       {
2823       /* Handle a single character that has more than one other case. */
2824 
2825       if (rc > 0) n8 += add_list_to_class(classbits, uchardptr, options, cb,
2826         PRIV(ucd_caseless_sets) + rc, oc);
2827 
2828       /* Do nothing if the other case range is within the original range. */
2829 
2830       else if (oc >= start && od <= end) continue;
2831 
2832       /* Extend the original range if there is overlap, noting that if oc < c, we
2833       can't have od > end because a subrange is always shorter than the basic
2834       range. Otherwise, use a recursive call to add the additional range. */
2835 
2836       else if (oc < start && od >= start - 1) start = oc; /* Extend downwards */
2837       else if (od > end && oc <= end + 1)
2838         {
2839         end = od;       /* Extend upwards */
2840         if (end > classbits_end) classbits_end = (end <= 0xff ? end : 0xff);
2841         }
2842       else n8 += add_to_class(classbits, uchardptr, options, cb, oc, od);
2843       }
2844     }
2845   else
2846 #endif  /* SUPPORT_UNICODE */
2847 
2848   /* Not UTF mode */
2849 
2850   for (c = start; c <= classbits_end; c++)
2851     {
2852     SETBIT(classbits, cb->fcc[c]);
2853     n8++;
2854     }
2855   }
2856 
2857 /* Now handle the original range. Adjust the final value according to the bit
2858 length - this means that the same lists of (e.g.) horizontal spaces can be used
2859 in all cases. */
2860 
2861 if ((options & PCRE2_UTF) == 0 && end > MAX_NON_UTF_CHAR)
2862   end = MAX_NON_UTF_CHAR;
2863 
2864 /* Use the bitmap for characters < 256. Otherwise use extra data.*/
2865 
2866 for (c = start; c <= classbits_end; c++)
2867   {
2868   /* Regardless of start, c will always be <= 255. */
2869   SETBIT(classbits, c);
2870   n8++;
2871   }
2872 
2873 #ifdef SUPPORT_WIDE_CHARS
2874 if (start <= 0xff) start = 0xff + 1;
2875 
2876 if (end >= start)
2877   {
2878   PCRE2_UCHAR *uchardata = *uchardptr;
2879 
2880 #ifdef SUPPORT_UNICODE
2881   if ((options & PCRE2_UTF) != 0)
2882     {
2883     if (start < end)
2884       {
2885       *uchardata++ = XCL_RANGE;
2886       uchardata += PRIV(ord2utf)(start, uchardata);
2887       uchardata += PRIV(ord2utf)(end, uchardata);
2888       }
2889     else if (start == end)
2890       {
2891       *uchardata++ = XCL_SINGLE;
2892       uchardata += PRIV(ord2utf)(start, uchardata);
2893       }
2894     }
2895   else
2896 #endif  /* SUPPORT_UNICODE */
2897 
2898   /* Without UTF support, character values are constrained by the bit length,
2899   and can only be > 256 for 16-bit and 32-bit libraries. */
2900 
2901 #if PCRE2_CODE_UNIT_WIDTH == 8
2902     {}
2903 #else
2904   if (start < end)
2905     {
2906     *uchardata++ = XCL_RANGE;
2907     *uchardata++ = start;
2908     *uchardata++ = end;
2909     }
2910   else if (start == end)
2911     {
2912     *uchardata++ = XCL_SINGLE;
2913     *uchardata++ = start;
2914     }
2915 #endif
2916   *uchardptr = uchardata;   /* Updata extra data pointer */
2917   }
2918 #else
2919   (void)uchardptr;          /* Avoid compiler warning */
2920 #endif /* SUPPORT_WIDE_CHARS */
2921 
2922 return n8;    /* Number of 8-bit characters */
2923 }
2924 
2925 
2926 
2927 /*************************************************
2928 *        Add a list of characters to a class     *
2929 *************************************************/
2930 
2931 /* This function is used for adding a list of case-equivalent characters to a
2932 class, and also for adding a list of horizontal or vertical whitespace. If the
2933 list is in order (which it should be), ranges of characters are detected and
2934 handled appropriately. This function is mutually recursive with the function
2935 above.
2936 
2937 Arguments:
2938   classbits     the bit map for characters < 256
2939   uchardptr     points to the pointer for extra data
2940   options       the options word
2941   cb            contains pointers to tables etc.
2942   p             points to row of 32-bit values, terminated by NOTACHAR
2943   except        character to omit; this is used when adding lists of
2944                   case-equivalent characters to avoid including the one we
2945                   already know about
2946 
2947 Returns:        the number of < 256 characters added
2948                 the pointer to extra data is updated
2949 */
2950 
2951 static unsigned int
add_list_to_class(uint8_t * classbits,PCRE2_UCHAR ** uchardptr,uint32_t options,compile_block * cb,const uint32_t * p,unsigned int except)2952 add_list_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, uint32_t options,
2953   compile_block *cb, const uint32_t *p, unsigned int except)
2954 {
2955 unsigned int n8 = 0;
2956 while (p[0] < NOTACHAR)
2957   {
2958   unsigned int n = 0;
2959   if (p[0] != except)
2960     {
2961     while(p[n+1] == p[0] + n + 1) n++;
2962     n8 += add_to_class(classbits, uchardptr, options, cb, p[0], p[n]);
2963     }
2964   p += n + 1;
2965   }
2966 return n8;
2967 }
2968 
2969 
2970 
2971 /*************************************************
2972 *    Add characters not in a list to a class     *
2973 *************************************************/
2974 
2975 /* This function is used for adding the complement of a list of horizontal or
2976 vertical whitespace to a class. The list must be in order.
2977 
2978 Arguments:
2979   classbits     the bit map for characters < 256
2980   uchardptr     points to the pointer for extra data
2981   options       the options word
2982   cb            contains pointers to tables etc.
2983   p             points to row of 32-bit values, terminated by NOTACHAR
2984 
2985 Returns:        the number of < 256 characters added
2986                 the pointer to extra data is updated
2987 */
2988 
2989 static unsigned int
add_not_list_to_class(uint8_t * classbits,PCRE2_UCHAR ** uchardptr,uint32_t options,compile_block * cb,const uint32_t * p)2990 add_not_list_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr,
2991   uint32_t options, compile_block *cb, const uint32_t *p)
2992 {
2993 BOOL utf = (options & PCRE2_UTF) != 0;
2994 unsigned int n8 = 0;
2995 if (p[0] > 0)
2996   n8 += add_to_class(classbits, uchardptr, options, cb, 0, p[0] - 1);
2997 while (p[0] < NOTACHAR)
2998   {
2999   while (p[1] == p[0] + 1) p++;
3000   n8 += add_to_class(classbits, uchardptr, options, cb, p[0] + 1,
3001     (p[1] == NOTACHAR) ? (utf ? 0x10ffffu : 0xffffffffu) : p[1] - 1);
3002   p++;
3003   }
3004 return n8;
3005 }
3006 
3007 
3008 
3009 /*************************************************
3010 *       Process (*VERB) name for escapes         *
3011 *************************************************/
3012 
3013 /* This function is called when the PCRE2_ALT_VERBNAMES option is set, to
3014 process the characters in a verb's name argument. It is called twice, once with
3015 codeptr == NULL, to find out the length of the processed name, and again to put
3016 the name into memory.
3017 
3018 Arguments:
3019   ptrptr        pointer to the input pointer
3020   codeptr       pointer to the compiled code pointer
3021   errorcodeptr  pointer to the error code
3022   options       the options bits
3023   utf           TRUE if processing UTF
3024   cb            compile data block
3025 
3026 Returns:        length of the processed name, or < 0 on error
3027 */
3028 
3029 static int
process_verb_name(PCRE2_SPTR * ptrptr,PCRE2_UCHAR ** codeptr,int * errorcodeptr,uint32_t options,BOOL utf,compile_block * cb)3030 process_verb_name(PCRE2_SPTR *ptrptr, PCRE2_UCHAR **codeptr, int *errorcodeptr,
3031   uint32_t options, BOOL utf, compile_block *cb)
3032 {
3033 int32_t arglen = 0;
3034 BOOL inescq = FALSE;
3035 PCRE2_SPTR ptr = *ptrptr;
3036 PCRE2_UCHAR *code = (codeptr == NULL)? NULL : *codeptr;
3037 
3038 for (; ptr < cb->end_pattern; ptr++)
3039   {
3040   uint32_t x = *ptr;
3041 
3042   /* Skip over literals */
3043 
3044   if (inescq)
3045     {
3046     if (x == CHAR_BACKSLASH && ptr[1] == CHAR_E)
3047       {
3048       inescq = FALSE;
3049       ptr++;;
3050       continue;
3051       }
3052     }
3053 
3054   else  /* Not a literal character */
3055     {
3056     if (x == CHAR_RIGHT_PARENTHESIS) break;
3057 
3058     /* Skip over comments and whitespace in extended mode. */
3059 
3060     if ((options & PCRE2_EXTENDED) != 0)
3061       {
3062       PCRE2_SPTR wscptr = ptr;
3063       while (MAX_255(x) && (cb->ctypes[x] & ctype_space) != 0) x = *(++ptr);
3064       if (x == CHAR_NUMBER_SIGN)
3065         {
3066         ptr++;
3067         while (*ptr != CHAR_NULL || ptr < cb->end_pattern)
3068           {
3069           if (IS_NEWLINE(ptr))       /* For non-fixed-length newline cases, */
3070             {                        /* IS_NEWLINE sets cb->nllen. */
3071             ptr += cb->nllen;
3072             break;
3073             }
3074           ptr++;
3075 #ifdef SUPPORT_UNICODE
3076           if (utf) FORWARDCHAR(ptr);
3077 #endif
3078           }
3079         }
3080 
3081       /* If we have skipped any characters, restart the loop. */
3082 
3083       if (ptr > wscptr)
3084         {
3085         ptr--;
3086         continue;
3087         }
3088       }
3089 
3090     /* Process escapes */
3091 
3092     if (x == '\\')
3093       {
3094       int rc;
3095       *errorcodeptr = 0;
3096       rc = PRIV(check_escape)(&ptr, cb->end_pattern, &x, errorcodeptr, options,
3097         FALSE, cb);
3098       *ptrptr = ptr;   /* For possible error */
3099       if (*errorcodeptr != 0) return -1;
3100       if (rc != 0)
3101         {
3102         if (rc == ESC_Q)
3103           {
3104           inescq = TRUE;
3105           continue;
3106           }
3107         if (rc == ESC_E) continue;
3108         *errorcodeptr = ERR40;
3109         return -1;
3110         }
3111       }
3112     }
3113 
3114   /* We have the next character in the name. */
3115 
3116 #ifdef SUPPORT_UNICODE
3117   if (utf)
3118     {
3119     if (code == NULL)   /* Just want the length */
3120       {
3121 #if PCRE2_CODE_UNIT_WIDTH == 8
3122       int i;
3123       for (i = 0; i < PRIV(utf8_table1_size); i++)
3124         if ((int)x <= PRIV(utf8_table1)[i]) break;
3125       arglen += i;
3126 #elif PCRE2_CODE_UNIT_WIDTH == 16
3127       if (x > 0xffff) arglen++;
3128 #endif
3129       }
3130     else
3131       {
3132       PCRE2_UCHAR cbuff[8];
3133       x = PRIV(ord2utf)(x, cbuff);
3134       memcpy(code, cbuff, CU2BYTES(x));
3135       code += x;
3136       }
3137     }
3138   else
3139 #endif  /* SUPPORT_UNICODE */
3140 
3141   /* Not UTF */
3142     {
3143     if (code != NULL) *code++ = (PCRE2_UCHAR)x;
3144     }
3145 
3146   arglen++;
3147 
3148   if ((unsigned int)arglen > MAX_MARK)
3149     {
3150     *errorcodeptr = ERR76;
3151     *ptrptr = ptr;
3152     return -1;
3153     }
3154   }
3155 
3156 /* Update the pointers before returning. */
3157 
3158 *ptrptr = ptr;
3159 if (codeptr != NULL) *codeptr = code;
3160 return arglen;
3161 }
3162 
3163 
3164 
3165 /*************************************************
3166 *          Macro for the next two functions      *
3167 *************************************************/
3168 
3169 /* Both scan_for_captures() and compile_branch() use this macro to generate a
3170 fragment of code that reads the characters of a name and sets its length
3171 (checking for not being too long). Count the characters dynamically, to avoid
3172 the possibility of integer overflow. The same macro is used for reading *VERB
3173 names. */
3174 
/* The second parameter is deliberately not called "errno": that identifier is
the name of a standard macro and must not be reused. The ctype and errcode
arguments are parenthesized in the expansion so that expressions can be passed
safely. The macro relies on ptr, namelen, cb and a FAILED label being in scope
at the point of use. */

#define READ_NAME(ctype, errcode, errset)                    \
  namelen = 0;                                               \
  while (MAX_255(*ptr) && (cb->ctypes[*ptr] & (ctype)) != 0) \
    {                                                        \
    ptr++;                                                   \
    namelen++;                                               \
    if (namelen > MAX_NAME_SIZE)                             \
      {                                                      \
      errset = (errcode);                                    \
      goto FAILED;                                           \
      }                                                      \
    }
3187 
3188 
3189 
3190 /*************************************************
3191 *      Scan regex to identify named groups       *
3192 *************************************************/
3193 
3194 /* This function is called first of all, to scan for named capturing groups so
3195 that information about them is fully available to both the compiling scans.
3196 It skips over everything except parenthesized items.
3197 
3198 Arguments:
3199   ptrptr   points to pointer to the start of the pattern
3200   options  compiling dynamic options
3201   cb       pointer to the compile data block
3202 
3203 Returns:   zero on success or a non-zero error code, with pointer updated
3204 */
3205 
/* One entry on the nest stack that scan_for_captures() keeps in the compile
workspace. An entry is created for each (?| group and each (?imsxJU group. */

typedef struct nest_save {
  uint16_t nest_depth;    /* Parenthesis nesting depth when the entry was made */
  uint16_t reset_group;   /* Set from cb->bracount for (?|, otherwise zero */
  uint16_t max_group;     /* Set from cb->bracount for (?|, otherwise zero */
  uint16_t flags;         /* Combination of the NSF_ bits below */
} nest_save;

/* Bits for nest_save.flags */

#define NSF_RESET    (1u << 0)   /* Entry is for a (?| group */
#define NSF_EXTENDED (1u << 1)   /* PCRE2_EXTENDED was set when entry was made */
#define NSF_DUPNAMES (1u << 2)   /* PCRE2_DUPNAMES was set when entry was made */
3216 
scan_for_captures(PCRE2_SPTR * ptrptr,uint32_t options,compile_block * cb)3217 static int scan_for_captures(PCRE2_SPTR *ptrptr, uint32_t options,
3218   compile_block *cb)
3219 {
3220 uint32_t c;
3221 uint32_t delimiter;
3222 uint32_t set, unset, *optset;
3223 uint32_t skiptoket = 0;
3224 uint16_t nest_depth = 0;
3225 int errorcode = 0;
3226 int escape;
3227 int namelen;
3228 int i;
3229 BOOL inescq = FALSE;
3230 BOOL isdupname;
3231 BOOL utf = (options & PCRE2_UTF) != 0;
3232 BOOL negate_class;
3233 PCRE2_SPTR name;
3234 PCRE2_SPTR start;
3235 PCRE2_SPTR ptr = *ptrptr;
3236 named_group *ng;
3237 nest_save *top_nest = NULL;
3238 nest_save *end_nests = (nest_save *)(cb->start_workspace + cb->workspace_size);
3239 
3240 /* The size of the nest_save structure might not be a factor of the size of the
3241 workspace. Therefore we must round down end_nests so as to correctly avoid
3242 creating a nest_save that spans the end of the workspace. */
3243 
3244 end_nests = (nest_save *)((char *)end_nests -
3245   ((cb->workspace_size * sizeof(PCRE2_UCHAR)) % sizeof(nest_save)));
3246 
3247 /* Now scan the pattern */
3248 
3249 for (; ptr < cb->end_pattern; ptr++)
3250   {
3251   c = *ptr;
3252 
3253   /* Parenthesized groups set skiptoket when all following characters up to the
3254   next closing parenthesis must be ignored. The parenthesis itself must be
3255   processed (to end the nested parenthesized item). */
3256 
3257   if (skiptoket != 0)
3258     {
3259     if (c != CHAR_RIGHT_PARENTHESIS) continue;
3260     skiptoket = 0;
3261     }
3262 
3263   /* Skip over literals */
3264 
3265   if (inescq)
3266     {
3267     if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
3268       {
3269       inescq = FALSE;
3270       ptr++;
3271       }
3272     continue;
3273     }
3274 
3275   /* Skip over # comments and whitespace in extended mode. */
3276 
3277   if ((options & PCRE2_EXTENDED) != 0)
3278     {
3279     PCRE2_SPTR wscptr = ptr;
3280     while (MAX_255(c) && (cb->ctypes[c] & ctype_space) != 0) c = *(++ptr);
3281     if (c == CHAR_NUMBER_SIGN)
3282       {
3283       ptr++;
3284       while (ptr < cb->end_pattern)
3285         {
3286         if (IS_NEWLINE(ptr))         /* For non-fixed-length newline cases, */
3287           {                          /* IS_NEWLINE sets cb->nllen. */
3288           ptr += cb->nllen;
3289           break;
3290           }
3291         ptr++;
3292 #ifdef SUPPORT_UNICODE
3293         if (utf) FORWARDCHAR(ptr);
3294 #endif
3295         }
3296       }
3297 
3298     /* If we skipped any characters, restart the loop. Otherwise, we didn't see
3299     a comment. */
3300 
3301     if (ptr > wscptr)
3302       {
3303       ptr--;
3304       continue;
3305       }
3306     }
3307 
3308   /* Process the next pattern item. */
3309 
3310   switch(c)
3311     {
3312     default:              /* Most characters are just skipped */
3313     break;
3314 
3315     /* Skip escapes except for \Q */
3316 
3317     case CHAR_BACKSLASH:
3318     errorcode = 0;
3319     escape = PRIV(check_escape)(&ptr, cb->end_pattern, &c, &errorcode, options,
3320       FALSE, cb);
3321     if (errorcode != 0) goto FAILED;
3322     if (escape == ESC_Q) inescq = TRUE;
3323     break;
3324 
3325     /* Skip a character class. The syntax is complicated so we have to
3326     replicate some of what happens when a class is processed for real. */
3327 
3328     case CHAR_LEFT_SQUARE_BRACKET:
3329     if (PRIV(strncmp_c8)(ptr+1, STRING_WEIRD_STARTWORD, 6) == 0 ||
3330         PRIV(strncmp_c8)(ptr+1, STRING_WEIRD_ENDWORD, 6) == 0)
3331       {
3332       ptr += 6;
3333       break;
3334       }
3335 
3336     /* If the first character is '^', set the negation flag (not actually used
3337     here, except to recognize only one ^) and skip it. If the first few
3338     characters (either before or after ^) are \Q\E or \E we skip them too. This
3339     makes for compatibility with Perl. */
3340 
3341     negate_class = FALSE;
3342     for (;;)
3343       {
3344       c = *(++ptr);   /* First character in class */
3345       if (c == CHAR_BACKSLASH)
3346         {
3347         if (ptr[1] == CHAR_E)
3348           ptr++;
3349         else if (PRIV(strncmp_c8)(ptr + 1, STR_Q STR_BACKSLASH STR_E, 3) == 0)
3350           ptr += 3;
3351         else
3352           break;
3353         }
3354       else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
3355         negate_class = TRUE;
3356       else break;
3357       }
3358 
3359     if (c == CHAR_RIGHT_SQUARE_BRACKET &&
3360         (cb->external_options & PCRE2_ALLOW_EMPTY_CLASS) != 0)
3361       break;
3362 
3363     /* Loop for the contents of the class */
3364 
3365     for (;;)
3366       {
3367       PCRE2_SPTR tempptr;
3368 
3369       if (c == CHAR_NULL && ptr >= cb->end_pattern)
3370         {
3371         errorcode = ERR6;  /* Missing terminating ']' */
3372         goto FAILED;
3373         }
3374 
3375 #ifdef SUPPORT_UNICODE
3376       if (utf && HAS_EXTRALEN(c))
3377         {                           /* Braces are required because the */
3378         GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
3379         }
3380 #endif
3381 
3382       /* Inside \Q...\E everything is literal except \E */
3383 
3384       if (inescq)
3385         {
3386         if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)  /* If we are at \E */
3387           {
3388           inescq = FALSE;                   /* Reset literal state */
3389           ptr++;                            /* Skip the 'E' */
3390           }
3391         goto CONTINUE_CLASS;
3392         }
3393 
3394       /* Skip POSIX class names. */
3395       if (c == CHAR_LEFT_SQUARE_BRACKET &&
3396           (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3397            ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
3398         {
3399         ptr = tempptr + 1;
3400         }
3401       else if (c == CHAR_BACKSLASH)
3402         {
3403         errorcode = 0;
3404         escape = PRIV(check_escape)(&ptr, cb->end_pattern, &c, &errorcode,
3405           options, TRUE, cb);
3406         if (errorcode != 0) goto FAILED;
3407         if (escape == ESC_Q) inescq = TRUE;
3408         }
3409 
3410       CONTINUE_CLASS:
3411       c = *(++ptr);
3412       if (c == CHAR_RIGHT_SQUARE_BRACKET && !inescq) break;
3413       }     /* End of class-processing loop */
3414     break;
3415 
3416     /* This is the real work of this function - handling parentheses. */
3417 
3418     case CHAR_LEFT_PARENTHESIS:
3419     nest_depth++;
3420 
3421     if (ptr[1] != CHAR_QUESTION_MARK)
3422       {
3423       if (ptr[1] != CHAR_ASTERISK)
3424         {
3425         if ((options & PCRE2_NO_AUTO_CAPTURE) == 0) cb->bracount++;
3426         }
3427 
3428       /* (*something) - skip over a name, and then just skip to closing ket
3429       unless PCRE2_ALT_VERBNAMES is set, in which case we have to process
3430       escapes in the string after a verb name terminated by a colon. */
3431 
3432       else
3433         {
3434         ptr += 2;
3435         while (MAX_255(*ptr) && (cb->ctypes[*ptr] & ctype_word) != 0) ptr++;
3436         if (*ptr == CHAR_COLON && (options & PCRE2_ALT_VERBNAMES) != 0)
3437           {
3438           ptr++;
3439           if (process_verb_name(&ptr, NULL, &errorcode, options, utf, cb) < 0)
3440             goto FAILED;
3441           }
3442         else
3443           {
3444           while (ptr < cb->end_pattern && *ptr != CHAR_RIGHT_PARENTHESIS)
3445             ptr++;
3446           }
3447         nest_depth--;
3448         }
3449       }
3450 
3451     /* Handle (?...) groups */
3452 
3453     else switch(ptr[2])
3454       {
3455       default:
3456       ptr += 2;
3457       if (ptr[0] == CHAR_R ||                           /* (?R) */
3458           ptr[0] == CHAR_NUMBER_SIGN ||                 /* (?#) */
3459           IS_DIGIT(ptr[0]) ||                           /* (?n) */
3460           (ptr[0] == CHAR_MINUS && IS_DIGIT(ptr[1])))   /* (?-n) */
3461         {
3462         skiptoket = ptr[0];
3463         break;
3464         }
3465 
3466       /* Handle (?| and (?imsxJU: which are the only other valid forms. Both
3467       need a new block on the nest stack. */
3468 
3469       if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
3470       else if (++top_nest >= end_nests)
3471         {
3472         errorcode = ERR84;
3473         goto FAILED;
3474         }
3475       top_nest->nest_depth = nest_depth;
3476       top_nest->flags = 0;
3477       if ((options & PCRE2_EXTENDED) != 0) top_nest->flags |= NSF_EXTENDED;
3478       if ((options & PCRE2_DUPNAMES) != 0) top_nest->flags |= NSF_DUPNAMES;
3479 
3480       if (*ptr == CHAR_VERTICAL_LINE)
3481         {
3482         top_nest->reset_group = (uint16_t)cb->bracount;
3483         top_nest->max_group = (uint16_t)cb->bracount;
3484         top_nest->flags |= NSF_RESET;
3485         cb->external_flags |= PCRE2_DUPCAPUSED;
3486         break;
3487         }
3488 
3489       /* Scan options */
3490 
3491       top_nest->reset_group = 0;
3492       top_nest->max_group = 0;
3493 
3494       set = unset = 0;
3495       optset = &set;
3496 
3497       /* Need only track (?x: and (?J: at this stage */
3498 
3499       while (*ptr != CHAR_RIGHT_PARENTHESIS && *ptr != CHAR_COLON)
3500         {
3501         switch (*ptr++)
3502           {
3503           case CHAR_MINUS: optset = &unset; break;
3504 
3505           case CHAR_x: *optset |= PCRE2_EXTENDED; break;
3506 
3507           case CHAR_J:
3508           *optset |= PCRE2_DUPNAMES;
3509           cb->external_flags |= PCRE2_JCHANGED;
3510           break;
3511 
3512           case CHAR_i:
3513           case CHAR_m:
3514           case CHAR_s:
3515           case CHAR_U:
3516           break;
3517 
3518           default:
3519           errorcode = ERR11;
3520           ptr--;    /* Correct the offset */
3521           goto FAILED;
3522           }
3523         }
3524 
3525       options = (options | set) & (~unset);
3526 
3527       /* If the options ended with ')' this is not the start of a nested
3528       group with option changes, so the options change at this level. If the
3529       previous level set up a nest block, discard the one we have just created.
3530       Otherwise adjust it for the previous level. */
3531 
3532       if (*ptr == CHAR_RIGHT_PARENTHESIS)
3533         {
3534         nest_depth--;
3535         if (top_nest > (nest_save *)(cb->start_workspace) &&
3536             (top_nest-1)->nest_depth == nest_depth) top_nest --;
3537         else top_nest->nest_depth = nest_depth;
3538         }
3539       break;
3540 
3541       /* Skip over a numerical or string argument for a callout. */
3542 
3543       case CHAR_C:
3544       ptr += 2;
3545       if (ptr[1] == CHAR_RIGHT_PARENTHESIS) break;
3546       if (IS_DIGIT(ptr[1]))
3547         {
3548         while (IS_DIGIT(ptr[1])) ptr++;
3549         }
3550 
3551       /* Handle a string argument */
3552 
3553       else
3554         {
3555         ptr++;
3556         delimiter = 0;
3557         for (i = 0; PRIV(callout_start_delims)[i] != 0; i++)
3558           {
3559           if (*ptr == PRIV(callout_start_delims)[i])
3560             {
3561             delimiter = PRIV(callout_end_delims)[i];
3562             break;
3563             }
3564           }
3565 
3566         if (delimiter == 0)
3567           {
3568           errorcode = ERR82;
3569           goto FAILED;
3570           }
3571 
3572         start = ptr;
3573         do
3574           {
3575           if (++ptr >= cb->end_pattern)
3576             {
3577             errorcode = ERR81;
3578             ptr = start;   /* To give a more useful message */
3579             goto FAILED;
3580             }
3581           if (ptr[0] == delimiter && ptr[1] == delimiter) ptr += 2;
3582           }
3583         while (ptr[0] != delimiter);
3584         }
3585 
3586       /* Check terminating ) */
3587 
3588       if (ptr[1] != CHAR_RIGHT_PARENTHESIS)
3589         {
3590         errorcode = ERR39;
3591         ptr++;
3592         goto FAILED;
3593         }
3594       break;
3595 
3596       /* Conditional group */
3597 
3598       case CHAR_LEFT_PARENTHESIS:
3599       if (ptr[3] != CHAR_QUESTION_MARK)   /* Not assertion or callout */
3600         {
3601         nest_depth++;
3602         ptr += 2;
3603         break;
3604         }
3605 
3606       /* Must be an assertion or a callout */
3607 
3608       switch(ptr[4])
3609        {
3610        case CHAR_LESS_THAN_SIGN:
3611        if (ptr[5] != CHAR_EXCLAMATION_MARK && ptr[5] != CHAR_EQUALS_SIGN)
3612          goto MISSING_ASSERTION;
3613        /* Fall through */
3614 
3615        case CHAR_C:
3616        case CHAR_EXCLAMATION_MARK:
3617        case CHAR_EQUALS_SIGN:
3618        ptr++;
3619        break;
3620 
3621        default:
3622        MISSING_ASSERTION:
3623        ptr += 3;            /* To improve error message */
3624        errorcode = ERR28;
3625        goto FAILED;
3626        }
3627       break;
3628 
3629       case CHAR_COLON:
3630       case CHAR_GREATER_THAN_SIGN:
3631       case CHAR_EQUALS_SIGN:
3632       case CHAR_EXCLAMATION_MARK:
3633       case CHAR_AMPERSAND:
3634       case CHAR_PLUS:
3635       ptr += 2;
3636       break;
3637 
3638       case CHAR_P:
3639       if (ptr[3] != CHAR_LESS_THAN_SIGN)
3640         {
3641         ptr += 3;
3642         break;
3643         }
3644       ptr++;
3645       c = CHAR_GREATER_THAN_SIGN;   /* Terminator */
3646       goto DEFINE_NAME;
3647 
3648       case CHAR_LESS_THAN_SIGN:
3649       if (ptr[3] == CHAR_EQUALS_SIGN || ptr[3] == CHAR_EXCLAMATION_MARK)
3650         {
3651         ptr += 3;
3652         break;
3653         }
3654       c = CHAR_GREATER_THAN_SIGN;   /* Terminator */
3655       goto DEFINE_NAME;
3656 
3657       case CHAR_APOSTROPHE:
3658       c = CHAR_APOSTROPHE;    /* Terminator */
3659 
3660       DEFINE_NAME:
3661       name = ptr = ptr + 3;
3662 
3663       if (*ptr == c)          /* Empty name */
3664         {
3665         errorcode = ERR62;
3666         goto FAILED;
3667         }
3668 
3669       if (IS_DIGIT(*ptr))
3670         {
3671         errorcode = ERR44;   /* Group name must start with non-digit */
3672         goto FAILED;
3673         }
3674 
3675       if (MAX_255(*ptr) && (cb->ctypes[*ptr] & ctype_word) == 0)
3676         {
3677         errorcode = ERR24;
3678         goto FAILED;
3679         }
3680 
3681       /* Advance ptr, set namelen and check its length. */
3682       READ_NAME(ctype_word, ERR48, errorcode);
3683 
3684       if (*ptr != c)
3685         {
3686         errorcode = ERR42;
3687         goto FAILED;
3688         }
3689 
3690       if (cb->names_found >= MAX_NAME_COUNT)
3691         {
3692         errorcode = ERR49;
3693         goto FAILED;
3694         }
3695 
3696       if (namelen + IMM2_SIZE + 1 > cb->name_entry_size)
3697         cb->name_entry_size = (uint16_t)(namelen + IMM2_SIZE + 1);
3698 
3699       /* We have a valid name for this capturing group. */
3700 
3701       cb->bracount++;
3702 
3703       /* Scan the list to check for duplicates. For duplicate names, if the
3704       number is the same, break the loop, which causes the name to be
3705       discarded; otherwise, if DUPNAMES is not set, give an error.
3706       If it is set, allow the name with a different number, but continue
3707       scanning in case this is a duplicate with the same number. For
3708       non-duplicate names, give an error if the number is duplicated. */
3709 
3710       isdupname = FALSE;
3711       ng = cb->named_groups;
3712       for (i = 0; i < cb->names_found; i++, ng++)
3713         {
3714         if (namelen == ng->length &&
3715             PRIV(strncmp)(name, ng->name, (size_t)namelen) == 0)
3716           {
3717           if (ng->number == cb->bracount) break;
3718           if ((options & PCRE2_DUPNAMES) == 0)
3719             {
3720             errorcode = ERR43;
3721             goto FAILED;
3722             }
3723           isdupname = ng->isdup = TRUE;     /* Mark as a duplicate */
3724           cb->dupnames = TRUE;              /* Duplicate names exist */
3725           }
3726         else if (ng->number == cb->bracount)
3727           {
3728           errorcode = ERR65;
3729           goto FAILED;
3730           }
3731         }
3732 
3733       if (i < cb->names_found) break;   /* Ignore duplicate with same number */
3734 
3735       /* Increase the list size if necessary */
3736 
3737       if (cb->names_found >= cb->named_group_list_size)
3738         {
3739         uint32_t newsize = cb->named_group_list_size * 2;
3740         named_group *newspace =
3741           cb->cx->memctl.malloc(newsize * sizeof(named_group),
3742           cb->cx->memctl.memory_data);
3743         if (newspace == NULL)
3744           {
3745           errorcode = ERR21;
3746           goto FAILED;
3747           }
3748 
3749         memcpy(newspace, cb->named_groups,
3750           cb->named_group_list_size * sizeof(named_group));
3751         if (cb->named_group_list_size > NAMED_GROUP_LIST_SIZE)
3752           cb->cx->memctl.free((void *)cb->named_groups,
3753           cb->cx->memctl.memory_data);
3754         cb->named_groups = newspace;
3755         cb->named_group_list_size = newsize;
3756         }
3757 
3758       /* Add this name to the list */
3759 
3760       cb->named_groups[cb->names_found].name = name;
3761       cb->named_groups[cb->names_found].length = (uint16_t)namelen;
3762       cb->named_groups[cb->names_found].number = cb->bracount;
3763       cb->named_groups[cb->names_found].isdup = (uint16_t)isdupname;
3764       cb->names_found++;
3765       break;
3766       }        /* End of (? switch */
3767     break;     /* End of ( handling */
3768 
3769     /* At an alternation, reset the capture count if we are in a (?| group. */
3770 
3771     case CHAR_VERTICAL_LINE:
3772     if (top_nest != NULL && top_nest->nest_depth == nest_depth &&
3773         (top_nest->flags & NSF_RESET) != 0)
3774       {
3775       if (cb->bracount > top_nest->max_group)
3776         top_nest->max_group = (uint16_t)cb->bracount;
3777       cb->bracount = top_nest->reset_group;
3778       }
3779     break;
3780 
3781     /* At a right parenthesis, reset the capture count to the maximum if we
3782     are in a (?| group and/or reset the extended option. */
3783 
3784     case CHAR_RIGHT_PARENTHESIS:
3785     if (top_nest != NULL && top_nest->nest_depth == nest_depth)
3786       {
3787       if ((top_nest->flags & NSF_RESET) != 0 &&
3788           top_nest->max_group > cb->bracount)
3789         cb->bracount = top_nest->max_group;
3790       if ((top_nest->flags & NSF_EXTENDED) != 0) options |= PCRE2_EXTENDED;
3791         else options &= ~PCRE2_EXTENDED;
3792       if ((top_nest->flags & NSF_DUPNAMES) != 0) options |= PCRE2_DUPNAMES;
3793         else options &= ~PCRE2_DUPNAMES;
3794       if (top_nest == (nest_save *)(cb->start_workspace)) top_nest = NULL;
3795         else top_nest--;
3796       }
3797     if (nest_depth == 0)    /* Unmatched closing parenthesis */
3798       {
3799       errorcode = ERR22;
3800       goto FAILED;
3801       }
3802     nest_depth--;
3803     break;
3804     }
3805   }
3806 
3807 if (nest_depth == 0)
3808   {
3809   cb->final_bracount = cb->bracount;
3810   return 0;
3811   }
3812 
3813 /* We give a special error for a missing closing parenthesis after (?# because
3814 it might otherwise be hard to see where the missing character is. */
3815 
3816 errorcode = (skiptoket == CHAR_NUMBER_SIGN)? ERR18 : ERR14;
3817 
3818 FAILED:
3819 *ptrptr = ptr;
3820 return errorcode;
3821 }
3822 
3823 
3824 
3825 /*************************************************
3826 *           Compile one branch                   *
3827 *************************************************/
3828 
3829 /* Scan the pattern, compiling it into a vector. If the options are
3830 changed during the branch, the pointer is used to change the external options
3831 bits. This function is used during the pre-compile phase when we are trying
3832 to find out the amount of memory needed, as well as during the real compile
3833 phase. The value of lengthptr distinguishes the two phases.
3834 
3835 Arguments:
3836   optionsptr        pointer to the option bits
3837   codeptr           points to the pointer to the current code point
3838   ptrptr            points to the current pattern pointer
3839   errorcodeptr      points to error code variable
3840   firstcuptr        place to put the first required code unit
3841   firstcuflagsptr   place to put the first code unit flags, or a negative number
3842   reqcuptr          place to put the last required code unit
3843   reqcuflagsptr     place to put the last required code unit flags, or a negative number
3844   bcptr             points to current branch chain
3845   cond_depth        conditional nesting depth
3846   cb                contains pointers to tables etc.
3847   lengthptr         NULL during the real compile phase
3848                     points to length accumulator during pre-compile phase
3849 
3850 Returns:            TRUE on success
3851                     FALSE, with *errorcodeptr set non-zero on error
3852 */
3853 
3854 static BOOL
compile_branch(uint32_t * optionsptr,PCRE2_UCHAR ** codeptr,PCRE2_SPTR * ptrptr,int * errorcodeptr,uint32_t * firstcuptr,int32_t * firstcuflagsptr,uint32_t * reqcuptr,int32_t * reqcuflagsptr,branch_chain * bcptr,int cond_depth,compile_block * cb,size_t * lengthptr)3855 compile_branch(uint32_t *optionsptr, PCRE2_UCHAR **codeptr,
3856   PCRE2_SPTR *ptrptr, int *errorcodeptr,
3857   uint32_t *firstcuptr, int32_t *firstcuflagsptr,
3858   uint32_t *reqcuptr, int32_t *reqcuflagsptr,
3859   branch_chain *bcptr, int cond_depth,
3860   compile_block *cb, size_t *lengthptr)
3861 {
3862 int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
3863 int bravalue = 0;
3864 uint32_t greedy_default, greedy_non_default;
3865 uint32_t repeat_type, op_type;
3866 uint32_t options = *optionsptr;               /* May change dynamically */
3867 uint32_t firstcu, reqcu;
3868 int32_t firstcuflags, reqcuflags;
3869 uint32_t zeroreqcu, zerofirstcu;
3870 int32_t zeroreqcuflags, zerofirstcuflags;
3871 int32_t req_caseopt, reqvary, tempreqvary;
3872 int after_manual_callout = 0;
3873 int escape;
3874 size_t length_prevgroup = 0;
3875 register uint32_t c;
3876 register PCRE2_UCHAR *code = *codeptr;
3877 PCRE2_UCHAR *last_code = code;
3878 PCRE2_UCHAR *orig_code = code;
3879 PCRE2_UCHAR *tempcode;
3880 BOOL inescq = FALSE;
3881 BOOL groupsetfirstcu = FALSE;
3882 PCRE2_SPTR ptr = *ptrptr;
3883 PCRE2_SPTR tempptr;
3884 PCRE2_UCHAR *previous = NULL;
3885 PCRE2_UCHAR *previous_callout = NULL;
3886 uint8_t classbits[32];
3887 
3888 /* We can fish out the UTF setting once and for all into a BOOL, but we must
3889 not do this for other options (e.g. PCRE2_EXTENDED) because they may change
3890 dynamically as we process the pattern. */
3891 
3892 #ifdef SUPPORT_UNICODE
3893 BOOL utf = (options & PCRE2_UTF) != 0;
3894 #if PCRE2_CODE_UNIT_WIDTH != 32
3895 PCRE2_UCHAR utf_units[6];      /* For setting up multi-cu chars */
3896 #endif
3897 
3898 #else  /* No UTF support */
3899 BOOL utf = FALSE;
3900 #endif
3901 
3902 /* Helper variables for OP_XCLASS opcode (for characters > 255). We define
3903 class_uchardata always so that it can be passed to add_to_class() always,
3904 though it will not be used in non-UTF 8-bit cases. This avoids having to supply
3905 alternative calls for the different cases. */
3906 
3907 PCRE2_UCHAR *class_uchardata;
3908 #ifdef SUPPORT_WIDE_CHARS
3909 BOOL xclass;
3910 PCRE2_UCHAR *class_uchardata_base;
3911 #endif
3912 
3913 /* Set up the default and non-default settings for greediness */
3914 
3915 greedy_default = ((options & PCRE2_UNGREEDY) != 0);
3916 greedy_non_default = greedy_default ^ 1;
3917 
3918 /* Initialize no first unit, no required unit. REQ_UNSET means "no char
3919 matching encountered yet". It gets changed to REQ_NONE if we hit something that
3920 matches a non-fixed first unit; reqcu just remains unset if we never find one.
3921 
3922 When we hit a repeat whose minimum is zero, we may have to adjust these values
3923 to take the zero repeat into account. This is implemented by setting them to
3924 zerofirstcu and zeroreqcu when such a repeat is encountered. The individual
3925 item types that can be repeated set these backoff variables appropriately. */
3926 
3927 firstcu = reqcu = zerofirstcu = zeroreqcu = 0;
3928 firstcuflags = reqcuflags = zerofirstcuflags = zeroreqcuflags = REQ_UNSET;
3929 
3930 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
3931 according to the current setting of the caseless flag. The REQ_CASELESS value
3932 leaves the lower 28 bits empty. It is added into the firstcu or reqcu variables
3933 to record the case status of the value. This is used only for ASCII characters.
3934 */
3935 
3936 req_caseopt = ((options & PCRE2_CASELESS) != 0)? REQ_CASELESS:0;
3937 
3938 /* Switch on next character until the end of the branch */
3939 
3940 for (;; ptr++)
3941   {
3942   BOOL negate_class;
3943   BOOL should_flip_negation;
3944   BOOL match_all_or_no_wide_chars;
3945   BOOL possessive_quantifier;
3946   BOOL is_quantifier;
3947   BOOL is_recurse;
3948   BOOL is_dupname;
3949   BOOL reset_bracount;
3950   int class_has_8bitchar;
3951   int class_one_char;
3952 #ifdef SUPPORT_WIDE_CHARS
3953   BOOL xclass_has_prop;
3954 #endif
3955   int recno;                               /* Must be signed */
3956   int refsign;                             /* Must be signed */
3957   int terminator;                          /* Must be signed */
3958   unsigned int mclength;
3959   unsigned int tempbracount;
3960   uint32_t ec;
3961   uint32_t newoptions;
3962   uint32_t skipunits;
3963   uint32_t subreqcu, subfirstcu;
3964   int32_t subreqcuflags, subfirstcuflags;  /* Must be signed */
3965   PCRE2_UCHAR mcbuffer[8];
3966 
3967   /* Come here to restart the loop. */
3968 
3969   REDO_LOOP:
3970 
3971   /* Get next character in the pattern */
3972 
3973   c = *ptr;
3974 
3975   /* If we are at the end of a nested substitution, revert to the outer level
3976   string. Nesting only happens one or two levels deep, and the inserted string
3977   is always zero terminated. */
3978 
3979   if (c == CHAR_NULL && cb->nestptr[0] != NULL)
3980     {
3981     ptr = cb->nestptr[0];
3982     cb->nestptr[0] = cb->nestptr[1];
3983     cb->nestptr[1] = NULL;
3984     c = *ptr;
3985     }
3986 
3987   /* If we are in the pre-compile phase, accumulate the length used for the
3988   previous cycle of this loop. */
3989 
3990   if (lengthptr != NULL)
3991     {
3992     if (code > cb->start_workspace + cb->workspace_size -
3993         WORK_SIZE_SAFETY_MARGIN)                       /* Check for overrun */
3994       {
3995       *errorcodeptr = (code >= cb->start_workspace + cb->workspace_size)?
3996         ERR52 : ERR86;
3997       goto FAILED;
3998       }
3999 
4000     /* There is at least one situation where code goes backwards: this is the
4001     case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
4002     the class is simply eliminated. However, it is created first, so we have to
4003     allow memory for it. Therefore, don't ever reduce the length at this point.
4004     */
4005 
4006     if (code < last_code) code = last_code;
4007 
4008     /* Paranoid check for integer overflow */
4009 
4010     if (OFLOW_MAX - *lengthptr < (size_t)(code - last_code))
4011       {
4012       *errorcodeptr = ERR20;
4013       goto FAILED;
4014       }
4015     *lengthptr += (size_t)(code - last_code);
4016 
4017     /* If "previous" is set and it is not at the start of the work space, move
4018     it back to there, in order to avoid filling up the work space. Otherwise,
4019     if "previous" is NULL, reset the current code pointer to the start. */
4020 
4021     if (previous != NULL)
4022       {
4023       if (previous > orig_code)
4024         {
4025         memmove(orig_code, previous, (size_t)CU2BYTES(code - previous));
4026         code -= previous - orig_code;
4027         previous = orig_code;
4028         }
4029       }
4030     else code = orig_code;
4031 
4032     /* Remember where this code item starts so we can pick up the length
4033     next time round. */
4034 
4035     last_code = code;
4036     }
4037 
4038   /* Before doing anything else we must handle all the special items that do
4039   nothing, and which may come between an item and its quantifier. Otherwise,
4040   when auto-callouts are enabled, a callout gets incorrectly inserted before
4041   the quantifier is recognized. After recognizing a "do nothing" item, restart
4042   the loop in case another one follows. */
4043 
4044   /* If c is not NULL we are not at the end of the pattern. If it is NULL, we
4045   may still be in the pattern with a NULL data item. In these cases, if we are
4046   in \Q...\E, check for the \E that ends the literal string; if not, we have a
4047   literal character. If not in \Q...\E, an isolated \E is ignored. */
4048 
4049   if (c != CHAR_NULL || ptr < cb->end_pattern)
4050     {
4051     if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
4052       {
4053       inescq = FALSE;
4054       ptr++;
4055       continue;
4056       }
4057     else if (inescq)   /* Literal character */
4058       {
4059       if (previous_callout != NULL)
4060         {
4061         if (lengthptr == NULL)  /* Don't attempt in pre-compile phase */
4062           complete_callout(previous_callout, ptr, cb);
4063         previous_callout = NULL;
4064         }
4065       if ((options & PCRE2_AUTO_CALLOUT) != 0)
4066         {
4067         previous_callout = code;
4068         code = auto_callout(code, ptr, cb);
4069         }
4070       goto NORMAL_CHAR;
4071       }
4072 
4073     /* Check for the start of a \Q...\E sequence. We must do this here rather
4074     than later in case it is immediately followed by \E, which turns it into a
4075     "do nothing" sequence. */
4076 
4077     if (c == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
4078       {
4079       inescq = TRUE;
4080       ptr++;
4081       continue;
4082       }
4083     }
4084 
4085   /* In extended mode, skip white space and #-comments that end at newline. */
4086 
4087   if ((options & PCRE2_EXTENDED) != 0)
4088     {
4089     PCRE2_SPTR wscptr = ptr;
4090     while (MAX_255(c) && (cb->ctypes[c] & ctype_space) != 0) c = *(++ptr);
4091     if (c == CHAR_NUMBER_SIGN)
4092       {
4093       ptr++;
4094       while (ptr < cb->end_pattern)
4095         {
4096         if (IS_NEWLINE(ptr))         /* For non-fixed-length newline cases, */
4097           {                          /* IS_NEWLINE sets cb->nllen. */
4098           ptr += cb->nllen;
4099           break;
4100           }
4101         ptr++;
4102 #ifdef SUPPORT_UNICODE
4103         if (utf) FORWARDCHAR(ptr);
4104 #endif
4105         }
4106       }
4107 
4108     /* If we skipped any characters, restart the loop. Otherwise, we didn't see
4109     a comment. */
4110 
4111     if (ptr > wscptr) goto REDO_LOOP;
4112     }
4113 
4114   /* Skip over (?# comments. */
4115 
4116   if (c == CHAR_LEFT_PARENTHESIS && ptr[1] == CHAR_QUESTION_MARK &&
4117       ptr[2] == CHAR_NUMBER_SIGN)
4118     {
4119     ptr += 3;
4120     while (ptr < cb->end_pattern && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
4121     if (*ptr != CHAR_RIGHT_PARENTHESIS)
4122       {
4123       *errorcodeptr = ERR18;
4124       goto FAILED;
4125       }
4126     continue;
4127     }
4128 
4129   /* End of processing "do nothing" items. See if the next thing is a
4130   quantifier. */
4131 
4132   is_quantifier =
4133     c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
4134      (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
4135 
4136   /* Fill in length of a previous callout and create an auto callout if
4137   required, except when the next thing is a quantifier or when processing a
4138   property substitution string for \w etc in UCP mode. */
4139 
4140   if (!is_quantifier && cb->nestptr[0] == NULL)
4141     {
4142     if (previous_callout != NULL && after_manual_callout-- <= 0)
4143       {
4144       if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */
4145         complete_callout(previous_callout, ptr, cb);
4146       previous_callout = NULL;
4147       }
4148 
4149     if ((options & PCRE2_AUTO_CALLOUT) != 0)
4150       {
4151       previous_callout = code;
4152       code = auto_callout(code, ptr, cb);
4153       }
4154     }
4155 
4156   /* Process the next pattern item. */
4157 
4158   switch(c)
4159     {
4160     /* ===================================================================*/
4161     /* The branch terminates at string end or | or ) */
4162 
4163     case CHAR_NULL:
4164     if (ptr < cb->end_pattern) goto NORMAL_CHAR;   /* Zero data character */
4165     /* Fall through */
4166 
4167     case CHAR_VERTICAL_LINE:
4168     case CHAR_RIGHT_PARENTHESIS:
4169     *firstcuptr = firstcu;
4170     *firstcuflagsptr = firstcuflags;
4171     *reqcuptr = reqcu;
4172     *reqcuflagsptr = reqcuflags;
4173     *codeptr = code;
4174     *ptrptr = ptr;
4175     if (lengthptr != NULL)
4176       {
4177       if (OFLOW_MAX - *lengthptr < (size_t)(code - last_code))
4178         {
4179         *errorcodeptr = ERR20;
4180         goto FAILED;
4181         }
4182       *lengthptr += (size_t)(code - last_code);  /* To include callout length */
4183       }
4184     return TRUE;
4185 
4186 
4187     /* ===================================================================*/
4188     /* Handle single-character metacharacters. In multiline mode, ^ disables
4189     the setting of any following char as a first character. */
4190 
4191     case CHAR_CIRCUMFLEX_ACCENT:
4192     previous = NULL;
4193     if ((options & PCRE2_MULTILINE) != 0)
4194       {
4195       if (firstcuflags == REQ_UNSET)
4196         zerofirstcuflags = firstcuflags = REQ_NONE;
4197       *code++ = OP_CIRCM;
4198       }
4199     else *code++ = OP_CIRC;
4200     break;
4201 
4202     case CHAR_DOLLAR_SIGN:
4203     previous = NULL;
4204     *code++ = ((options & PCRE2_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
4205     break;
4206 
4207     /* There can never be a first char if '.' is first, whatever happens about
4208     repeats. The value of reqcu doesn't change either. */
4209 
4210     case CHAR_DOT:
4211     if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
4212     zerofirstcu = firstcu;
4213     zerofirstcuflags = firstcuflags;
4214     zeroreqcu = reqcu;
4215     zeroreqcuflags = reqcuflags;
4216     previous = code;
4217     *code++ = ((options & PCRE2_DOTALL) != 0)? OP_ALLANY: OP_ANY;
4218     break;
4219 
4220 
4221     /* ===================================================================*/
4222     /* Character classes. If the included characters are all < 256, we build a
4223     32-byte bitmap of the permitted characters, except in the special case
4224     where there is only one such character. For negated classes, we build the
4225     map as usual, then invert it at the end. However, we use a different opcode
4226     so that data characters > 255 can be handled correctly.
4227 
4228     If the class contains characters outside the 0-255 range, a different
4229     opcode is compiled. It may optionally have a bit map for characters < 256,
4230     but those above are explicitly listed afterwards. A flag byte tells
4231     whether the bitmap is present, and whether this is a negated class or not.
4232 
4233     An isolated ']' character is not treated specially, so is just another data
4234     character. In earlier versions of PCRE that used the original API there was
4235     a "JavaScript compatibility mode" in which it gave an error. However,
4236     JavaScript itself has changed in this respect so there is no longer any
4237     need for this special handling.
4238 
4239     In another (POSIX) regex library, the ugly syntax [[:<:]] and [[:>:]] is
4240     used for "start of word" and "end of word". As these are otherwise illegal
4241     sequences, we don't break anything by recognizing them. They are replaced
4242     by \b(?=\w) and \b(?<=\w) respectively. This can only happen at the top
4243     nesting level, as no other inserted sequences will contain these oddities.
4244     Sequences like [a[:<:]] are erroneous and are handled by the normal code
4245     below. */
4246 
4247     case CHAR_LEFT_SQUARE_BRACKET:
4248     if (PRIV(strncmp_c8)(ptr+1, STRING_WEIRD_STARTWORD, 6) == 0)
4249       {
4250       cb->nestptr[0] = ptr + 7;
4251       ptr = sub_start_of_word;
4252       goto REDO_LOOP;
4253       }
4254 
4255     if (PRIV(strncmp_c8)(ptr+1, STRING_WEIRD_ENDWORD, 6) == 0)
4256       {
4257       cb->nestptr[0] = ptr + 7;
4258       ptr = sub_end_of_word;
4259       goto REDO_LOOP;
4260       }
4261 
4262     /* Handle a real character class. */
4263 
4264     previous = code;
4265 
4266     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
4267     they are encountered at the top level, so we'll do that too. */
4268 
4269     if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
4270          ptr[1] == CHAR_EQUALS_SIGN) &&
4271         check_posix_syntax(ptr, &tempptr))
4272       {
4273       *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR12 : ERR13;
4274       goto FAILED;
4275       }
4276 
4277     /* If the first character is '^', set the negation flag and skip it. Also,
4278     if the first few characters (either before or after ^) are \Q\E or \E we
4279     skip them too. This makes for compatibility with Perl. */
4280 
4281     negate_class = FALSE;
4282     for (;;)
4283       {
4284       c = *(++ptr);
4285       if (c == CHAR_BACKSLASH)
4286         {
4287         if (ptr[1] == CHAR_E)
4288           ptr++;
4289         else if (PRIV(strncmp_c8)(ptr + 1, STR_Q STR_BACKSLASH STR_E, 3) == 0)
4290           ptr += 3;
4291         else
4292           break;
4293         }
4294       else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
4295         negate_class = TRUE;
4296       else break;
4297       }
4298 
4299     /* Empty classes are allowed if PCRE2_ALLOW_EMPTY_CLASS is set. Otherwise,
4300     an initial ']' is taken as a data character -- the code below handles
4301     that. When empty classes are allowed, [] must always fail, so generate
4302     OP_FAIL, whereas [^] must match any character, so generate OP_ALLANY. */
4303 
4304     if (c == CHAR_RIGHT_SQUARE_BRACKET &&
4305         (cb->external_options & PCRE2_ALLOW_EMPTY_CLASS) != 0)
4306       {
4307       *code++ = negate_class? OP_ALLANY : OP_FAIL;
4308       if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
4309       zerofirstcu = firstcu;
4310       zerofirstcuflags = firstcuflags;
4311       break;
4312       }
4313 
4314     /* If a non-extended class contains a negative special such as \S, we need
4315     to flip the negation flag at the end, so that support for characters > 255
4316     works correctly (they are all included in the class). An extended class may
4317     need to insert specific matching or non-matching code for wide characters.
4318     */
4319 
4320     should_flip_negation = match_all_or_no_wide_chars = FALSE;
4321 
4322     /* Extended class (xclass) will be used when characters > 255
4323     might match. */
4324 
4325 #ifdef SUPPORT_WIDE_CHARS
4326     xclass = FALSE;
4327     class_uchardata = code + LINK_SIZE + 2;   /* For XCLASS items */
4328     class_uchardata_base = class_uchardata;   /* Save the start */
4329 #endif
4330 
4331     /* For optimization purposes, we track some properties of the class:
4332     class_has_8bitchar will be non-zero if the class contains at least one
4333     character with a code point less than 256; class_one_char will be 1 if the
4334     class contains just one character; xclass_has_prop will be TRUE if Unicode
4335     property checks are present in the class. */
4336 
4337     class_has_8bitchar = 0;
4338     class_one_char = 0;
4339 #ifdef SUPPORT_WIDE_CHARS
4340     xclass_has_prop = FALSE;
4341 #endif
4342 
4343     /* Initialize the 256-bit (32-byte) bit map to all zeros. We build the map
4344     in a temporary bit of memory, in case the class contains fewer than two
4345     8-bit characters because in that case the compiled code doesn't use the bit
4346     map. */
4347 
4348     memset(classbits, 0, 32 * sizeof(uint8_t));
4349 
4350     /* Process characters until ] is reached. As the test is at the end of the
4351     loop, an initial ] is taken as a data character. At the start of the loop,
4352     c contains the first code unit of the character. If it is zero, check for
4353     the end of the pattern, to allow binary zero as data. */
4354 
4355     for(;;)
4356       {
4357       PCRE2_SPTR oldptr;
4358 #ifdef EBCDIC
4359       BOOL range_is_literal = TRUE;
4360 #endif
4361 
4362       if (c == CHAR_NULL && ptr >= cb->end_pattern)
4363         {
4364         *errorcodeptr = ERR6;  /* Missing terminating ']' */
4365         goto FAILED;
4366         }
4367 
4368 #ifdef SUPPORT_UNICODE
4369       if (utf && HAS_EXTRALEN(c))
4370         {                           /* Braces are required because the */
4371         GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
4372         }
4373 #endif
4374 
4375       /* Inside \Q...\E everything is literal except \E */
4376 
4377       if (inescq)
4378         {
4379         if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)  /* If we are at \E */
4380           {
4381           inescq = FALSE;                   /* Reset literal state */
4382           ptr++;                            /* Skip the 'E' */
4383           goto CONTINUE_CLASS;              /* Carry on with next char */
4384           }
4385         goto CHECK_RANGE;                   /* Could be range if \E follows */
4386         }
4387 
4388       /* Handle POSIX class names. Perl allows a negation extension of the
4389       form [:^name:]. A square bracket that doesn't match the syntax is
4390       treated as a literal. We also recognize the POSIX constructions
4391       [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
4392       5.6 and 5.8 do. */
4393 
4394       if (c == CHAR_LEFT_SQUARE_BRACKET &&
4395           (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
4396            ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
4397         {
4398         BOOL local_negate = FALSE;
4399         int posix_class, taboffset, tabopt;
4400         register const uint8_t *cbits = cb->cbits;
4401         uint8_t pbits[32];
4402 
4403         if (ptr[1] != CHAR_COLON)
4404           {
4405           *errorcodeptr = ERR13;
4406           goto FAILED;
4407           }
4408 
4409         ptr += 2;
4410         if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
4411           {
4412           local_negate = TRUE;
4413           should_flip_negation = TRUE;  /* Note negative special */
4414           ptr++;
4415           }
4416 
4417         posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
4418         if (posix_class < 0)
4419           {
4420           *errorcodeptr = ERR30;
4421           goto FAILED;
4422           }
4423 
4424         /* If matching is caseless, upper and lower are converted to
4425         alpha. This relies on the fact that the class table starts with
4426         alpha, lower, upper as the first 3 entries. */
4427 
4428         if ((options & PCRE2_CASELESS) != 0 && posix_class <= 2)
4429           posix_class = 0;
4430 
4431         /* When PCRE2_UCP is set, some of the POSIX classes are converted to
4432         different escape sequences that use Unicode properties \p or \P. Others
4433         that are not available via \p or \P generate XCL_PROP/XCL_NOTPROP
4434         directly. UCP support is not available unless UTF support is.*/
4435 
4436 #ifdef SUPPORT_UNICODE
4437         if ((options & PCRE2_UCP) != 0)
4438           {
4439           unsigned int ptype = 0;
4440           int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0);
4441 
4442           /* The posix_substitutes table specifies which POSIX classes can be
4443           converted to \p or \P items. This can only happen at top nesting
4444           level, as there will never be a POSIX class in a string that is
4445           substituted for something else. */
4446 
4447           if (posix_substitutes[pc] != NULL)
4448             {
4449             cb->nestptr[0] = tempptr + 1;
4450             ptr = posix_substitutes[pc] - 1;
4451             goto CONTINUE_CLASS;
4452             }
4453 
4454           /* There are three other classes that generate special property calls
4455           that are recognized only in an XCLASS. */
4456 
4457           else switch(posix_class)
4458             {
4459             case PC_GRAPH:
4460             ptype = PT_PXGRAPH;
4461             /* Fall through */
4462             case PC_PRINT:
4463             if (ptype == 0) ptype = PT_PXPRINT;
4464             /* Fall through */
4465             case PC_PUNCT:
4466             if (ptype == 0) ptype = PT_PXPUNCT;
4467             *class_uchardata++ = local_negate? XCL_NOTPROP : XCL_PROP;
4468             *class_uchardata++ = (PCRE2_UCHAR)ptype;
4469             *class_uchardata++ = 0;
4470             xclass_has_prop = TRUE;
4471             ptr = tempptr + 1;
4472             goto CONTINUE_CLASS;
4473 
4474             /* For the other POSIX classes (ascii, xdigit) we are going to fall
4475             through to the non-UCP case and build a bit map for characters with
4476             code points less than 256. However, if we are in a negated POSIX
4477             class, characters with code points greater than 255 must either all
4478             match or all not match, depending on whether the whole class is not
4479             or is negated. For example, for [[:^ascii:]... they must all match,
4480             whereas for [^[:^xdigit:]... they must not.
4481 
4482             In the special case where there are no xclass items, this is
4483             automatically handled by the use of OP_CLASS or OP_NCLASS, but an
4484             explicit range is needed for OP_XCLASS. Setting a flag here causes
4485             the range to be generated later when it is known that OP_XCLASS is
4486             required. */
4487 
4488             default:
4489             match_all_or_no_wide_chars |= local_negate;
4490             break;
4491             }
4492           }
4493 #endif  /* SUPPORT_UNICODE */
4494 
4495         /* In the non-UCP case, or when UCP makes no difference, we build the
4496         bit map for the POSIX class in a chunk of local store because we may be
4497         adding and subtracting from it, and we don't want to subtract bits that
4498         may be in the main map already. At the end we or the result into the
4499         bit map that is being built. */
4500 
4501         posix_class *= 3;
4502 
4503         /* Copy in the first table (always present) */
4504 
4505         memcpy(pbits, cbits + posix_class_maps[posix_class],
4506           32 * sizeof(uint8_t));
4507 
4508         /* If there is a second table, add or remove it as required. */
4509 
4510         taboffset = posix_class_maps[posix_class + 1];
4511         tabopt = posix_class_maps[posix_class + 2];
4512 
4513         if (taboffset >= 0)
4514           {
4515           if (tabopt >= 0)
4516             for (c = 0; c < 32; c++) pbits[c] |= cbits[(int)c + taboffset];
4517           else
4518             for (c = 0; c < 32; c++) pbits[c] &= ~cbits[(int)c + taboffset];
4519           }
4520 
4521         /* Now see if we need to remove any special characters. An option
4522         value of 1 removes vertical space and 2 removes underscore. */
4523 
4524         if (tabopt < 0) tabopt = -tabopt;
4525         if (tabopt == 1) pbits[1] &= ~0x3c;
4526           else if (tabopt == 2) pbits[11] &= 0x7f;
4527 
4528         /* Add the POSIX table or its complement into the main table that is
4529         being built and we are done. */
4530 
4531         if (local_negate)
4532           for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
4533         else
4534           for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
4535 
4536         ptr = tempptr + 1;
4537         /* Every class contains at least one < 256 character. */
4538         class_has_8bitchar = 1;
4539         /* Every class contains at least two characters. */
4540         class_one_char = 2;
4541         goto CONTINUE_CLASS;    /* End of POSIX syntax handling */
4542         }
4543 
4544       /* Backslash may introduce a single character, or it may introduce one
4545       of the specials, which just set a flag. The sequence \b is a special
4546       case. Inside a class (and only there) it is treated as backspace. We
4547       assume that other escapes have more than one character in them, so
4548       speculatively set both class_has_8bitchar and class_one_char bigger
4549       than one. Unrecognized escapes fall through and are faulted. */
4550 
4551       if (c == CHAR_BACKSLASH)
4552         {
4553         escape = PRIV(check_escape)(&ptr, cb->end_pattern, &ec, errorcodeptr,
4554           options, TRUE, cb);
4555         if (*errorcodeptr != 0) goto FAILED;
4556         if (escape == 0)    /* Escaped single char */
4557           {
4558           c = ec;
4559 #ifdef EBCDIC
4560           range_is_literal = FALSE;
4561 #endif
4562           }
4563         else if (escape == ESC_b) c = CHAR_BS; /* \b is backspace in a class */
4564         else if (escape == ESC_N)          /* \N is not supported in a class */
4565           {
4566           *errorcodeptr = ERR71;
4567           goto FAILED;
4568           }
4569         else if (escape == ESC_Q)            /* Handle start of quoted string */
4570           {
4571           if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
4572             {
4573             ptr += 2; /* avoid empty string */
4574             }
4575           else inescq = TRUE;
4576           goto CONTINUE_CLASS;
4577           }
4578         else if (escape == ESC_E) goto CONTINUE_CLASS;  /* Ignore orphan \E */
4579 
4580         else  /* Handle \d-type escapes */
4581           {
4582           register const uint8_t *cbits = cb->cbits;
4583           /* Every class contains at least two < 256 characters. */
4584           class_has_8bitchar++;
4585           /* Every class contains at least two characters. */
4586           class_one_char += 2;
4587 
4588           switch (escape)
4589             {
4590 #ifdef SUPPORT_UNICODE
4591             case ESC_du:     /* These are the values given for \d etc */
4592             case ESC_DU:     /* when PCRE2_UCP is set. We replace the */
4593             case ESC_wu:     /* escape sequence with an appropriate \p */
4594             case ESC_WU:     /* or \P to test Unicode properties instead */
4595             case ESC_su:     /* of the default ASCII testing. This might be */
4596             case ESC_SU:     /* a 2nd-level nesting for [[:<:]] or [[:>:]]. */
4597             cb->nestptr[1] = cb->nestptr[0];
4598             cb->nestptr[0] = ptr;
4599             ptr = substitutes[escape - ESC_DU] - 1;  /* Just before substitute */
4600             class_has_8bitchar--;                /* Undo! */
4601             break;
4602 #endif
4603             case ESC_d:
4604             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
4605             break;
4606 
4607             case ESC_D:
4608             should_flip_negation = TRUE;
4609             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
4610             break;
4611 
4612             case ESC_w:
4613             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
4614             break;
4615 
4616             case ESC_W:
4617             should_flip_negation = TRUE;
4618             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
4619             break;
4620 
4621             /* Perl 5.004 onwards omitted VT from \s, but restored it at Perl
4622             5.18. Before PCRE 8.34, we had to preserve the VT bit if it was
4623             previously set by something earlier in the character class.
4624             Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so
4625             we could just adjust the appropriate bit. From PCRE 8.34 we no
4626             longer treat \s and \S specially. */
4627 
4628             case ESC_s:
4629             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
4630             break;
4631 
4632             case ESC_S:
4633             should_flip_negation = TRUE;
4634             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
4635             break;
4636 
4637             /* The rest apply in both UCP and non-UCP cases. */
4638 
4639             case ESC_h:
4640             (void)add_list_to_class(classbits, &class_uchardata, options, cb,
4641               PRIV(hspace_list), NOTACHAR);
4642             break;
4643 
4644             case ESC_H:
4645             (void)add_not_list_to_class(classbits, &class_uchardata, options,
4646               cb, PRIV(hspace_list));
4647             break;
4648 
4649             case ESC_v:
4650             (void)add_list_to_class(classbits, &class_uchardata, options, cb,
4651               PRIV(vspace_list), NOTACHAR);
4652             break;
4653 
4654             case ESC_V:
4655             (void)add_not_list_to_class(classbits, &class_uchardata, options,
4656               cb, PRIV(vspace_list));
4657             break;
4658 
4659             case ESC_p:
4660             case ESC_P:
4661 #ifdef SUPPORT_UNICODE
4662               {
4663               BOOL negated;
4664               unsigned int ptype = 0, pdata = 0;
4665               if (!get_ucp(&ptr, &negated, &ptype, &pdata, errorcodeptr, cb))
4666                 goto FAILED;
4667               *class_uchardata++ = ((escape == ESC_p) != negated)?
4668                 XCL_PROP : XCL_NOTPROP;
4669               *class_uchardata++ = ptype;
4670               *class_uchardata++ = pdata;
4671               xclass_has_prop = TRUE;
4672               class_has_8bitchar--;                /* Undo! */
4673               }
4674             break;
4675 #else
4676             *errorcodeptr = ERR45;
4677             goto FAILED;
4678 #endif
4679             /* Unrecognized escapes are faulted. */
4680 
4681             default:
4682             *errorcodeptr = ERR7;
4683             goto FAILED;
4684             }
4685 
4686           /* Handled \d-type escape */
4687 
4688           goto CONTINUE_CLASS;
4689           }
4690 
4691         /* Control gets here if the escape just defined a single character.
4692         This is in c and may be greater than 256. */
4693 
4694         escape = 0;
4695         }   /* End of backslash handling */
4696 
4697       /* A character may be followed by '-' to form a range. However, Perl does
4698       not permit ']' to be the end of the range. A '-' character at the end is
4699       treated as a literal. Perl ignores orphaned \E sequences entirely. The
4700       code for handling \Q and \E is messy. */
4701 
4702       CHECK_RANGE:
4703       while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
4704         {
4705         inescq = FALSE;
4706         ptr += 2;
4707         }
4708       oldptr = ptr;
4709 
4710       /* Remember if \r or \n were explicitly used */
4711 
4712       if (c == CHAR_CR || c == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF;
4713 
4714       /* Check for range */
4715 
4716       if (!inescq && ptr[1] == CHAR_MINUS)
4717         {
4718         uint32_t d;
4719         ptr += 2;
4720         while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
4721 
4722         /* If we hit \Q (not followed by \E) at this point, go into escaped
4723         mode. */
4724 
4725         while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
4726           {
4727           ptr += 2;
4728           if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
4729             { ptr += 2; continue; }
4730           inescq = TRUE;
4731           break;
4732           }
4733 
4734         /* Minus (hyphen) at the end of a class is treated as a literal, so put
4735         back the pointer and jump to handle the character that preceded it. */
4736 
4737         if (*ptr == CHAR_NULL || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
4738           {
4739           ptr = oldptr;
4740           goto CLASS_SINGLE_CHARACTER;
4741           }
4742 
4743         /* Otherwise, we have a potential range; pick up the next character */
4744 
4745 #ifdef SUPPORT_UNICODE
4746         if (utf)
4747           {                           /* Braces are required because the */
4748           GETCHARLEN(d, ptr, ptr);    /* macro generates multiple statements */
4749           }
4750         else
4751 #endif
4752         d = *ptr;  /* Not UTF mode */
4753 
4754         /* The second part of a range can be a single-character escape
4755         sequence, but not any of the other escapes. Perl treats a hyphen as a
4756         literal in such circumstances. However, in Perl's warning mode, a
4757         warning is given, so PCRE now faults it as it is almost certainly a
4758         mistake on the user's part. */
4759 
4760         if (!inescq)
4761           {
4762           if (d == CHAR_BACKSLASH)
4763             {
4764             int descape;
4765             descape = PRIV(check_escape)(&ptr, cb->end_pattern, &d,
4766               errorcodeptr, options, TRUE, cb);
4767             if (*errorcodeptr != 0) goto FAILED;
4768 #ifdef EBCDIC
4769             range_is_literal = FALSE;
4770 #endif
4771             /* 0 means a character was put into d; \b is backspace; any other
4772             special causes an error. */
4773 
4774             if (descape != 0)
4775               {
4776               if (descape == ESC_b) d = CHAR_BS; else
4777                 {
4778                 *errorcodeptr = ERR50;
4779                 goto FAILED;
4780                 }
4781               }
4782             }
4783 
4784           /* A hyphen followed by a POSIX class is treated in the same way. */
4785 
4786           else if (d == CHAR_LEFT_SQUARE_BRACKET &&
4787                    (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
4788                     ptr[1] == CHAR_EQUALS_SIGN) &&
4789                    check_posix_syntax(ptr, &tempptr))
4790             {
4791             *errorcodeptr = ERR50;
4792             goto FAILED;
4793             }
4794           }
4795 
4796         /* Check that the two values are in the correct order. Optimize
4797         one-character ranges. */
4798 
4799         if (d < c)
4800           {
4801           *errorcodeptr = ERR8;
4802           goto FAILED;
4803           }
4804         if (d == c) goto CLASS_SINGLE_CHARACTER;  /* A few lines below */
4805 
4806         /* We have found a character range, so single character optimizations
4807         cannot be done anymore. Any value greater than 1 indicates that there
4808         is more than one character. */
4809 
4810         class_one_char = 2;
4811 
4812         /* Remember an explicit \r or \n, and add the range to the class. */
4813 
4814         if (d == CHAR_CR || d == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF;
4815 
4816         /* In an EBCDIC environment, Perl treats alphabetic ranges specially
4817         because there are holes in the encoding, and simply using the range A-Z
4818         (for example) would include the characters in the holes. This applies
4819         only to literal ranges; [\xC1-\xE9] is different to [A-Z]. */
4820 
4821 #ifdef EBCDIC
4822         if (range_is_literal &&
4823              (cb->ctypes[c] & ctype_letter) != 0 &&
4824              (cb->ctypes[d] & ctype_letter) != 0 &&
4825              (c <= CHAR_z) == (d <= CHAR_z))
4826           {
4827           uint32_t uc = (c <= CHAR_z)? 0 : 64;
4828           uint32_t C = c - uc;
4829           uint32_t D = d - uc;
4830 
4831           if (C <= CHAR_i)
4832             {
4833             class_has_8bitchar +=
4834               add_to_class(classbits, &class_uchardata, options, cb, C + uc,
4835                 ((D < CHAR_i)? D : CHAR_i) + uc);
4836             C = CHAR_j;
4837             }
4838 
4839           if (C <= D && C <= CHAR_r)
4840             {
4841             class_has_8bitchar +=
4842               add_to_class(classbits, &class_uchardata, options, cb, C + uc,
4843                 ((D < CHAR_r)? D : CHAR_r) + uc);
4844             C = CHAR_s;
4845             }
4846 
4847           if (C <= D)
4848             {
4849             class_has_8bitchar +=
4850               add_to_class(classbits, &class_uchardata, options, cb, C + uc,
4851                 D + uc);
4852             }
4853           }
4854         else
4855 #endif
4856         class_has_8bitchar +=
4857           add_to_class(classbits, &class_uchardata, options, cb, c, d);
4858         goto CONTINUE_CLASS;   /* Go get the next char in the class */
4859         }
4860 
4861       /* Handle a single character - we can get here for a normal non-escape
4862       char, or after \ that introduces a single character or for an apparent
4863       range that isn't. Only the value 1 matters for class_one_char, so don't
4864       increase it if it is already 2 or more ... just in case there's a class
4865       with a zillion characters in it. */
4866 
4867       CLASS_SINGLE_CHARACTER:
4868       if (class_one_char < 2) class_one_char++;
4869 
4870       /* If class_one_char is 1 and xclass_has_prop is false, we have the first
4871       single character in the class, and there have been no prior ranges, or
4872       XCLASS items generated by escapes. If this is the final character in the
4873       class, we can optimize by turning the item into a 1-character OP_CHAR[I]
4874       if it's positive, or OP_NOT[I] if it's negative. In the positive case, it
4875       can cause firstcu to be set. Otherwise, there can be no first char if
4876       this item is first, whatever repeat count may follow. In the case of
4877       reqcu, save the previous value for reinstating. */
4878 
4879       if (!inescq &&
4880 #ifdef SUPPORT_UNICODE
4881           !xclass_has_prop &&
4882 #endif
4883           class_one_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
4884         {
4885         ptr++;
4886         zeroreqcu = reqcu;
4887         zeroreqcuflags = reqcuflags;
4888 
4889         if (negate_class)
4890           {
4891 #ifdef SUPPORT_UNICODE
4892           int d;
4893 #endif
4894           if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
4895           zerofirstcu = firstcu;
4896           zerofirstcuflags = firstcuflags;
4897 
4898           /* For caseless UTF mode, check whether this character has more than
4899           one other case. If so, generate a special OP_NOTPROP item instead of
4900           OP_NOTI. */
4901 
4902 #ifdef SUPPORT_UNICODE
4903           if (utf && (options & PCRE2_CASELESS) != 0 &&
4904               (d = UCD_CASESET(c)) != 0)
4905             {
4906             *code++ = OP_NOTPROP;
4907             *code++ = PT_CLIST;
4908             *code++ = d;
4909             }
4910           else
4911 #endif
4912           /* Char has only one other case, or UCP not available */
4913 
4914             {
4915             *code++ = ((options & PCRE2_CASELESS) != 0)? OP_NOTI: OP_NOT;
4916             code += PUTCHAR(c, code);
4917             }
4918 
4919           /* We are finished with this character class */
4920 
4921           goto END_CLASS;
4922           }
4923 
4924         /* For a single, positive character, get the value into mcbuffer, and
4925         then we can handle this with the normal one-character code. */
4926 
4927         mclength = PUTCHAR(c, mcbuffer);
4928         goto ONE_CHAR;
4929         }       /* End of 1-char optimization */
4930 
4931       /* There is more than one character in the class, or an XCLASS item
4932       has been generated. Add this character to the class. */
4933 
4934       class_has_8bitchar +=
4935         add_to_class(classbits, &class_uchardata, options, cb, c, c);
4936 
4937       /* Continue to the next character in the class. Closing square bracket
4938       not within \Q..\E ends the class. A NULL character terminates a
4939       nested substitution string, but may be a data character in the main
4940       pattern (tested at the start of this loop). */
4941 
4942       CONTINUE_CLASS:
4943       c = *(++ptr);
4944       if (c == CHAR_NULL && cb->nestptr[0] != NULL)
4945         {
4946         ptr = cb->nestptr[0];
4947         cb->nestptr[0] = cb->nestptr[1];
4948         cb->nestptr[1] = NULL;
4949         c = *(++ptr);
4950         }
4951 
4952 #ifdef SUPPORT_WIDE_CHARS
4953       /* If any wide characters have been encountered, set xclass = TRUE. Then,
4954       in the pre-compile phase, accumulate the length of the wide characters
4955       and reset the pointer. This is so that very large classes that contain a
4956       zillion wide characters do not overwrite the work space (which is on the
4957       stack). */
4958 
4959       if (class_uchardata > class_uchardata_base)
4960         {
4961         xclass = TRUE;
4962         if (lengthptr != NULL)
4963           {
4964           *lengthptr += class_uchardata - class_uchardata_base;
4965           class_uchardata = class_uchardata_base;
4966           }
4967         }
4968 #endif
4969       /* An unescaped ] ends the class */
4970 
4971       if (c == CHAR_RIGHT_SQUARE_BRACKET && !inescq) break;
4972       }   /* End of main class-processing loop */
4973 
4974     /* If this is the first thing in the branch, there can be no first char
4975     setting, whatever the repeat count. Any reqcu setting must remain
4976     unchanged after any kind of repeat. */
4977 
4978     if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
4979     zerofirstcu = firstcu;
4980     zerofirstcuflags = firstcuflags;
4981     zeroreqcu = reqcu;
4982     zeroreqcuflags = reqcuflags;
4983 
4984     /* If there are characters with values > 255, or Unicode property settings
4985     (\p or \P), we have to compile an extended class, with its own opcode,
4986     unless there were no property settings and there was a negated special such
4987     as \S in the class, and PCRE2_UCP is not set, because in that case all
4988     characters > 255 are in or not in the class, so any that were explicitly
4989     given as well can be ignored.
4990 
4991     In the UCP case, if certain negated POSIX classes ([:^ascii:] or
4992     [:^xdigit:]) were present in a class, we either have to match or not match
4993     all wide characters (depending on whether the whole class is or is not
4994     negated). This requirement is indicated by match_all_or_no_wide_chars being
4995     true. We do this by including an explicit range, which works in both cases.
4996 
4997     If, when generating an xclass, there are no characters < 256, we can omit
4998     the bitmap in the actual compiled code. */
4999 
5000 #ifdef SUPPORT_WIDE_CHARS
5001 #ifdef SUPPORT_UNICODE
5002     if (xclass && (xclass_has_prop || !should_flip_negation ||
5003          (options & PCRE2_UCP) != 0))
5004 #elif PCRE2_CODE_UNIT_WIDTH != 8
5005     if (xclass && (xclass_has_prop || !should_flip_negation))
5006 #endif
5007       {
5008       if (match_all_or_no_wide_chars)
5009         {
5010         *class_uchardata++ = XCL_RANGE;
5011         class_uchardata += PRIV(ord2utf)(0x100, class_uchardata);
5012         class_uchardata += PRIV(ord2utf)(MAX_UTF_CODE_POINT, class_uchardata);
5013         }
5014       *class_uchardata++ = XCL_END;    /* Marks the end of extra data */
5015       *code++ = OP_XCLASS;
5016       code += LINK_SIZE;
5017       *code = negate_class? XCL_NOT:0;
5018       if (xclass_has_prop) *code |= XCL_HASPROP;
5019 
5020       /* If the map is required, move up the extra data to make room for it;
5021       otherwise just move the code pointer to the end of the extra data. */
5022 
5023       if (class_has_8bitchar > 0)
5024         {
5025         *code++ |= XCL_MAP;
5026         memmove(code + (32 / sizeof(PCRE2_UCHAR)), code,
5027           CU2BYTES(class_uchardata - code));
5028         if (negate_class && !xclass_has_prop)
5029           for (c = 0; c < 32; c++) classbits[c] = ~classbits[c];
5030         memcpy(code, classbits, 32);
5031         code = class_uchardata + (32 / sizeof(PCRE2_UCHAR));
5032         }
5033       else code = class_uchardata;
5034 
5035       /* Now fill in the complete length of the item */
5036 
5037       PUT(previous, 1, (int)(code - previous));
5038       break;   /* End of class handling */
5039       }
5040 #endif
5041 
5042     /* If there are no characters > 255, or they are all to be included or
5043     excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the
5044     whole class was negated and whether there were negative specials such as \S
5045     (non-UCP) in the class. Then copy the 32-byte map into the code vector,
5046     negating it if necessary. */
5047 
5048     *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
5049     if (lengthptr == NULL)    /* Save time in the pre-compile phase */
5050       {
5051       if (negate_class)
5052         for (c = 0; c < 32; c++) classbits[c] = ~classbits[c];
5053       memcpy(code, classbits, 32);
5054       }
5055     code += 32 / sizeof(PCRE2_UCHAR);
5056 
5057     END_CLASS:
5058     break;
5059 
5060 
5061     /* ===================================================================*/
5062     /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
5063     has been tested above. */
5064 
5065     case CHAR_LEFT_CURLY_BRACKET:
5066     if (!is_quantifier) goto NORMAL_CHAR;
5067     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
5068     if (*errorcodeptr != 0) goto FAILED;
5069     goto REPEAT;
5070 
5071     case CHAR_ASTERISK:
5072     repeat_min = 0;
5073     repeat_max = -1;
5074     goto REPEAT;
5075 
5076     case CHAR_PLUS:
5077     repeat_min = 1;
5078     repeat_max = -1;
5079     goto REPEAT;
5080 
5081     case CHAR_QUESTION_MARK:
5082     repeat_min = 0;
5083     repeat_max = 1;
5084 
5085     REPEAT:
5086     if (previous == NULL)
5087       {
5088       *errorcodeptr = ERR9;
5089       goto FAILED;
5090       }
5091 
5092     if (repeat_min == 0)
5093       {
5094       firstcu = zerofirstcu;    /* Adjust for zero repeat */
5095       firstcuflags = zerofirstcuflags;
5096       reqcu = zeroreqcu;        /* Ditto */
5097       reqcuflags = zeroreqcuflags;
5098       }
5099 
5100     /* Remember whether this is a variable length repeat */
5101 
5102     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
5103 
5104     op_type = 0;                    /* Default single-char op codes */
5105     possessive_quantifier = FALSE;  /* Default not possessive quantifier */
5106 
5107     /* Save start of previous item, in case we have to move it up in order to
5108     insert something before it. */
5109 
5110     tempcode = previous;
5111 
5112     /* Before checking for a possessive quantifier, we must skip over
5113     whitespace and comments in extended mode because Perl allows white space at
5114     this point. */
5115 
5116     if ((options & PCRE2_EXTENDED) != 0)
5117       {
5118       ptr++;
5119       for (;;)
5120         {
5121         while (MAX_255(*ptr) && (cb->ctypes[*ptr] & ctype_space) != 0) ptr++;
5122         if (*ptr != CHAR_NUMBER_SIGN) break;
5123         ptr++;
5124         while (ptr < cb->end_pattern)
5125           {
5126           if (IS_NEWLINE(ptr))         /* For non-fixed-length newline cases, */
5127             {                        /* IS_NEWLINE sets cb->nllen. */
5128             ptr += cb->nllen;
5129             break;
5130             }
5131           ptr++;
5132 #ifdef SUPPORT_UNICODE
5133           if (utf) FORWARDCHAR(ptr);
5134 #endif
5135           }           /* Loop for comment characters */
5136         }             /* Loop for multiple comments */
5137       ptr--;          /* Last code unit of previous character. */
5138       }
5139 
5140     /* If the next character is '+', we have a possessive quantifier. This
5141     implies greediness, whatever the setting of the PCRE2_UNGREEDY option.
5142     If the next character is '?' this is a minimizing repeat, by default,
5143     but if PCRE2_UNGREEDY is set, it works the other way round. We change the
5144     repeat type to the non-default. */
5145 
5146     if (ptr[1] == CHAR_PLUS)
5147       {
5148       repeat_type = 0;                  /* Force greedy */
5149       possessive_quantifier = TRUE;
5150       ptr++;
5151       }
5152     else if (ptr[1] == CHAR_QUESTION_MARK)
5153       {
5154       repeat_type = greedy_non_default;
5155       ptr++;
5156       }
5157     else repeat_type = greedy_default;
5158 
5159     /* If the repeat is {1} we can ignore it. */
5160 
5161     if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;
5162 
5163     /* If previous was a recursion call, wrap it in atomic brackets so that
5164     previous becomes the atomic group. All recursions were so wrapped in the
5165     past, but it no longer happens for non-repeated recursions. In fact, the
5166     repeated ones could be re-implemented independently so as not to need this,
5167     but for the moment we rely on the code for repeating groups. */
5168 
5169     if (*previous == OP_RECURSE)
5170       {
5171       memmove(previous + 1 + LINK_SIZE, previous, CU2BYTES(1 + LINK_SIZE));
5172       *previous = OP_ONCE;
5173       PUT(previous, 1, 2 + 2*LINK_SIZE);
5174       previous[2 + 2*LINK_SIZE] = OP_KET;
5175       PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE);
5176       code += 2 + 2 * LINK_SIZE;
5177       length_prevgroup = 3 + 3*LINK_SIZE;
5178       }
5179 
5180     /* Now handle repetition for the different types of item. */
5181 
5182     /* If previous was a character or negated character match, abolish the item
5183     and generate a repeat item instead. If a char item has a minimum of more
5184     than one, ensure that it is set in reqcu - it might not be if a sequence
5185     such as x{3} is the first thing in a branch because the x will have gone
5186     into firstcu instead.  */
5187 
5188     if (*previous == OP_CHAR || *previous == OP_CHARI
5189         || *previous == OP_NOT || *previous == OP_NOTI)
5190       {
5191       switch (*previous)
5192         {
5193         default: /* Make compiler happy. */
5194         case OP_CHAR:  op_type = OP_STAR - OP_STAR; break;
5195         case OP_CHARI: op_type = OP_STARI - OP_STAR; break;
5196         case OP_NOT:   op_type = OP_NOTSTAR - OP_STAR; break;
5197         case OP_NOTI:  op_type = OP_NOTSTARI - OP_STAR; break;
5198         }
5199 
5200       /* Deal with UTF characters that take up more than one code unit. It's
5201       easier to write this out separately than try to macrify it. Use c to
5202       hold the length of the character in code units, plus UTF_LENGTH to flag
5203       that it's a length rather than a small character. */
5204 
5205 #ifdef MAYBE_UTF_MULTI
5206       if (utf && NOT_FIRSTCU(code[-1]))
5207         {
5208         PCRE2_UCHAR *lastchar = code - 1;
5209         BACKCHAR(lastchar);
5210         c = (int)(code - lastchar);               /* Length of UTF character */
5211         memcpy(utf_units, lastchar, CU2BYTES(c)); /* Save the char */
5212         c |= UTF_LENGTH;                          /* Flag c as a length */
5213         }
5214       else
5215 #endif  /* MAYBE_UTF_MULTI */
5216 
5217       /* Handle the case of a single character - either with no UTF support, or
5218       with UTF disabled, or for a single-code-unit UTF character. */
5219         {
5220         c = code[-1];
5221         if (*previous <= OP_CHARI && repeat_min > 1)
5222           {
5223           reqcu = c;
5224           reqcuflags = req_caseopt | cb->req_varyopt;
5225           }
5226         }
5227 
5228       goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
5229       }
5230 
5231     /* If previous was a character type match (\d or similar), abolish it and
5232     create a suitable repeat item. The code is shared with single-character
5233     repeats by setting op_type to add a suitable offset into repeat_type. Note
5234     that the Unicode property types will be present only when SUPPORT_UNICODE is
5235     defined, but we don't wrap the little bits of code here because it just
5236     makes it horribly messy. */
5237 
5238     else if (*previous < OP_EODN)
5239       {
5240       PCRE2_UCHAR *oldcode;
5241       int prop_type, prop_value;
5242       op_type = OP_TYPESTAR - OP_STAR;      /* Use type opcodes */
5243       c = *previous;                        /* Save previous opcode */
5244       if (c == OP_PROP || c == OP_NOTPROP)
5245         {
5246         prop_type = previous[1];
5247         prop_value = previous[2];
5248         }
5249       else
5250         {
5251         /* Come here from just above with a character in c */
5252         OUTPUT_SINGLE_REPEAT:
5253         prop_type = prop_value = -1;
5254         }
5255 
5256       /* At this point we either have prop_type == prop_value == -1 and either
5257       a code point or a character type that is not OP_[NOT]PROP in c, or we
5258       have OP_[NOT]PROP in c and prop_type/prop_value not negative. */
5259 
5260       oldcode = code;                   /* Save where we were */
5261       code = previous;                  /* Usually overwrite previous item */
5262 
5263       /* If the maximum is zero then the minimum must also be zero; Perl allows
5264       this case, so we do too - by simply omitting the item altogether. */
5265 
5266       if (repeat_max == 0) goto END_REPEAT;
5267 
5268       /* Combine the op_type with the repeat_type */
5269 
5270       repeat_type += op_type;
5271 
5272       /* A minimum of zero is handled either as the special case * or ?, or as
5273       an UPTO, with the maximum given. */
5274 
5275       if (repeat_min == 0)
5276         {
5277         if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
5278           else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
5279         else
5280           {
5281           *code++ = OP_UPTO + repeat_type;
5282           PUT2INC(code, 0, repeat_max);
5283           }
5284         }
5285 
5286       /* A repeat minimum of 1 is optimized into some special cases. If the
5287       maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
5288       left in place and, if the maximum is greater than 1, we use OP_UPTO with
5289       one less than the maximum. */
5290 
5291       else if (repeat_min == 1)
5292         {
5293         if (repeat_max == -1)
5294           *code++ = OP_PLUS + repeat_type;
5295         else
5296           {
5297           code = oldcode;                 /* Leave previous item in place */
5298           if (repeat_max == 1) goto END_REPEAT;
5299           *code++ = OP_UPTO + repeat_type;
5300           PUT2INC(code, 0, repeat_max - 1);
5301           }
5302         }
5303 
5304       /* The case {n,n} is just an EXACT, while the general case {n,m} is
5305       handled as an EXACT followed by an UPTO or STAR or QUERY. */
5306 
5307       else
5308         {
5309         *code++ = OP_EXACT + op_type;  /* NB EXACT doesn't have repeat_type */
5310         PUT2INC(code, 0, repeat_min);
5311 
5312         /* Unless repeat_max equals repeat_min, fill in the data for EXACT, and
5313         then generate the second opcode. In UTF mode, multi-code-unit
5314         characters have their length in c, with the UTF_LENGTH bit as a flag,
5315         and the code units in utf_units. For a repeated Unicode property match,
5316         there are two extra values that define the required property, and c
5317         never has the UTF_LENGTH bit set. */
5318 
5319         if (repeat_max != repeat_min)
5320           {
5321 #ifdef MAYBE_UTF_MULTI
5322           if (utf && (c & UTF_LENGTH) != 0)
5323             {
5324             memcpy(code, utf_units, CU2BYTES(c & 7));
5325             code += c & 7;
5326             }
5327           else
5328 #endif  /* MAYBE_UTF_MULTI */
5329             {
5330             *code++ = c;
5331             if (prop_type >= 0)
5332               {
5333               *code++ = prop_type;
5334               *code++ = prop_value;
5335               }
5336             }
5337 
5338           /* Now set up the following opcode */
5339 
5340           if (repeat_max < 0) *code++ = OP_STAR + repeat_type; else
5341             {
5342             repeat_max -= repeat_min;
5343             if (repeat_max == 1)
5344               {
5345               *code++ = OP_QUERY + repeat_type;
5346               }
5347             else
5348               {
5349               *code++ = OP_UPTO + repeat_type;
5350               PUT2INC(code, 0, repeat_max);
5351               }
5352             }
5353           }
5354         }
5355 
5356       /* Fill in the character or character type for the final opcode. */
5357 
5358 #ifdef MAYBE_UTF_MULTI
5359       if (utf && (c & UTF_LENGTH) != 0)
5360         {
5361         memcpy(code, utf_units, CU2BYTES(c & 7));
5362         code += c & 7;
5363         }
5364       else
5365 #endif  /* MAYBE_UTF_MULTI */
5366         {
5367         *code++ = c;
5368         if (prop_type >= 0)
5369           {
5370           *code++ = prop_type;
5371           *code++ = prop_value;
5372           }
5373         }
5374       }
5375 
5376     /* If previous was a character class or a back reference, we put the repeat
5377     stuff after it, but just skip the item if the repeat was {0,0}. */
5378 
5379     else if (*previous == OP_CLASS || *previous == OP_NCLASS ||
5380 #ifdef SUPPORT_WIDE_CHARS
5381              *previous == OP_XCLASS ||
5382 #endif
5383              *previous == OP_REF   || *previous == OP_REFI ||
5384              *previous == OP_DNREF || *previous == OP_DNREFI)
5385       {
5386       if (repeat_max == 0)
5387         {
5388         code = previous;
5389         goto END_REPEAT;
5390         }
5391 
5392       if (repeat_min == 0 && repeat_max == -1)
5393         *code++ = OP_CRSTAR + repeat_type;
5394       else if (repeat_min == 1 && repeat_max == -1)
5395         *code++ = OP_CRPLUS + repeat_type;
5396       else if (repeat_min == 0 && repeat_max == 1)
5397         *code++ = OP_CRQUERY + repeat_type;
5398       else
5399         {
5400         *code++ = OP_CRRANGE + repeat_type;
5401         PUT2INC(code, 0, repeat_min);
5402         if (repeat_max == -1) repeat_max = 0;  /* 2-byte encoding for max */
5403         PUT2INC(code, 0, repeat_max);
5404         }
5405       }
5406 
5407     /* If previous was a bracket group, we may have to replicate it in certain
5408     cases. Note that at this point we can encounter only the "basic" bracket
5409     opcodes such as BRA and CBRA, as this is the place where they get converted
5410     into the more special varieties such as BRAPOS and SBRA. A test for >=
5411     OP_ASSERT and <= OP_COND includes ASSERT, ASSERT_NOT, ASSERTBACK,
5412     ASSERTBACK_NOT, ONCE, ONCE_NC, BRA, BRAPOS, CBRA, CBRAPOS, and COND.
5413     Originally, PCRE did not allow repetition of assertions, but now it does,
5414     for Perl compatibility. */
5415 
5416     else if (*previous >= OP_ASSERT && *previous <= OP_COND)
5417       {
5418       register int i;
5419       int len = (int)(code - previous);
5420       PCRE2_UCHAR *bralink = NULL;
5421       PCRE2_UCHAR *brazeroptr = NULL;
5422 
5423       /* Repeating a DEFINE group (or any group where the condition is always
5424       FALSE and there is only one branch) is pointless, but Perl allows the
5425       syntax, so we just ignore the repeat. */
5426 
5427       if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_FALSE &&
5428           previous[GET(previous, 1)] != OP_ALT)
5429         goto END_REPEAT;
5430 
5431       /* There is no sense in actually repeating assertions. The only potential
5432       use of repetition is in cases when the assertion is optional. Therefore,
5433       if the minimum is greater than zero, just ignore the repeat. If the
5434       maximum is not zero or one, set it to 1. */
5435 
5436       if (*previous < OP_ONCE)    /* Assertion */
5437         {
5438         if (repeat_min > 0) goto END_REPEAT;
5439         if (repeat_max < 0 || repeat_max > 1) repeat_max = 1;
5440         }
5441 
5442       /* The case of a zero minimum is special because of the need to stick
5443       OP_BRAZERO in front of it, and because the group appears once in the
5444       data, whereas in other cases it appears the minimum number of times. For
5445       this reason, it is simplest to treat this case separately, as otherwise
5446       the code gets far too messy. There are several special subcases when the
5447       minimum is zero. */
5448 
5449       if (repeat_min == 0)
5450         {
5451         /* If the maximum is also zero, we used to just omit the group from the
5452         output altogether, like this:
5453 
5454         ** if (repeat_max == 0)
5455         **   {
5456         **   code = previous;
5457         **   goto END_REPEAT;
5458         **   }
5459 
5460         However, that fails when a group or a subgroup within it is referenced
5461         as a subroutine from elsewhere in the pattern, so now we stick in
5462         OP_SKIPZERO in front of it so that it is skipped on execution. As we
5463         don't have a list of which groups are referenced, we cannot do this
5464         selectively.
5465 
5466         If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
5467         and do no more at this point. */
5468 
5469         if (repeat_max <= 1)    /* Covers 0, 1, and unlimited */
5470           {
5471           memmove(previous + 1, previous, CU2BYTES(len));
5472           code++;
5473           if (repeat_max == 0)
5474             {
5475             *previous++ = OP_SKIPZERO;
5476             goto END_REPEAT;
5477             }
5478           brazeroptr = previous;    /* Save for possessive optimizing */
5479           *previous++ = OP_BRAZERO + repeat_type;
5480           }
5481 
5482         /* If the maximum is greater than 1 and limited, we have to replicate
5483         in a nested fashion, sticking OP_BRAZERO before each set of brackets.
5484         The first one has to be handled carefully because it's the original
5485         copy, which has to be moved up. The remainder can be handled by code
5486         that is common with the non-zero minimum case below. We have to
5487         adjust the value of repeat_max, since one less copy is required. */
5488 
5489         else
5490           {
5491           int offset;
5492           memmove(previous + 2 + LINK_SIZE, previous, CU2BYTES(len));
5493           code += 2 + LINK_SIZE;
5494           *previous++ = OP_BRAZERO + repeat_type;
5495           *previous++ = OP_BRA;
5496 
5497           /* We chain together the bracket offset fields that have to be
5498           filled in later when the ends of the brackets are reached. */
5499 
5500           offset = (bralink == NULL)? 0 : (int)(previous - bralink);
5501           bralink = previous;
5502           PUTINC(previous, 0, offset);
5503           }
5504 
5505         repeat_max--;
5506         }
5507 
5508       /* If the minimum is greater than zero, replicate the group as many
5509       times as necessary, and adjust the maximum to the number of subsequent
5510       copies that we need. */
5511 
5512       else
5513         {
5514         if (repeat_min > 1)
5515           {
5516           /* In the pre-compile phase, we don't actually do the replication. We
5517           just adjust the length as if we had. Do some paranoid checks for
5518           potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit
5519           integer type when available, otherwise double. */
5520 
5521           if (lengthptr != NULL)
5522             {
5523             size_t delta = (repeat_min - 1)*length_prevgroup;
5524             if ((INT64_OR_DOUBLE)(repeat_min - 1)*
5525                   (INT64_OR_DOUBLE)length_prevgroup >
5526                     (INT64_OR_DOUBLE)INT_MAX ||
5527                 OFLOW_MAX - *lengthptr < delta)
5528               {
5529               *errorcodeptr = ERR20;
5530               goto FAILED;
5531               }
5532             *lengthptr += delta;
5533             }
5534 
5535           /* This is compiling for real. If there is a set first byte for
5536           the group, and we have not yet set a "required byte", set it. */
5537 
5538           else
5539             {
5540             if (groupsetfirstcu && reqcuflags < 0)
5541               {
5542               reqcu = firstcu;
5543               reqcuflags = firstcuflags;
5544               }
5545             for (i = 1; i < repeat_min; i++)
5546               {
5547               memcpy(code, previous, CU2BYTES(len));
5548               code += len;
5549               }
5550             }
5551           }
5552 
5553         if (repeat_max > 0) repeat_max -= repeat_min;
5554         }
5555 
5556       /* This code is common to both the zero and non-zero minimum cases. If
5557       the maximum is limited, it replicates the group in a nested fashion,
5558       remembering the bracket starts on a stack. In the case of a zero minimum,
5559       the first one was set up above. In all cases the repeat_max now specifies
5560       the number of additional copies needed. Again, we must remember to
5561       replicate entries on the forward reference list. */
5562 
5563       if (repeat_max >= 0)
5564         {
5565         /* In the pre-compile phase, we don't actually do the replication. We
5566         just adjust the length as if we had. For each repetition we must add 1
5567         to the length for BRAZERO and for all but the last repetition we must
5568         add 2 + 2*LINKSIZE to allow for the nesting that occurs. Do some
5569         paranoid checks to avoid integer overflow. The INT64_OR_DOUBLE type is
5570         a 64-bit integer type when available, otherwise double. */
5571 
5572         if (lengthptr != NULL && repeat_max > 0)
5573           {
5574           size_t delta = repeat_max*(length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
5575                       2 - 2*LINK_SIZE;   /* Last one doesn't nest */
5576           if ((INT64_OR_DOUBLE)repeat_max *
5577                 (INT64_OR_DOUBLE)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
5578                   > (INT64_OR_DOUBLE)INT_MAX ||
5579               OFLOW_MAX - *lengthptr < delta)
5580             {
5581             *errorcodeptr = ERR20;
5582             goto FAILED;
5583             }
5584           *lengthptr += delta;
5585           }
5586 
5587         /* This is compiling for real */
5588 
5589         else for (i = repeat_max - 1; i >= 0; i--)
5590           {
5591           *code++ = OP_BRAZERO + repeat_type;
5592 
5593           /* All but the final copy start a new nesting, maintaining the
5594           chain of brackets outstanding. */
5595 
5596           if (i != 0)
5597             {
5598             int offset;
5599             *code++ = OP_BRA;
5600             offset = (bralink == NULL)? 0 : (int)(code - bralink);
5601             bralink = code;
5602             PUTINC(code, 0, offset);
5603             }
5604 
5605           memcpy(code, previous, CU2BYTES(len));
5606           code += len;
5607           }
5608 
5609         /* Now chain through the pending brackets, and fill in their length
5610         fields (which are holding the chain links pro tem). */
5611 
5612         while (bralink != NULL)
5613           {
5614           int oldlinkoffset;
5615           int offset = (int)(code - bralink + 1);
5616           PCRE2_UCHAR *bra = code - offset;
5617           oldlinkoffset = GET(bra, 1);
5618           bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
5619           *code++ = OP_KET;
5620           PUTINC(code, 0, offset);
5621           PUT(bra, 1, offset);
5622           }
5623         }
5624 
5625       /* If the maximum is unlimited, set a repeater in the final copy. For
5626       ONCE brackets, that's all we need to do. However, possessively repeated
5627       ONCE brackets can be converted into non-capturing brackets, as the
5628       behaviour of (?:xx)++ is the same as (?>xx)++ and this saves having to
5629       deal with possessive ONCEs specially.
5630 
5631       Otherwise, when we are doing the actual compile phase, check to see
5632       whether this group is one that could match an empty string. If so,
5633       convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
5634       that runtime checking can be done. [This check is also applied to ONCE
5635       groups at runtime, but in a different way.]
5636 
5637       Then, if the quantifier was possessive and the bracket is not a
5638       conditional, we convert the BRA code to the POS form, and the KET code to
5639       KETRPOS. (It turns out to be convenient at runtime to detect this kind of
5640       subpattern at both the start and at the end.) The use of special opcodes
5641       makes it possible to reduce greatly the stack usage in pcre2_match(). If
5642       the group is preceded by OP_BRAZERO, convert this to OP_BRAPOSZERO.
5643 
5644       Then, if the minimum number of matches is 1 or 0, cancel the possessive
5645       flag so that the default action below, of wrapping everything inside
5646       atomic brackets, does not happen. When the minimum is greater than 1,
5647       there will be earlier copies of the group, and so we still have to wrap
5648       the whole thing. */
5649 
5650       else
5651         {
5652         PCRE2_UCHAR *ketcode = code - 1 - LINK_SIZE;
5653         PCRE2_UCHAR *bracode = ketcode - GET(ketcode, 1);
5654 
5655         /* Convert possessive ONCE brackets to non-capturing */
5656 
5657         if ((*bracode == OP_ONCE || *bracode == OP_ONCE_NC) &&
5658             possessive_quantifier) *bracode = OP_BRA;
5659 
5660         /* For non-possessive ONCE brackets, all we need to do is to
5661         set the KET. */
5662 
5663         if (*bracode == OP_ONCE || *bracode == OP_ONCE_NC)
5664           *ketcode = OP_KETRMAX + repeat_type;
5665 
5666         /* Handle non-ONCE brackets and possessive ONCEs (which have been
5667         converted to non-capturing above). */
5668 
5669         else
5670           {
5671           /* In the compile phase, check whether the group could match an empty
5672           string. */
5673 
5674           if (lengthptr == NULL)
5675             {
5676             PCRE2_UCHAR *scode = bracode;
5677             do
5678               {
5679               int count = 0;
5680               int rc = could_be_empty_branch(scode, ketcode, utf, cb, FALSE,
5681                 NULL, &count);
5682               if (rc < 0)
5683                 {
5684                 *errorcodeptr = ERR86;
5685                 goto FAILED;
5686                 }
5687               if (rc > 0)
5688                 {
5689                 *bracode += OP_SBRA - OP_BRA;
5690                 break;
5691                 }
5692               scode += GET(scode, 1);
5693               }
5694             while (*scode == OP_ALT);
5695 
5696             /* A conditional group with only one branch has an implicit empty
5697             alternative branch. */
5698 
5699             if (*bracode == OP_COND && bracode[GET(bracode,1)] != OP_ALT)
5700               *bracode = OP_SCOND;
5701             }
5702 
5703           /* Handle possessive quantifiers. */
5704 
5705           if (possessive_quantifier)
5706             {
5707             /* For COND brackets, we wrap the whole thing in a possessively
5708             repeated non-capturing bracket, because we have not invented POS
5709             versions of the COND opcodes. */
5710 
5711             if (*bracode == OP_COND || *bracode == OP_SCOND)
5712               {
5713               int nlen = (int)(code - bracode);
5714               memmove(bracode + 1 + LINK_SIZE, bracode, CU2BYTES(nlen));
5715               code += 1 + LINK_SIZE;
5716               nlen += 1 + LINK_SIZE;
5717               *bracode = (*bracode == OP_COND)? OP_BRAPOS : OP_SBRAPOS;
5718               *code++ = OP_KETRPOS;
5719               PUTINC(code, 0, nlen);
5720               PUT(bracode, 1, nlen);
5721               }
5722 
5723             /* For non-COND brackets, we modify the BRA code and use KETRPOS. */
5724 
5725             else
5726               {
5727               *bracode += 1;              /* Switch to xxxPOS opcodes */
5728               *ketcode = OP_KETRPOS;
5729               }
5730 
5731             /* If the minimum is zero, mark it as possessive, then unset the
5732             possessive flag when the minimum is 0 or 1. */
5733 
5734             if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;
5735             if (repeat_min < 2) possessive_quantifier = FALSE;
5736             }
5737 
5738           /* Non-possessive quantifier */
5739 
5740           else *ketcode = OP_KETRMAX + repeat_type;
5741           }
5742         }
5743       }
5744 
5745     /* If previous is OP_FAIL, it was generated by an empty class []
5746     (PCRE2_ALLOW_EMPTY_CLASS is set). The other ways in which OP_FAIL can be
5747     generated, that is by (*FAIL) or (?!), set previous to NULL, which gives a
5748     "nothing to repeat" error above. We can just ignore the repeat in the empty
5749     class case. */
5750 
5751     else if (*previous == OP_FAIL) goto END_REPEAT;
5752 
5753     /* Else there's some kind of shambles */
5754 
5755     else
5756       {
5757       *errorcodeptr = ERR10;
5758       goto FAILED;
5759       }
5760 
5761     /* If the character following a repeat is '+', possessive_quantifier is
5762     TRUE. For some opcodes, there are special alternative opcodes for this
5763     case. For anything else, we wrap the entire repeated item inside OP_ONCE
5764     brackets. Logically, the '+' notation is just syntactic sugar, taken from
5765     Sun's Java package, but the special opcodes can optimize it.
5766 
5767     Some (but not all) possessively repeated subpatterns have already been
5768     completely handled in the code just above. For them, possessive_quantifier
5769     is always FALSE at this stage. Note that the repeated item starts at
5770     tempcode, not at previous, which might be the first part of a string whose
5771     (former) last char we repeated. */
5772 
5773     if (possessive_quantifier)
5774       {
5775       int len;
5776 
5777       /* Possessifying an EXACT quantifier has no effect, so we can ignore it.
5778       However, QUERY, STAR, or UPTO may follow (for quantifiers such as {5,6},
5779       {5,}, or {5,10}). We skip over an EXACT item; if the length of what
5780       remains is greater than zero, there's a further opcode that can be
5781       handled. If not, do nothing, leaving the EXACT alone. */
5782 
5783       switch(*tempcode)
5784         {
5785         case OP_TYPEEXACT:
5786         tempcode += PRIV(OP_lengths)[*tempcode] +
5787           ((tempcode[1 + IMM2_SIZE] == OP_PROP
5788           || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
5789         break;
5790 
5791         /* CHAR opcodes are used for exacts whose count is 1. */
5792 
5793         case OP_CHAR:
5794         case OP_CHARI:
5795         case OP_NOT:
5796         case OP_NOTI:
5797         case OP_EXACT:
5798         case OP_EXACTI:
5799         case OP_NOTEXACT:
5800         case OP_NOTEXACTI:
5801         tempcode += PRIV(OP_lengths)[*tempcode];
5802 #ifdef SUPPORT_UNICODE
5803         if (utf && HAS_EXTRALEN(tempcode[-1]))
5804           tempcode += GET_EXTRALEN(tempcode[-1]);
5805 #endif
5806         break;
5807 
5808         /* For the class opcodes, the repeat operator appears at the end;
5809         adjust tempcode to point to it. */
5810 
5811         case OP_CLASS:
5812         case OP_NCLASS:
5813         tempcode += 1 + 32/sizeof(PCRE2_UCHAR);
5814         break;
5815 
5816 #ifdef SUPPORT_WIDE_CHARS
5817         case OP_XCLASS:
5818         tempcode += GET(tempcode, 1);
5819         break;
5820 #endif
5821         }
5822 
5823       /* If tempcode is equal to code (which points to the end of the repeated
5824       item), it means we have skipped an EXACT item but there is no following
5825       QUERY, STAR, or UPTO; the value of len will be 0, and we do nothing. In
5826       all other cases, tempcode will be pointing to the repeat opcode, and will
5827       be less than code, so the value of len will be greater than 0. */
5828 
5829       len = (int)(code - tempcode);
5830       if (len > 0)
5831         {
5832         unsigned int repcode = *tempcode;
5833 
5834         /* There is a table for possessifying opcodes, all of which are less
5835         than OP_CALLOUT. A zero entry means there is no possessified version.
5836         */
5837 
5838         if (repcode < OP_CALLOUT && opcode_possessify[repcode] > 0)
5839           *tempcode = opcode_possessify[repcode];
5840 
5841         /* For opcode without a special possessified version, wrap the item in
5842         ONCE brackets. */
5843 
5844         else
5845           {
5846           memmove(tempcode + 1 + LINK_SIZE, tempcode, CU2BYTES(len));
5847           code += 1 + LINK_SIZE;
5848           len += 1 + LINK_SIZE;
5849           tempcode[0] = OP_ONCE;
5850           *code++ = OP_KET;
5851           PUTINC(code, 0, len);
5852           PUT(tempcode, 1, len);
5853           }
5854         }
5855       }
5856 
5857     /* In all cases we no longer have a previous item. We also set the
5858     "follows varying string" flag for subsequently encountered reqcus if
5859     it isn't already set and we have just passed a varying length item. */
5860 
5861     END_REPEAT:
5862     previous = NULL;
5863     cb->req_varyopt |= reqvary;
5864     break;
5865 
5866 
5867     /* ===================================================================*/
5868     /* Start of nested parenthesized sub-expression, or lookahead or lookbehind
5869     or option setting or condition or all the other extended parenthesis forms.
5870     We must save the current high-water-mark for the forward reference list so
5871     that we know where they start for this group. However, because the list may
5872     be extended when there are very many forward references (usually the result
5873     of a replicated inner group), we must use an offset rather than an absolute
5874     address. Note that (?# comments are dealt with at the top of the loop;
5875     they do not get this far. */
5876 
5877     case CHAR_LEFT_PARENTHESIS:
5878     ptr++;
5879 
5880     /* Deal with various "verbs" that can be introduced by '*'. */
5881 
5882     if (ptr[0] == CHAR_ASTERISK && (ptr[1] == ':'
5883          || (MAX_255(ptr[1]) && ((cb->ctypes[ptr[1]] & ctype_letter) != 0))))
5884       {
5885       int i, namelen;
5886       int arglen = 0;
5887       const char *vn = verbnames;
5888       PCRE2_SPTR name = ptr + 1;
5889       PCRE2_SPTR arg = NULL;
5890       previous = NULL;
5891       ptr++;
5892 
5893       /* Increment ptr, set namelen, check length */
5894 
5895       READ_NAME(ctype_letter, ERR60, *errorcodeptr);
5896 
5897       /* It appears that Perl allows any characters whatsoever, other than
5898       a closing parenthesis, to appear in arguments, so we no longer insist on
5899       letters, digits, and underscores. Perl does not, however, do any
5900       interpretation within arguments, and has no means of including a closing
5901       parenthesis. PCRE supports escape processing but only when it is
5902       requested by an option. Note that check_escape() will not return values
5903       greater than the code unit maximum when not in UTF mode. */
5904 
5905       if (*ptr == CHAR_COLON)
5906         {
5907         arg = ++ptr;
5908 
5909         if ((options & PCRE2_ALT_VERBNAMES) == 0)
5910           {
5911           arglen = 0;
5912           while (ptr < cb->end_pattern && *ptr != CHAR_RIGHT_PARENTHESIS)
5913             {
5914             ptr++;                                /* Check length as we go */
5915             arglen++;                             /* along, to avoid the   */
5916             if ((unsigned int)arglen > MAX_MARK)  /* possibility of overflow. */
5917               {
5918               *errorcodeptr = ERR76;
5919               goto FAILED;
5920               }
5921             }
5922           }
5923         else
5924           {
5925           /* The length check is in process_verb_name() */
5926           arglen = process_verb_name(&ptr, NULL, errorcodeptr, options,
5927             utf, cb);
5928           if (arglen < 0) goto FAILED;
5929           }
5930         }
5931 
5932       if (*ptr != CHAR_RIGHT_PARENTHESIS)
5933         {
5934         *errorcodeptr = ERR60;
5935         goto FAILED;
5936         }
5937 
5938       /* Scan the table of verb names */
5939 
5940       for (i = 0; i < verbcount; i++)
5941         {
5942         if (namelen == verbs[i].len &&
5943             PRIV(strncmp_c8)(name, vn, namelen) == 0)
5944           {
5945           int setverb;
5946 
5947           /* Check for open captures before ACCEPT and convert it to
5948           ASSERT_ACCEPT if in an assertion. */
5949 
5950           if (verbs[i].op == OP_ACCEPT)
5951             {
5952             open_capitem *oc;
5953             if (arglen != 0)
5954               {
5955               *errorcodeptr = ERR59;
5956               goto FAILED;
5957               }
5958             cb->had_accept = TRUE;
5959 
5960             /* In the first pass, just accumulate the length required;
5961             otherwise hitting (*ACCEPT) inside many nested parentheses can
5962             cause workspace overflow. */
5963 
5964             for (oc = cb->open_caps; oc != NULL; oc = oc->next)
5965               {
5966               if (lengthptr != NULL)
5967                 {
5968                 *lengthptr += CU2BYTES(1) + IMM2_SIZE;
5969                 }
5970               else
5971                 {
5972                 *code++ = OP_CLOSE;
5973                 PUT2INC(code, 0, oc->number);
5974                 }
5975               }
5976             setverb = *code++ =
5977               (cb->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
5978 
5979             /* Do not set firstcu after *ACCEPT */
5980             if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
5981             }
5982 
5983           /* Handle other cases with/without an argument */
5984 
5985           else if (arglen == 0)    /* There is no argument */
5986             {
5987             if (verbs[i].op < 0)   /* Argument is mandatory */
5988               {
5989               *errorcodeptr = ERR66;
5990               goto FAILED;
5991               }
5992             setverb = *code++ = verbs[i].op;
5993             }
5994 
5995           else                        /* An argument is present */
5996             {
5997             if (verbs[i].op_arg < 0)  /* Argument is forbidden */
5998               {
5999               *errorcodeptr = ERR59;
6000               goto FAILED;
6001               }
6002             setverb = *code++ = verbs[i].op_arg;
6003 
6004             /* Arguments can be very long, especially in 16- and 32-bit modes,
6005             and can overflow the workspace in the first pass. Instead of
6006             putting the argument into memory, we just update the length counter
6007             and set up an empty argument. */
6008 
6009             if (lengthptr != NULL)
6010               {
6011               *lengthptr += arglen;
6012               *code++ = 0;
6013               }
6014             else
6015               {
6016               *code++ = arglen;
6017               if ((options & PCRE2_ALT_VERBNAMES) != 0)
6018                 {
6019                 PCRE2_UCHAR *memcode = code;  /* code is "register" */
6020                 (void)process_verb_name(&arg, &memcode, errorcodeptr, options,
6021                   utf, cb);
6022                 code = memcode;
6023                 }
6024               else   /* No argument processing */
6025                 {
6026                 memcpy(code, arg, CU2BYTES(arglen));
6027                 code += arglen;
6028                 }
6029               }
6030 
6031             *code++ = 0;
6032             }
6033 
6034           switch (setverb)
6035             {
6036             case OP_THEN:
6037             case OP_THEN_ARG:
6038             cb->external_flags |= PCRE2_HASTHEN;
6039             break;
6040 
6041             case OP_PRUNE:
6042             case OP_PRUNE_ARG:
6043             case OP_SKIP:
6044             case OP_SKIP_ARG:
6045             cb->had_pruneorskip = TRUE;
6046             break;
6047             }
6048 
6049           break;  /* Found verb, exit loop */
6050           }
6051 
6052         vn += verbs[i].len + 1;
6053         }
6054 
6055       if (i < verbcount) continue;    /* Successfully handled a verb */
6056       *errorcodeptr = ERR60;          /* Verb not recognized */
6057       goto FAILED;
6058       }
6059 
6060     /* Initialization for "real" parentheses */
6061 
6062     newoptions = options;
6063     skipunits = 0;
6064     bravalue = OP_CBRA;
6065     reset_bracount = FALSE;
6066 
6067     /* Deal with the extended parentheses; all are introduced by '?', and the
6068     appearance of any of them means that this is not a capturing group. */
6069 
6070     if (*ptr == CHAR_QUESTION_MARK)
6071       {
6072       int i, count;
6073       int namelen;                /* Must be signed */
6074       uint32_t index;
6075       uint32_t set, unset, *optset;
6076       named_group *ng;
6077       PCRE2_SPTR name;
6078       PCRE2_UCHAR *slot;
6079 
6080       switch (*(++ptr))
6081         {
6082         /* ------------------------------------------------------------ */
6083         case CHAR_VERTICAL_LINE:  /* Reset capture count for each branch */
6084         reset_bracount = TRUE;
6085         /* Fall through */
6086 
6087         /* ------------------------------------------------------------ */
6088         case CHAR_COLON:          /* Non-capturing bracket */
6089         bravalue = OP_BRA;
6090         ptr++;
6091         break;
6092 
6093         /* ------------------------------------------------------------ */
6094         case CHAR_LEFT_PARENTHESIS:
6095         bravalue = OP_COND;       /* Conditional group */
6096         tempptr = ptr;
6097 
6098         /* A condition can be an assertion, a number (referring to a numbered
6099         group's having been set), a name (referring to a named group), or 'R',
6100         referring to recursion. R<digits> and R&name are also permitted for
6101         recursion tests.
6102 
6103         There are ways of testing a named group: (?(name)) is used by Python;
6104         Perl 5.10 onwards uses (?(<name>) or (?('name')).
6105 
6106         There is one unfortunate ambiguity, caused by history. 'R' can be the
6107         recursive thing or the name 'R' (and similarly for 'R' followed by
6108         digits). We look for a name first; if not found, we try the other case.
6109 
6110         For compatibility with auto-callouts, we allow a callout to be
6111         specified before a condition that is an assertion. First, check for the
6112         syntax of a callout; if found, adjust the temporary pointer that is
6113         used to check for an assertion condition. That's all that is needed! */
6114 
6115         if (ptr[1] == CHAR_QUESTION_MARK && ptr[2] == CHAR_C)
6116           {
6117           if (IS_DIGIT(ptr[3]) || ptr[3] == CHAR_RIGHT_PARENTHESIS)
6118             {
6119             for (i = 3;; i++) if (!IS_DIGIT(ptr[i])) break;
6120             if (ptr[i] == CHAR_RIGHT_PARENTHESIS)
6121               tempptr += i + 1;
6122             }
6123           else
6124             {
6125             uint32_t delimiter = 0;
6126             for (i = 0; PRIV(callout_start_delims)[i] != 0; i++)
6127               {
6128               if (ptr[3] == PRIV(callout_start_delims)[i])
6129                 {
6130                 delimiter = PRIV(callout_end_delims)[i];
6131                 break;
6132                 }
6133               }
6134             if (delimiter != 0)
6135               {
6136               for (i = 4; ptr + i < cb->end_pattern; i++)
6137                 {
6138                 if (ptr[i] == delimiter)
6139                   {
6140                   if (ptr[i+1] == delimiter) i++;
6141                   else
6142                     {
6143                     if (ptr[i+1] == CHAR_RIGHT_PARENTHESIS) tempptr += i + 2;
6144                     break;
6145                     }
6146                   }
6147                 }
6148               }
6149             }
6150 
6151           /* tempptr should now be pointing to the opening parenthesis of the
6152           assertion condition. */
6153 
6154           if (*tempptr != CHAR_LEFT_PARENTHESIS)
6155             {
6156             *errorcodeptr = ERR28;
6157             goto FAILED;
6158             }
6159           }
6160 
6161         /* For conditions that are assertions, check the syntax, and then exit
6162         the switch. This will take control down to where bracketed groups
6163         are processed. The assertion will be handled as part of the group,
6164         but we need to identify this case because the conditional assertion may
6165         not be quantified. */
6166 
6167         if (tempptr[1] == CHAR_QUESTION_MARK &&
6168               (tempptr[2] == CHAR_EQUALS_SIGN ||
6169                tempptr[2] == CHAR_EXCLAMATION_MARK ||
6170                  (tempptr[2] == CHAR_LESS_THAN_SIGN &&
6171                    (tempptr[3] == CHAR_EQUALS_SIGN ||
6172                     tempptr[3] == CHAR_EXCLAMATION_MARK))))
6173           {
6174           cb->iscondassert = TRUE;
6175           break;
6176           }
6177 
6178         /* Other conditions use OP_CREF/OP_DNCREF/OP_RREF/OP_DNRREF, and all
6179         need to skip at least 1+IMM2_SIZE bytes at the start of the group. */
6180 
6181         code[1+LINK_SIZE] = OP_CREF;
6182         skipunits = 1+IMM2_SIZE;
6183         refsign = -1;     /* => not a number */
6184         namelen = -1;     /* => not a name; must set to avoid warning */
6185         name = NULL;      /* Always set to avoid warning */
6186         recno = 0;        /* Always set to avoid warning */
6187 
6188         /* Point at character after (?( */
6189 
6190         ptr++;
6191 
6192         /* Check for (?(VERSION[>]=n.m), which is a facility whereby indirect
6193         users of PCRE2 via an application can discover which release of PCRE2
6194         is being used. */
6195 
6196         if (PRIV(strncmp_c8)(ptr, STRING_VERSION, 7) == 0 &&
6197             ptr[7] != CHAR_RIGHT_PARENTHESIS)
6198           {
6199           BOOL ge = FALSE;
6200           int major = 0;
6201           int minor = 0;
6202 
6203           ptr += 7;
6204           if (*ptr == CHAR_GREATER_THAN_SIGN)
6205             {
6206             ge = TRUE;
6207             ptr++;
6208             }
6209 
6210           /* NOTE: cannot write IS_DIGIT(*(++ptr)) here because IS_DIGIT
6211           references its argument twice. */
6212 
6213           if (*ptr != CHAR_EQUALS_SIGN || (ptr++, !IS_DIGIT(*ptr)))
6214             {
6215             *errorcodeptr = ERR79;
6216             goto FAILED;
6217             }
6218 
6219           while (IS_DIGIT(*ptr)) major = major * 10 + *ptr++ - '0';
6220           if (*ptr == CHAR_DOT)
6221             {
6222             ptr++;
6223             while (IS_DIGIT(*ptr)) minor = minor * 10 + *ptr++ - '0';
6224             if (minor < 10) minor *= 10;
6225             }
6226 
6227           if (*ptr != CHAR_RIGHT_PARENTHESIS || minor > 99)
6228             {
6229             *errorcodeptr = ERR79;
6230             goto FAILED;
6231             }
6232 
6233           if (ge)
6234             code[1+LINK_SIZE] = ((PCRE2_MAJOR > major) ||
6235               (PCRE2_MAJOR == major && PCRE2_MINOR >= minor))?
6236                 OP_TRUE : OP_FALSE;
6237           else
6238             code[1+LINK_SIZE] = (PCRE2_MAJOR == major && PCRE2_MINOR == minor)?
6239               OP_TRUE : OP_FALSE;
6240 
6241           ptr++;
6242           skipunits = 1;
6243           break;  /* End of condition processing */
6244           }
6245 
6246         /* Check for a test for recursion in a named group. */
6247 
6248         if (*ptr == CHAR_R && ptr[1] == CHAR_AMPERSAND)
6249           {
6250           terminator = -1;
6251           ptr += 2;
6252           code[1+LINK_SIZE] = OP_RREF;    /* Change the type of test */
6253           }
6254 
6255         /* Check for a test for a named group's having been set, using the Perl
6256         syntax (?(<name>) or (?('name'), and also allow for the original PCRE
6257         syntax of (?(name) or for (?(+n), (?(-n), and just (?(n). */
6258 
6259         else if (*ptr == CHAR_LESS_THAN_SIGN)
6260           {
6261           terminator = CHAR_GREATER_THAN_SIGN;
6262           ptr++;
6263           }
6264         else if (*ptr == CHAR_APOSTROPHE)
6265           {
6266           terminator = CHAR_APOSTROPHE;
6267           ptr++;
6268           }
6269         else
6270           {
6271           terminator = CHAR_NULL;
6272           if (*ptr == CHAR_MINUS || *ptr == CHAR_PLUS) refsign = *ptr++;
6273             else if (IS_DIGIT(*ptr)) refsign = 0;
6274           }
6275 
6276         /* Handle a number */
6277 
6278         if (refsign >= 0)
6279           {
6280           while (IS_DIGIT(*ptr))
6281             {
6282             if (recno > INT_MAX / 10 - 1)  /* Integer overflow */
6283               {
6284               while (IS_DIGIT(*ptr)) ptr++;
6285               *errorcodeptr = ERR61;
6286               goto FAILED;
6287               }
6288             recno = recno * 10 + (int)(*ptr - CHAR_0);
6289             ptr++;
6290             }
6291           }
6292 
6293         /* Otherwise we expect to read a name; anything else is an error. When
6294         the referenced name is one of a number of duplicates, a different
6295         opcode is used and it needs more memory. Unfortunately we cannot tell
6296         whether this is the case in the first pass, so we have to allow for
6297         more memory always. In the second pass, the addition to skipunits
6298         happens later. */
6299 
6300         else
6301           {
6302           if (IS_DIGIT(*ptr))
6303             {
6304             *errorcodeptr = ERR44;  /* Group name must start with non-digit */
6305             goto FAILED;
6306             }
6307           if (!MAX_255(*ptr) || (cb->ctypes[*ptr] & ctype_word) == 0)
6308             {
6309             *errorcodeptr = ERR28;   /* Assertion expected */
6310             goto FAILED;
6311             }
6312           name = ptr;
6313           /* Increment ptr, set namelen, check length */
6314           READ_NAME(ctype_word, ERR48, *errorcodeptr);
6315           if (lengthptr != NULL) skipunits += IMM2_SIZE;
6316           }
6317 
6318         /* Check the terminator */
6319 
6320         if ((terminator > 0 && *ptr++ != (PCRE2_UCHAR)terminator) ||
6321             *ptr++ != CHAR_RIGHT_PARENTHESIS)
6322           {
6323           ptr--;                  /* Error offset */
6324           *errorcodeptr = ERR26;  /* Malformed number or name */
6325           goto FAILED;
6326           }
6327 
6328         /* Do no further checking in the pre-compile phase. */
6329 
6330         if (lengthptr != NULL) break;
6331 
6332         /* In the real compile we do the work of looking for the actual
6333         reference. If refsign is not negative, it means we have a number in
6334         recno. */
6335 
6336         if (refsign >= 0)
6337           {
6338           if (recno <= 0)
6339             {
6340             *errorcodeptr = ERR35;
6341             goto FAILED;
6342             }
6343           if (refsign != 0) recno = (refsign == CHAR_MINUS)?
6344             (cb->bracount + 1) - recno : recno + cb->bracount;
6345           if (recno <= 0 || (uint32_t)recno > cb->final_bracount)
6346             {
6347             *errorcodeptr = ERR15;
6348             goto FAILED;
6349             }
6350           PUT2(code, 2+LINK_SIZE, recno);
6351           if ((uint32_t)recno > cb->top_backref) cb->top_backref = recno;
6352           break;
6353           }
6354 
6355         /* Otherwise look for the name. */
6356 
6357         slot = cb->name_table;
6358         for (i = 0; i < cb->names_found; i++)
6359           {
6360           if (PRIV(strncmp)(name, slot+IMM2_SIZE, namelen) == 0) break;
6361           slot += cb->name_entry_size;
6362           }
6363 
6364         /* Found the named subpattern. If the name is duplicated, add one to
6365         the opcode to change CREF/RREF into DNCREF/DNRREF and insert
6366         appropriate data values. Otherwise, just insert the unique subpattern
6367         number. */
6368 
6369         if (i < cb->names_found)
6370           {
6371           int offset = i;            /* Offset of first name found */
6372 
6373           count = 0;
6374           for (;;)
6375             {
6376             recno = GET2(slot, 0);   /* Number for last found */
6377             if ((uint32_t)recno > cb->top_backref) cb->top_backref = recno;
6378             count++;
6379             if (++i >= cb->names_found) break;
6380             slot += cb->name_entry_size;
6381             if (PRIV(strncmp)(name, slot+IMM2_SIZE, namelen) != 0 ||
6382               (slot+IMM2_SIZE)[namelen] != 0) break;
6383             }
6384 
6385           if (count > 1)
6386             {
6387             PUT2(code, 2+LINK_SIZE, offset);
6388             PUT2(code, 2+LINK_SIZE+IMM2_SIZE, count);
6389             skipunits += IMM2_SIZE;
6390             code[1+LINK_SIZE]++;
6391             }
6392           else  /* Not a duplicated name */
6393             {
6394             PUT2(code, 2+LINK_SIZE, recno);
6395             }
6396           }
6397 
6398         /* If terminator == CHAR_NULL it means that the name followed directly
6399         after the opening parenthesis [e.g. (?(abc)...] and in this case there
6400         are some further alternatives to try. For the cases where terminator !=
6401         CHAR_NULL [things like (?(<name>... or (?('name')... or (?(R&name)... ]
6402         we have now checked all the possibilities, so give an error. */
6403 
6404         else if (terminator != CHAR_NULL)
6405           {
6406           *errorcodeptr = ERR15;
6407           goto FAILED;
6408           }
6409 
6410         /* Check for (?(R) for recursion. Allow digits after R to specify a
6411         specific group number. */
6412 
6413         else if (*name == CHAR_R)
6414           {
6415           recno = 0;
6416           for (i = 1; i < namelen; i++)
6417             {
6418             if (!IS_DIGIT(name[i]))
6419               {
6420               *errorcodeptr = ERR15;        /* Non-existent subpattern */
6421               goto FAILED;
6422               }
6423             if (recno > INT_MAX / 10 - 1)   /* Integer overflow */
6424               {
6425               *errorcodeptr = ERR61;
6426               goto FAILED;
6427               }
6428             recno = recno * 10 + name[i] - CHAR_0;
6429             }
6430           if (recno == 0) recno = RREF_ANY;
6431           code[1+LINK_SIZE] = OP_RREF;      /* Change test type */
6432           PUT2(code, 2+LINK_SIZE, recno);
6433           }
6434 
6435         /* Similarly, check for the (?(DEFINE) "condition", which is always
6436         false. During compilation we set OP_DEFINE to distinguish this from
6437         other OP_FALSE conditions so that it can be checked for having only one
6438         branch, but after that the opcode is changed to OP_FALSE. */
6439 
6440         else if (namelen == 6 && PRIV(strncmp_c8)(name, STRING_DEFINE, 6) == 0)
6441           {
6442           code[1+LINK_SIZE] = OP_DEFINE;
6443           skipunits = 1;
6444           }
6445 
6446         /* Reference to an unidentified subpattern. */
6447 
6448         else
6449           {
6450           *errorcodeptr = ERR15;
6451           goto FAILED;
6452           }
6453         break;
6454 
6455 
6456         /* ------------------------------------------------------------ */
6457         case CHAR_EQUALS_SIGN:                 /* Positive lookahead */
6458         bravalue = OP_ASSERT;
6459         cb->assert_depth += 1;
6460         ptr++;
6461         break;
6462 
6463         /* Optimize (?!) to (*FAIL) unless it is quantified - which is a weird
6464         thing to do, but Perl allows all assertions to be quantified, and when
6465         they contain capturing parentheses there may be a potential use for
6466         this feature. Not that that applies to a quantified (?!) but we allow
6467         it for uniformity. */
6468 
6469         /* ------------------------------------------------------------ */
6470         case CHAR_EXCLAMATION_MARK:            /* Negative lookahead */
6471         ptr++;
6472         if (*ptr == CHAR_RIGHT_PARENTHESIS && ptr[1] != CHAR_ASTERISK &&
6473              ptr[1] != CHAR_PLUS && ptr[1] != CHAR_QUESTION_MARK &&
6474             (ptr[1] != CHAR_LEFT_CURLY_BRACKET || !is_counted_repeat(ptr+2)))
6475           {
6476           *code++ = OP_FAIL;
6477           previous = NULL;
6478           continue;
6479           }
6480         bravalue = OP_ASSERT_NOT;
6481         cb->assert_depth += 1;
6482         break;
6483 
6484 
6485         /* ------------------------------------------------------------ */
6486         case CHAR_LESS_THAN_SIGN:              /* Lookbehind or named define */
6487         switch (ptr[1])
6488           {
6489           case CHAR_EQUALS_SIGN:               /* Positive lookbehind */
6490           bravalue = OP_ASSERTBACK;
6491           cb->assert_depth += 1;
6492           ptr += 2;
6493           break;
6494 
6495           case CHAR_EXCLAMATION_MARK:          /* Negative lookbehind */
6496           bravalue = OP_ASSERTBACK_NOT;
6497           cb->assert_depth += 1;
6498           ptr += 2;
6499           break;
6500 
6501           /* Must be a name definition - as the syntax was checked in the
6502           pre-pass, we can assume here that it is valid. Skip over the name
6503           and go to handle the numbered group. */
6504 
6505           default:
6506           while (*(++ptr) != CHAR_GREATER_THAN_SIGN);
6507           ptr++;
6508           goto NUMBERED_GROUP;
6509           }
6510         break;
6511 
6512 
6513         /* ------------------------------------------------------------ */
6514         case CHAR_GREATER_THAN_SIGN:           /* One-time brackets */
6515         bravalue = OP_ONCE;
6516         ptr++;
6517         break;
6518 
6519 
6520         /* ------------------------------------------------------------ */
6521         case CHAR_C:                 /* Callout */
6522         previous_callout = code;     /* Save for later completion */
6523         after_manual_callout = 1;    /* Skip one item before completing */
6524         ptr++;                       /* Character after (?C */
6525 
6526         /* A callout may have a string argument, delimited by one of a fixed
6527         number of characters, or an undelimited numerical argument, or no
6528         argument, which is the same as (?C0). Different opcodes are used for
6529         the two cases. */
6530 
6531         if (*ptr != CHAR_RIGHT_PARENTHESIS && !IS_DIGIT(*ptr))
6532           {
6533           uint32_t delimiter = 0;
6534 
6535           for (i = 0; PRIV(callout_start_delims)[i] != 0; i++)
6536             {
6537             if (*ptr == PRIV(callout_start_delims)[i])
6538               {
6539               delimiter = PRIV(callout_end_delims)[i];
6540               break;
6541               }
6542             }
6543 
6544           if (delimiter == 0)
6545             {
6546             *errorcodeptr = ERR82;
6547             goto FAILED;
6548             }
6549 
6550           /* During the pre-compile phase, we parse the string and update the
6551           length. There is no need to generate any code. (In fact, the string
6552           has already been parsed in the pre-pass that looks for named
6553           parentheses, but it does no harm to leave this code in.) */
6554 
6555           if (lengthptr != NULL)     /* Only check the string */
6556             {
6557             PCRE2_SPTR start = ptr;
6558             do
6559               {
6560               if (++ptr >= cb->end_pattern)
6561                 {
6562                 *errorcodeptr = ERR81;
6563                 ptr = start;   /* To give a more useful message */
6564                 goto FAILED;
6565                 }
6566               if (ptr[0] == delimiter && ptr[1] == delimiter) ptr += 2;
6567               }
6568             while (ptr[0] != delimiter);
6569 
6570             /* Start points to the opening delimiter, ptr points to the
6571             closing delimiter. We must allow for including the delimiter and
6572             for the terminating zero. Any doubled delimiters within the string
6573             make this an overestimate, but it is not worth bothering about. */
6574 
6575             (*lengthptr) += (ptr - start) + 2 + (1 + 4*LINK_SIZE);
6576             }
6577 
6578           /* In the real compile we can copy the string, knowing that it is
6579           syntactically OK. The starting delimiter is included so that the
6580           client can discover it if they want. We also pass the start offset to
6581           help a script language give better error messages. */
6582 
6583           else
6584             {
6585             PCRE2_UCHAR *callout_string = code + (1 + 4*LINK_SIZE);
6586             *callout_string++ = *ptr++;
6587             PUT(code, 1 + 3*LINK_SIZE, (int)(ptr - cb->start_pattern)); /* Start offset */
6588             for(;;)
6589               {
6590               if (*ptr == delimiter)
6591                 {
6592                 if (ptr[1] == delimiter) ptr++; else break;
6593                 }
6594               *callout_string++ = *ptr++;
6595               }
6596             *callout_string++ = CHAR_NULL;
6597             code[0] = OP_CALLOUT_STR;
6598             PUT(code, 1, (int)(ptr + 2 - cb->start_pattern)); /* Next offset */
6599             PUT(code, 1 + LINK_SIZE, 0);      /* Default length */
6600             PUT(code, 1 + 2*LINK_SIZE,        /* Compute size */
6601                 (int)(callout_string - code));
6602             code = callout_string;
6603             }
6604 
6605           /* Advance to what should be the closing parenthesis, which is
6606           checked below. */
6607 
6608           ptr++;
6609           }
6610 
6611         /* Handle a callout with an optional numerical argument, which must be
6612         less than or equal to 255. A missing argument gives 0. */
6613 
6614         else
6615           {
6616           int n = 0;
6617           code[0] = OP_CALLOUT;     /* Numerical callout */
6618           while (IS_DIGIT(*ptr))
6619             {
6620             n = n * 10 + *ptr++ - CHAR_0;
6621             if (n > 255)
6622               {
6623               *errorcodeptr = ERR38;
6624               goto FAILED;
6625               }
6626             }
6627           PUT(code, 1, (int)(ptr - cb->start_pattern + 1));  /* Next offset */
6628           PUT(code, 1 + LINK_SIZE, 0);                    /* Default length */
6629           code[1 + 2*LINK_SIZE] = n;                      /* Callout number */
6630           code += PRIV(OP_lengths)[OP_CALLOUT];
6631           }
6632 
6633         /* Both formats must have a closing parenthesis */
6634 
6635         if (*ptr != CHAR_RIGHT_PARENTHESIS)
6636           {
6637           *errorcodeptr = ERR39;
6638           goto FAILED;
6639           }
6640 
6641         /* Callouts cannot be quantified. */
6642 
6643         previous = NULL;
6644         continue;
6645 
6646 
6647         /* ------------------------------------------------------------ */
6648         case CHAR_P:              /* Python-style named subpattern handling */
6649         if (*(++ptr) == CHAR_EQUALS_SIGN ||
6650             *ptr == CHAR_GREATER_THAN_SIGN)  /* Reference or recursion */
6651           {
6652           is_recurse = *ptr == CHAR_GREATER_THAN_SIGN;
6653           terminator = CHAR_RIGHT_PARENTHESIS;
6654           goto NAMED_REF_OR_RECURSE;
6655           }
6656         else if (*ptr != CHAR_LESS_THAN_SIGN)  /* Test for Python-style defn */
6657           {
6658           *errorcodeptr = ERR41;
6659           goto FAILED;
6660           }
6661         /* Fall through to handle (?P< as (?< is handled */
6662 
6663 
6664         /* ------------------------------------------------------------ */
6665         case CHAR_APOSTROPHE:   /* Define a name - note fall through above */
6666 
6667         /* The syntax was checked and the list of names was set up in the
6668         pre-pass, so there is nothing to be done now except to skip over the
6669         name. */
6670 
6671         terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
6672                   CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
6673         while (*(++ptr) != (unsigned int)terminator);
6674         ptr++;
6675         goto NUMBERED_GROUP;      /* Set up numbered group */
6676 
6677 
6678         /* ------------------------------------------------------------ */
6679         case CHAR_AMPERSAND:            /* Perl recursion/subroutine syntax */
6680         terminator = CHAR_RIGHT_PARENTHESIS;
6681         is_recurse = TRUE;
6682         /* Fall through */
6683 
6684         /* We come here from the Python syntax above that handles both
6685         references (?P=name) and recursion (?P>name), as well as falling
6686         through from the Perl recursion syntax (?&name). We also come here from
6687         the Perl \k<name> or \k'name' back reference syntax and the \k{name}
6688         .NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. */
6689 
6690         NAMED_REF_OR_RECURSE:
6691         name = ++ptr;
6692         if (IS_DIGIT(*ptr))
6693           {
6694           *errorcodeptr = ERR44;   /* Group name must start with non-digit */
6695           goto FAILED;
6696           }
6697         /* Increment ptr, set namelen, check length */
6698         READ_NAME(ctype_word, ERR48, *errorcodeptr);
6699 
6700         /* In the pre-compile phase, do a syntax check. */
6701 
6702         if (lengthptr != NULL)
6703           {
6704           if (namelen == 0)
6705             {
6706             *errorcodeptr = ERR62;
6707             goto FAILED;
6708             }
6709           if (*ptr != (PCRE2_UCHAR)terminator)
6710             {
6711             *errorcodeptr = ERR42;
6712             goto FAILED;
6713             }
6714           }
6715 
6716         /* Scan the list of names generated in the pre-pass in order to get
6717         a number and whether or not this name is duplicated. */
6718 
6719         recno = 0;
6720         is_dupname = FALSE;
6721         ng = cb->named_groups;
6722 
6723         for (i = 0; i < cb->names_found; i++, ng++)
6724           {
6725           if (namelen == ng->length &&
6726               PRIV(strncmp)(name, ng->name, namelen) == 0)
6727             {
6728             open_capitem *oc;
6729             is_dupname = ng->isdup;
6730             recno = ng->number;
6731 
6732             /* For a recursion, that's all that is needed. We can now go to the
6733             code that handles numerical recursion. */
6734 
6735             if (is_recurse) goto HANDLE_RECURSION;
6736 
6737             /* For a back reference, update the back reference map and the
6738             maximum back reference. Then for each group we must check to see if
6739             it is recursive, that is, it is inside the group that it
6740             references. A flag is set so that the group can be made atomic. */
6741 
6742             cb->backref_map |= (recno < 32)? (1u << recno) : 1;
6743             if ((uint32_t)recno > cb->top_backref) cb->top_backref = recno;
6744 
6745             for (oc = cb->open_caps; oc != NULL; oc = oc->next)
6746               {
6747               if (oc->number == recno)
6748                 {
6749                 oc->flag = TRUE;
6750                 break;
6751                 }
6752               }
6753             }
6754           }
6755 
6756         /* If the name was not found we have a bad reference. */
6757 
6758         if (recno == 0)
6759           {
6760           *errorcodeptr = ERR15;
6761           goto FAILED;
6762           }
6763 
6764         /* If a back reference name is not duplicated, we can handle it as a
6765         numerical reference. */
6766 
6767         if (!is_dupname) goto HANDLE_REFERENCE;
6768 
6769         /* If a back reference name is duplicated, we generate a different
6770         opcode to a numerical back reference. In the second pass we must search
6771         for the index and count in the final name table. */
6772 
6773         count = 0;
6774         index = 0;
6775 
6776         if (lengthptr == NULL)
6777           {
6778           slot = cb->name_table;
6779           for (i = 0; i < cb->names_found; i++)
6780             {
6781             if (PRIV(strncmp)(name, slot+IMM2_SIZE, namelen) == 0 &&
6782                 slot[IMM2_SIZE+namelen] == 0)
6783               {
6784               if (count == 0) index = i;
6785               count++;
6786               }
6787             slot += cb->name_entry_size;
6788             }
6789 
6790           if (count == 0)
6791             {
6792             *errorcodeptr = ERR15;
6793             goto FAILED;
6794             }
6795           }
6796 
6797         if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6798         previous = code;
6799         *code++ = ((options & PCRE2_CASELESS) != 0)? OP_DNREFI : OP_DNREF;
6800         PUT2INC(code, 0, index);
6801         PUT2INC(code, 0, count);
6802         continue;  /* End of back ref handling */
6803 
6804 
6805         /* ------------------------------------------------------------ */
6806         case CHAR_R:              /* Recursion, same as (?0) */
6807         recno = 0;
6808         if (*(++ptr) != CHAR_RIGHT_PARENTHESIS)
6809           {
6810           *errorcodeptr = ERR29;
6811           goto FAILED;
6812           }
6813         goto HANDLE_RECURSION;
6814 
6815 
6816         /* ------------------------------------------------------------ */
6817         case CHAR_MINUS: case CHAR_PLUS:  /* Recursion or subroutine */
6818         case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
6819         case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
6820           {
6821           terminator = CHAR_RIGHT_PARENTHESIS;
6822 
6823           /* Come here from the \g<...> and \g'...' code (Oniguruma
6824           compatibility). However, the syntax has been checked to ensure that
6825           the ... are a (signed) number, so that neither ERR63 nor ERR29 will
6826           be called on this path, nor will the jump to OTHER_CHAR_AFTER_QUERY
6827           ever be taken. */
6828 
6829           HANDLE_NUMERICAL_RECURSION:
6830 
6831           if ((refsign = *ptr) == CHAR_PLUS)
6832             {
6833             ptr++;
6834             if (!IS_DIGIT(*ptr))
6835               {
6836               *errorcodeptr = ERR63;
6837               goto FAILED;
6838               }
6839             }
6840           else if (refsign == CHAR_MINUS)
6841             {
6842             if (!IS_DIGIT(ptr[1]))
6843               goto OTHER_CHAR_AFTER_QUERY;
6844             ptr++;
6845             }
6846 
6847           recno = 0;
6848           while (IS_DIGIT(*ptr))
6849             {
6850             if (recno > INT_MAX / 10 - 1) /* Integer overflow */
6851               {
6852               while (IS_DIGIT(*ptr)) ptr++;
6853               *errorcodeptr = ERR61;
6854               goto FAILED;
6855               }
6856             recno = recno * 10 + *ptr++ - CHAR_0;
6857             }
6858 
6859           if (*ptr != (PCRE2_UCHAR)terminator)
6860             {
6861             *errorcodeptr = ERR29;
6862             goto FAILED;
6863             }
6864 
6865           if (refsign == CHAR_MINUS)
6866             {
6867             if (recno == 0)
6868               {
6869               *errorcodeptr = ERR58;
6870               goto FAILED;
6871               }
6872             recno = (int)(cb->bracount + 1) - recno;
6873             if (recno <= 0)
6874               {
6875               *errorcodeptr = ERR15;
6876               goto FAILED;
6877               }
6878             }
6879           else if (refsign == CHAR_PLUS)
6880             {
6881             if (recno == 0)
6882               {
6883               *errorcodeptr = ERR58;
6884               goto FAILED;
6885               }
6886             recno += cb->bracount;
6887             }
6888 
6889           if ((uint32_t)recno > cb->final_bracount)
6890             {
6891             *errorcodeptr = ERR15;
6892             goto FAILED;
6893             }
6894 
6895           /* Come here from code above that handles a named recursion.
6896           We insert the number of the called group after OP_RECURSE. At the
6897           end of compiling the pattern is scanned and these numbers are
6898           replaced by offsets within the pattern. It is done like this to avoid
6899           problems with forward references and adjusting offsets when groups
6900           are duplicated and moved (as discovered in previous implementations).
6901           Note that a recursion does not have a set first character (relevant
6902           if it is repeated, because it will then be wrapped with ONCE
6903           brackets). */
6904 
6905           HANDLE_RECURSION:
6906           previous = code;
6907           *code = OP_RECURSE;
6908           PUT(code, 1, recno);
6909           code += 1 + LINK_SIZE;
6910           groupsetfirstcu = FALSE;
6911           cb->had_recurse = TRUE;
6912           }
6913 
6914         /* Can't determine a first byte now */
6915 
6916         if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6917         continue;
6918 
6919 
6920         /* ------------------------------------------------------------ */
6921         default:              /* Other characters: check option setting */
6922         OTHER_CHAR_AFTER_QUERY:
6923         set = unset = 0;
6924         optset = &set;
6925 
6926         while (*ptr != CHAR_RIGHT_PARENTHESIS && *ptr != CHAR_COLON)
6927           {
6928           switch (*ptr++)
6929             {
6930             case CHAR_MINUS: optset = &unset; break;
6931 
6932             case CHAR_J:    /* Record that it changed in the external options */
6933             *optset |= PCRE2_DUPNAMES;
6934             cb->external_flags |= PCRE2_JCHANGED;
6935             break;
6936 
6937             case CHAR_i: *optset |= PCRE2_CASELESS; break;
6938             case CHAR_m: *optset |= PCRE2_MULTILINE; break;
6939             case CHAR_s: *optset |= PCRE2_DOTALL; break;
6940             case CHAR_x: *optset |= PCRE2_EXTENDED; break;
6941             case CHAR_U: *optset |= PCRE2_UNGREEDY; break;
6942 
6943             default:  *errorcodeptr = ERR11;
6944                       ptr--;    /* Correct the offset */
6945                       goto FAILED;
6946             }
6947           }
6948 
6949         /* Set up the changed option bits, but don't change anything yet. */
6950 
6951         newoptions = (options | set) & (~unset);
6952 
6953         /* If the options ended with ')' this is not the start of a nested
6954         group with option changes, so the options change at this level. They
6955         must also be passed back for use in subsequent branches. Reset the
6956         greedy defaults and the case value for firstcu and reqcu. */
6957 
6958         if (*ptr == CHAR_RIGHT_PARENTHESIS)
6959           {
6960           *optionsptr = options = newoptions;
6961           greedy_default = ((newoptions & PCRE2_UNGREEDY) != 0);
6962           greedy_non_default = greedy_default ^ 1;
6963           req_caseopt = ((newoptions & PCRE2_CASELESS) != 0)? REQ_CASELESS:0;
6964           previous = NULL;       /* This item can't be repeated */
6965           continue;              /* It is complete */
6966           }
6967 
6968         /* If the options ended with ':' we are heading into a nested group
6969         with possible change of options. Such groups are non-capturing and are
6970         not assertions of any kind. All we need to do is skip over the ':';
6971         the newoptions value is handled below. */
6972 
6973         bravalue = OP_BRA;
6974         ptr++;
6975         }     /* End of switch for character following (? */
6976       }       /* End of (? handling */
6977 
6978     /* Opening parenthesis not followed by '*' or '?'. If PCRE2_NO_AUTO_CAPTURE
6979     is set, all unadorned brackets become non-capturing and behave like (?:...)
6980     brackets. */
6981 
6982     else if ((options & PCRE2_NO_AUTO_CAPTURE) != 0)
6983       {
6984       bravalue = OP_BRA;
6985       }
6986 
6987     /* Else we have a capturing group. */
6988 
6989     else
6990       {
6991       NUMBERED_GROUP:
6992       cb->bracount += 1;
6993       PUT2(code, 1+LINK_SIZE, cb->bracount);
6994       skipunits = IMM2_SIZE;
6995       }
6996 
6997     /* Process nested bracketed regex. First check for parentheses nested too
6998     deeply. */
6999 
7000     if ((cb->parens_depth += 1) > (int)(cb->cx->parens_nest_limit))
7001       {
7002       *errorcodeptr = ERR19;
7003       goto FAILED;
7004       }
7005 
7006     /* All assertions used not to be repeatable, but this was changed for Perl
7007     compatibility. All kinds can now be repeated except for assertions that are
7008     conditions (Perl also forbids these to be repeated). We copy code into a
7009     non-register variable (tempcode) in order to be able to pass its address
7010     because some compilers complain otherwise. At the start of a conditional
7011     group whose condition is an assertion, cb->iscondassert is set. We unset it
7012     here so as to allow assertions later in the group to be quantified. */
7013 
7014     if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NOT &&
7015         cb->iscondassert)
7016       {
7017       previous = NULL;
7018       cb->iscondassert = FALSE;
7019       }
7020     else
7021       {
7022       previous = code;
7023       }
7024 
7025     *code = bravalue;
7026     tempcode = code;
7027     tempreqvary = cb->req_varyopt;        /* Save value before bracket */
7028     tempbracount = cb->bracount;          /* Save value before bracket */
7029     length_prevgroup = 0;                 /* Initialize for pre-compile phase */
7030 
7031     if (!compile_regex(
7032          newoptions,                      /* The complete new option state */
7033          &tempcode,                       /* Where to put code (updated) */
7034          &ptr,                            /* Input pointer (updated) */
7035          errorcodeptr,                    /* Where to put an error message */
7036          (bravalue == OP_ASSERTBACK ||
7037           bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
7038          reset_bracount,                  /* True if (?| group */
7039          skipunits,                       /* Skip over bracket number */
7040          cond_depth +
7041            ((bravalue == OP_COND)?1:0),   /* Depth of condition subpatterns */
7042          &subfirstcu,                     /* For possible first char */
7043          &subfirstcuflags,
7044          &subreqcu,                       /* For possible last char */
7045          &subreqcuflags,
7046          bcptr,                           /* Current branch chain */
7047          cb,                              /* Compile data block */
7048          (lengthptr == NULL)? NULL :      /* Actual compile phase */
7049            &length_prevgroup              /* Pre-compile phase */
7050          ))
7051       goto FAILED;
7052 
7053     cb->parens_depth -= 1;
7054 
7055     /* If this was an atomic group and there are no capturing groups within it,
7056     generate OP_ONCE_NC instead of OP_ONCE. */
7057 
7058     if (bravalue == OP_ONCE && cb->bracount <= tempbracount)
7059       *code = OP_ONCE_NC;
7060 
7061     if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NOT)
7062       cb->assert_depth -= 1;
7063 
7064     /* At the end of compiling, code is still pointing to the start of the
7065     group, while tempcode has been updated to point past the end of the group.
7066     The pattern pointer (ptr) is on the bracket.
7067 
7068     If this is a conditional bracket, check that there are no more than
7069     two branches in the group, or just one if it's a DEFINE group. We do this
7070     in the real compile phase, not in the pre-pass, where the whole group may
7071     not be available. */
7072 
7073     if (bravalue == OP_COND && lengthptr == NULL)
7074       {
7075       PCRE2_UCHAR *tc = code;
7076       int condcount = 0;
7077 
7078       do {
7079          condcount++;
7080          tc += GET(tc,1);
7081          }
7082       while (*tc != OP_KET);
7083 
7084       /* A DEFINE group is never obeyed inline (the "condition" is always
7085       false). It must have only one branch. Having checked this, change the
7086       opcode to OP_FALSE. */
7087 
7088       if (code[LINK_SIZE+1] == OP_DEFINE)
7089         {
7090         if (condcount > 1)
7091           {
7092           *errorcodeptr = ERR54;
7093           goto FAILED;
7094           }
7095         code[LINK_SIZE+1] = OP_FALSE;
7096         bravalue = OP_DEFINE;   /* Just a flag to suppress char handling below */
7097         }
7098 
7099       /* A "normal" conditional group. If there is just one branch, we must not
7100       make use of its firstcu or reqcu, because this is equivalent to an
7101       empty second branch. */
7102 
7103       else
7104         {
7105         if (condcount > 2)
7106           {
7107           *errorcodeptr = ERR27;
7108           goto FAILED;
7109           }
7110         if (condcount == 1) subfirstcuflags = subreqcuflags = REQ_NONE;
7111         }
7112       }
7113 
7114     /* At the end of a group, it's an error if we hit end of pattern or
7115     any non-closing parenthesis. This check also happens in the pre-scan,
7116     so should not trigger here, but leave this code as an insurance. */
7117 
7118     if (*ptr != CHAR_RIGHT_PARENTHESIS)
7119       {
7120       *errorcodeptr = ERR14;
7121       goto FAILED;
7122       }
7123 
7124     /* In the pre-compile phase, update the length by the length of the group,
7125     less the brackets at either end. Then reduce the compiled code to just a
7126     set of non-capturing brackets so that it doesn't use much memory if it is
7127     duplicated by a quantifier.*/
7128 
7129     if (lengthptr != NULL)
7130       {
7131       if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
7132         {
7133         *errorcodeptr = ERR20;
7134         goto FAILED;
7135         }
7136       *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
7137       code++;   /* This already contains bravalue */
7138       PUTINC(code, 0, 1 + LINK_SIZE);
7139       *code++ = OP_KET;
7140       PUTINC(code, 0, 1 + LINK_SIZE);
7141       break;    /* No need to waste time with special character handling */
7142       }
7143 
7144     /* Otherwise update the main code pointer to the end of the group. */
7145 
7146     code = tempcode;
7147 
7148     /* For a DEFINE group, required and first character settings are not
7149     relevant. */
7150 
7151     if (bravalue == OP_DEFINE) break;
7152 
7153     /* Handle updating of the required and first characters for other types of
7154     group. Update for normal brackets of all kinds, and conditions with two
7155     branches (see code above). If the bracket is followed by a quantifier with
7156     zero repeat, we have to back off. Hence the definition of zeroreqcu and
7157     zerofirstcu outside the main loop so that they can be accessed for the
7158     back off. */
7159 
7160     zeroreqcu = reqcu;
7161     zeroreqcuflags = reqcuflags;
7162     zerofirstcu = firstcu;
7163     zerofirstcuflags = firstcuflags;
7164     groupsetfirstcu = FALSE;
7165 
7166     if (bravalue >= OP_ONCE)
7167       {
7168       /* If we have not yet set a firstcu in this branch, take it from the
7169       subpattern, remembering that it was set here so that a repeat of more
7170       than one can replicate it as reqcu if necessary. If the subpattern has
7171       no firstcu, set "none" for the whole branch. In both cases, a zero
7172       repeat forces firstcu to "none". */
7173 
7174       if (firstcuflags == REQ_UNSET && subfirstcuflags != REQ_UNSET)
7175         {
7176         if (subfirstcuflags >= 0)
7177           {
7178           firstcu = subfirstcu;
7179           firstcuflags = subfirstcuflags;
7180           groupsetfirstcu = TRUE;
7181           }
7182         else firstcuflags = REQ_NONE;
7183         zerofirstcuflags = REQ_NONE;
7184         }
7185 
7186       /* If firstcu was previously set, convert the subpattern's firstcu
7187       into reqcu if there wasn't one, using the vary flag that was in
7188       existence beforehand. */
7189 
7190       else if (subfirstcuflags >= 0 && subreqcuflags < 0)
7191         {
7192         subreqcu = subfirstcu;
7193         subreqcuflags = subfirstcuflags | tempreqvary;
7194         }
7195 
7196       /* If the subpattern set a required byte (or set a first byte that isn't
7197       really the first byte - see above), set it. */
7198 
7199       if (subreqcuflags >= 0)
7200         {
7201         reqcu = subreqcu;
7202         reqcuflags = subreqcuflags;
7203         }
7204       }
7205 
7206     /* For a forward assertion, we take the reqcu, if set. This can be
7207     helpful if the pattern that follows the assertion doesn't set a different
7208     char. For example, it's useful for /(?=abcde).+/. We can't set firstcu
7209     for an assertion, however because it leads to incorrect effect for patterns
7210     such as /(?=a)a.+/ when the "real" "a" would then become a reqcu instead
7211     of a firstcu. This is overcome by a scan at the end if there's no
7212     firstcu, looking for an asserted first char. */
7213 
7214     else if (bravalue == OP_ASSERT && subreqcuflags >= 0)
7215       {
7216       reqcu = subreqcu;
7217       reqcuflags = subreqcuflags;
7218       }
7219     break;     /* End of processing '(' */
7220 
7221 
7222     /* ===================================================================*/
7223     /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
7224     are arranged to be the negation of the corresponding OP_values in the
7225     default case when PCRE2_UCP is not set. For the back references, the values
7226     are negative the reference number. Only back references and those types
7227     that consume a character may be repeated. We can test for values between
7228     ESC_b and ESC_Z for the latter; this may have to change if any new ones are
7229     ever created.
7230 
7231     Note: \Q and \E are handled at the start of the character-processing loop,
7232     not here. */
7233 
7234     case CHAR_BACKSLASH:
7235     tempptr = ptr;
7236     escape = PRIV(check_escape)(&ptr, cb->end_pattern, &ec, errorcodeptr,
7237       options, FALSE, cb);
7238     if (*errorcodeptr != 0) goto FAILED;
7239 
7240     if (escape == 0)                  /* The escape coded a single character */
7241       c = ec;
7242     else
7243       {
7244       /* For metasequences that actually match a character, we disable the
7245       setting of a first character if it hasn't already been set. */
7246 
7247       if (firstcuflags == REQ_UNSET && escape > ESC_b && escape < ESC_Z)
7248         firstcuflags = REQ_NONE;
7249 
7250       /* Set values to reset to if this is followed by a zero repeat. */
7251 
7252       zerofirstcu = firstcu;
7253       zerofirstcuflags = firstcuflags;
7254       zeroreqcu = reqcu;
7255       zeroreqcuflags = reqcuflags;
7256 
7257       /* \g<name> or \g'name' is a subroutine call by name and \g<n> or \g'n'
7258       is a subroutine call by number (Oniguruma syntax). In fact, the value
7259       ESC_g is returned only for these cases. So we don't need to check for <
7260       or ' if the value is ESC_g. For the Perl syntax \g{n} the value is
7261       -n, and for the Perl syntax \g{name} the result is ESC_k (as
7262       that is a synonym for a named back reference). */
7263 
7264       if (escape == ESC_g)
7265         {
7266         PCRE2_SPTR p;
7267         uint32_t cf;
7268 
7269         terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
7270           CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
7271 
7272         /* These two statements stop the compiler from warning about possibly
7273         unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In
7274         fact, because we do the check for a number below, the paths that
7275         would actually be in error are never taken. */
7276 
7277         skipunits = 0;
7278         reset_bracount = FALSE;
7279 
7280         /* If it's not a signed or unsigned number, treat it as a name. */
7281 
7282         cf = ptr[1];
7283         if (cf != CHAR_PLUS && cf != CHAR_MINUS && !IS_DIGIT(cf))
7284           {
7285           is_recurse = TRUE;
7286           goto NAMED_REF_OR_RECURSE;
7287           }
7288 
7289         /* Signed or unsigned number (cf = ptr[1]) is known to be plus or minus
7290         or a digit. */
7291 
7292         p = ptr + 2;
7293         while (IS_DIGIT(*p)) p++;
7294         if (*p != (PCRE2_UCHAR)terminator)
7295           {
7296           *errorcodeptr = ERR57;
7297           goto FAILED;
7298           }
7299         ptr++;
7300         goto HANDLE_NUMERICAL_RECURSION;
7301         }
7302 
7303       /* \k<name> or \k'name' is a back reference by name (Perl syntax).
7304       We also support \k{name} (.NET syntax).  */
7305 
7306       if (escape == ESC_k)
7307         {
7308         if ((ptr[1] != CHAR_LESS_THAN_SIGN &&
7309           ptr[1] != CHAR_APOSTROPHE && ptr[1] != CHAR_LEFT_CURLY_BRACKET))
7310           {
7311           *errorcodeptr = ERR69;
7312           goto FAILED;
7313           }
7314         is_recurse = FALSE;
7315         terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
7316           CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
7317           CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
7318         goto NAMED_REF_OR_RECURSE;
7319         }
7320 
7321       /* Back references are handled specially; must disable firstcu if
7322       not set to cope with cases like (?=(\w+))\1: which would otherwise set
7323       ':' later. */
7324 
7325       if (escape < 0)
7326         {
7327         open_capitem *oc;
7328         recno = -escape;
7329 
7330         /* Come here from named backref handling when the reference is to a
7331         single group (i.e. not to a duplicated name). */
7332 
7333         HANDLE_REFERENCE:
7334         if (recno > (int)cb->final_bracount)
7335           {
7336           *errorcodeptr = ERR15;
7337           goto FAILED;
7338           }
7339         if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
7340         previous = code;
7341         *code++ = ((options & PCRE2_CASELESS) != 0)? OP_REFI : OP_REF;
7342         PUT2INC(code, 0, recno);
7343         cb->backref_map |= (recno < 32)? (1u << recno) : 1;
7344         if ((uint32_t)recno > cb->top_backref) cb->top_backref = recno;
7345 
7346         /* Check to see if this back reference is recursive, that is, it
7347         is inside the group that it references. A flag is set so that the
7348         group can be made atomic. */
7349 
7350         for (oc = cb->open_caps; oc != NULL; oc = oc->next)
7351           {
7352           if (oc->number == recno)
7353             {
7354             oc->flag = TRUE;
7355             break;
7356             }
7357           }
7358         }
7359 
7360       /* So are Unicode property matches, if supported. */
7361 
7362 #ifdef SUPPORT_UNICODE
7363       else if (escape == ESC_P || escape == ESC_p)
7364         {
7365         BOOL negated;
7366         unsigned int ptype = 0, pdata = 0;
7367         if (!get_ucp(&ptr, &negated, &ptype, &pdata, errorcodeptr, cb))
7368           goto FAILED;
7369         previous = code;
7370         *code++ = ((escape == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
7371         *code++ = ptype;
7372         *code++ = pdata;
7373         }
7374 #else
7375 
7376       /* If Unicode properties are not supported, \X, \P, and \p are not
7377       allowed. */
7378 
7379       else if (escape == ESC_X || escape == ESC_P || escape == ESC_p)
7380         {
7381         *errorcodeptr = ERR45;
7382         goto FAILED;
7383         }
7384 #endif
7385 
7386       /* The use of \C can be locked out. */
7387 
7388 #ifdef NEVER_BACKSLASH_C
7389       else if (escape == ESC_C)
7390         {
7391         *errorcodeptr = ERR85;
7392         goto FAILED;
7393         }
7394 #else
7395       else if (escape == ESC_C && (options & PCRE2_NEVER_BACKSLASH_C) != 0)
7396         {
7397         *errorcodeptr = ERR83;
7398         goto FAILED;
7399         }
7400 #endif
7401 
7402       /* For the rest (including \X when Unicode properties are supported), we
7403       can obtain the OP value by negating the escape value in the default
7404       situation when PCRE2_UCP is not set. When it *is* set, we substitute
7405       Unicode property tests. Note that \b and \B do a one-character
7406       lookbehind, and \A also behaves as if it does. */
7407 
7408       else
7409         {
7410         if (escape == ESC_C) cb->external_flags |= PCRE2_HASBKC; /* Record */
7411         if ((escape == ESC_b || escape == ESC_B || escape == ESC_A) &&
7412              cb->max_lookbehind == 0)
7413           cb->max_lookbehind = 1;
7414 #ifdef SUPPORT_UNICODE
7415         if (escape >= ESC_DU && escape <= ESC_wu)
7416           {
7417           cb->nestptr[1] = cb->nestptr[0];         /* Back up if at 2nd level */
7418           cb->nestptr[0] = ptr + 1;                /* Where to resume */
7419           ptr = substitutes[escape - ESC_DU] - 1;  /* Just before substitute */
7420           }
7421         else
7422 #endif
7423         /* In non-UTF mode, and for both 32-bit modes, we turn \C into
7424         OP_ALLANY instead of OP_ANYBYTE so that it works in DFA mode and in
7425         lookbehinds. */
7426 
7427           {
7428           previous = (escape > ESC_b && escape < ESC_Z)? code : NULL;
7429 #if PCRE2_CODE_UNIT_WIDTH == 32
7430           *code++ = (escape == ESC_C)? OP_ALLANY : escape;
7431 #else
7432           *code++ = (!utf && escape == ESC_C)? OP_ALLANY : escape;
7433 #endif
7434           }
7435         }
7436       continue;
7437       }
7438 
7439     /* We have a data character whose value is in c. In UTF-8 mode it may have
7440     a value > 127. We set its representation in the length/buffer, and then
7441     handle it as a data character. */
7442 
7443     mclength = PUTCHAR(c, mcbuffer);
7444     goto ONE_CHAR;
7445 
7446 
7447     /* ===================================================================*/
7448     /* Handle a literal character. It is guaranteed not to be whitespace or #
7449     when the extended flag is set. If we are in a UTF mode, it may be a
7450     multi-unit literal character. */
7451 
7452     default:
7453     NORMAL_CHAR:
7454     mclength = 1;
7455     mcbuffer[0] = c;
7456 
7457 #ifdef SUPPORT_UNICODE
7458     if (utf && HAS_EXTRALEN(c))
7459       ACROSSCHAR(TRUE, ptr[1], mcbuffer[mclength++] = *(++ptr));
7460 #endif
7461 
7462     /* At this point we have the character's bytes in mcbuffer, and the length
7463     in mclength. When not in UTF mode, the length is always 1. */
7464 
7465     ONE_CHAR:
7466     previous = code;
7467 
7468     /* For caseless UTF mode, check whether this character has more than one
7469     other case. If so, generate a special OP_PROP item instead of OP_CHARI. */
7470 
7471 #ifdef SUPPORT_UNICODE
7472     if (utf && (options & PCRE2_CASELESS) != 0)
7473       {
7474       GETCHAR(c, mcbuffer);
7475       if ((c = UCD_CASESET(c)) != 0)
7476         {
7477         *code++ = OP_PROP;
7478         *code++ = PT_CLIST;
7479         *code++ = c;
7480         if (firstcuflags == REQ_UNSET)
7481           firstcuflags = zerofirstcuflags = REQ_NONE;
7482         break;
7483         }
7484       }
7485 #endif
7486 
7487     /* Caseful matches, or not one of the multicase characters. */
7488 
7489     *code++ = ((options & PCRE2_CASELESS) != 0)? OP_CHARI : OP_CHAR;
7490     for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
7491 
7492     /* Remember if \r or \n were seen */
7493 
7494     if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL)
7495       cb->external_flags |= PCRE2_HASCRORLF;
7496 
7497     /* Set the first and required bytes appropriately. If no previous first
7498     byte, set it from this character, but revert to none on a zero repeat.
7499     Otherwise, leave the firstcu value alone, and don't change it on a zero
7500     repeat. */
7501 
7502     if (firstcuflags == REQ_UNSET)
7503       {
7504       zerofirstcuflags = REQ_NONE;
7505       zeroreqcu = reqcu;
7506       zeroreqcuflags = reqcuflags;
7507 
7508       /* If the character is more than one byte long, we can set firstcu
7509       only if it is not to be matched caselessly. */
7510 
7511       if (mclength == 1 || req_caseopt == 0)
7512         {
7513         firstcu = mcbuffer[0] | req_caseopt;
7514         firstcu = mcbuffer[0];
7515         firstcuflags = req_caseopt;
7516 
7517         if (mclength != 1)
7518           {
7519           reqcu = code[-1];
7520           reqcuflags = cb->req_varyopt;
7521           }
7522         }
7523       else firstcuflags = reqcuflags = REQ_NONE;
7524       }
7525 
7526     /* firstcu was previously set; we can set reqcu only if the length is
7527     1 or the matching is caseful. */
7528 
7529     else
7530       {
7531       zerofirstcu = firstcu;
7532       zerofirstcuflags = firstcuflags;
7533       zeroreqcu = reqcu;
7534       zeroreqcuflags = reqcuflags;
7535       if (mclength == 1 || req_caseopt == 0)
7536         {
7537         reqcu = code[-1];
7538         reqcuflags = req_caseopt | cb->req_varyopt;
7539         }
7540       }
7541 
7542     break;            /* End of literal character handling */
7543     }
7544   }                   /* end of big loop */
7545 
7546 /* Control never reaches here by falling through, only by a goto for all the
7547 error states. Pass back the position in the pattern so that it can be displayed
7548 to the user for diagnosing the error. */
7549 
7550 FAILED:
7551 *ptrptr = ptr;
7552 return FALSE;
7553 }
7554 
7555 
7556 
7557 /*************************************************
7558 *   Compile regex: a sequence of alternatives    *
7559 *************************************************/
7560 
7561 /* On entry, ptr is pointing past the bracket character, but on return it
7562 points to the closing bracket, or vertical bar, or end of string. The code
7563 variable is pointing at the byte into which the BRA operator has been stored.
7564 This function is used during the pre-compile phase when we are trying to find
7565 out the amount of memory needed, as well as during the real compile phase. The
7566 value of lengthptr distinguishes the two phases.
7567 
7568 Arguments:
7569   options           option bits, including any changes for this subpattern
7570   codeptr           -> the address of the current code pointer
7571   ptrptr            -> the address of the current pattern pointer
7572   errorcodeptr      -> pointer to error code variable
7573   lookbehind        TRUE if this is a lookbehind assertion
7574   reset_bracount    TRUE to reset the count for each branch
7575   skipunits         skip this many code units at start (for brackets and OP_COND)
7576   cond_depth        depth of nesting for conditional subpatterns
7577   firstcuptr        place to put the first required code unit
7578   firstcuflagsptr   place to put the first code unit flags, or a negative number
7579   reqcuptr          place to put the last required code unit
7580   reqcuflagsptr     place to put the last required code unit flags, or a negative number
7581   bcptr             pointer to the chain of currently open branches
7582   cb                points to the data block with tables pointers etc.
7583   lengthptr         NULL during the real compile phase
7584                     points to length accumulator during pre-compile phase
7585 
7586 Returns:            TRUE on success
7587 */
7588 
7589 static BOOL
compile_regex(uint32_t options,PCRE2_UCHAR ** codeptr,PCRE2_SPTR * ptrptr,int * errorcodeptr,BOOL lookbehind,BOOL reset_bracount,uint32_t skipunits,int cond_depth,uint32_t * firstcuptr,int32_t * firstcuflagsptr,uint32_t * reqcuptr,int32_t * reqcuflagsptr,branch_chain * bcptr,compile_block * cb,size_t * lengthptr)7590 compile_regex(uint32_t options, PCRE2_UCHAR **codeptr, PCRE2_SPTR *ptrptr,
7591   int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, uint32_t skipunits,
7592   int cond_depth, uint32_t *firstcuptr, int32_t *firstcuflagsptr,
7593   uint32_t *reqcuptr, int32_t *reqcuflagsptr, branch_chain *bcptr,
7594   compile_block *cb, size_t *lengthptr)
7595 {
7596 PCRE2_SPTR ptr = *ptrptr;
7597 PCRE2_UCHAR *code = *codeptr;
7598 PCRE2_UCHAR *last_branch = code;
7599 PCRE2_UCHAR *start_bracket = code;
7600 PCRE2_UCHAR *reverse_count = NULL;
7601 open_capitem capitem;
7602 int capnumber = 0;
7603 uint32_t firstcu, reqcu;
7604 int32_t firstcuflags, reqcuflags;
7605 uint32_t branchfirstcu, branchreqcu;
7606 int32_t branchfirstcuflags, branchreqcuflags;
7607 size_t length;
7608 unsigned int orig_bracount;
7609 unsigned int max_bracount;
7610 branch_chain bc;
7611 
7612 /* If set, call the external function that checks for stack availability. */
7613 
7614 if (cb->cx->stack_guard != NULL &&
7615     cb->cx->stack_guard(cb->parens_depth, cb->cx->stack_guard_data))
7616   {
7617   *errorcodeptr= ERR33;
7618   return FALSE;
7619   }
7620 
7621 /* Miscellaneous initialization */
7622 
7623 bc.outer = bcptr;
7624 bc.current_branch = code;
7625 
7626 firstcu = reqcu = 0;
7627 firstcuflags = reqcuflags = REQ_UNSET;
7628 
7629 /* Accumulate the length for use in the pre-compile phase. Start with the
7630 length of the BRA and KET and any extra code units that are required at the
7631 beginning. We accumulate in a local variable to save frequent testing of
7632 lengthptr for NULL. We cannot do this by looking at the value of 'code' at the
7633 start and end of each alternative, because compiled items are discarded during
7634 the pre-compile phase so that the work space is not exceeded. */
7635 
7636 length = 2 + 2*LINK_SIZE + skipunits;
7637 
7638 /* WARNING: If the above line is changed for any reason, you must also change
7639 the code that abstracts option settings at the start of the pattern and makes
7640 them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
7641 pre-compile phase to find out whether or not anything has yet been compiled.
7642 
7643 If this is a capturing subpattern, add to the chain of open capturing items
7644 so that we can detect them if (*ACCEPT) is encountered. This is also used to
7645 detect groups that contain recursive back references to themselves. Note that
7646 only OP_CBRA need be tested here; changing this opcode to one of its variants,
7647 e.g. OP_SCBRAPOS, happens later, after the group has been compiled. */
7648 
7649 if (*code == OP_CBRA)
7650   {
7651   capnumber = GET2(code, 1 + LINK_SIZE);
7652   capitem.number = capnumber;
7653   capitem.next = cb->open_caps;
7654   capitem.flag = FALSE;
7655   cb->open_caps = &capitem;
7656   }
7657 
7658 /* Offset is set zero to mark that this bracket is still open */
7659 
7660 PUT(code, 1, 0);
7661 code += 1 + LINK_SIZE + skipunits;
7662 
7663 /* Loop for each alternative branch */
7664 
7665 orig_bracount = max_bracount = cb->bracount;
7666 
7667 for (;;)
7668   {
7669   /* For a (?| group, reset the capturing bracket count so that each branch
7670   uses the same numbers. */
7671 
7672   if (reset_bracount) cb->bracount = orig_bracount;
7673 
7674   /* Set up dummy OP_REVERSE if lookbehind assertion */
7675 
7676   if (lookbehind)
7677     {
7678     *code++ = OP_REVERSE;
7679     reverse_count = code;
7680     PUTINC(code, 0, 0);
7681     length += 1 + LINK_SIZE;
7682     }
7683 
7684   /* Now compile the branch; in the pre-compile phase its length gets added
7685   into the length. */
7686 
7687   if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstcu,
7688         &branchfirstcuflags, &branchreqcu, &branchreqcuflags, &bc,
7689         cond_depth, cb, (lengthptr == NULL)? NULL : &length))
7690     {
7691     *ptrptr = ptr;
7692     return FALSE;
7693     }
7694 
7695   /* Keep the highest bracket count in case (?| was used and some branch
7696   has fewer than the rest. */
7697 
7698   if (cb->bracount > max_bracount) max_bracount = cb->bracount;
7699 
7700   /* In the real compile phase, there is some post-processing to be done. */
7701 
7702   if (lengthptr == NULL)
7703     {
7704     /* If this is the first branch, the firstcu and reqcu values for the
7705     branch become the values for the regex. */
7706 
7707     if (*last_branch != OP_ALT)
7708       {
7709       firstcu = branchfirstcu;
7710       firstcuflags = branchfirstcuflags;
7711       reqcu = branchreqcu;
7712       reqcuflags = branchreqcuflags;
7713       }
7714 
7715     /* If this is not the first branch, the first char and reqcu have to
7716     match the values from all the previous branches, except that if the
7717     previous value for reqcu didn't have REQ_VARY set, it can still match,
7718     and we set REQ_VARY for the regex. */
7719 
7720     else
7721       {
7722       /* If we previously had a firstcu, but it doesn't match the new branch,
7723       we have to abandon the firstcu for the regex, but if there was
7724       previously no reqcu, it takes on the value of the old firstcu. */
7725 
7726       if (firstcuflags != branchfirstcuflags || firstcu != branchfirstcu)
7727         {
7728         if (firstcuflags >= 0)
7729           {
7730           if (reqcuflags < 0)
7731             {
7732             reqcu = firstcu;
7733             reqcuflags = firstcuflags;
7734             }
7735           }
7736         firstcuflags = REQ_NONE;
7737         }
7738 
7739       /* If we (now or from before) have no firstcu, a firstcu from the
7740       branch becomes a reqcu if there isn't a branch reqcu. */
7741 
7742       if (firstcuflags < 0 && branchfirstcuflags >= 0 &&
7743           branchreqcuflags < 0)
7744         {
7745         branchreqcu = branchfirstcu;
7746         branchreqcuflags = branchfirstcuflags;
7747         }
7748 
7749       /* Now ensure that the reqcus match */
7750 
7751       if (((reqcuflags & ~REQ_VARY) != (branchreqcuflags & ~REQ_VARY)) ||
7752           reqcu != branchreqcu)
7753         reqcuflags = REQ_NONE;
7754       else
7755         {
7756         reqcu = branchreqcu;
7757         reqcuflags |= branchreqcuflags; /* To "or" REQ_VARY */
7758         }
7759       }
7760 
7761     /* If lookbehind, check that this branch matches a fixed-length string, and
7762     put the length into the OP_REVERSE item. Temporarily mark the end of the
7763     branch with OP_END. If the branch contains OP_RECURSE, the result is
7764     FFL_LATER (a negative value) because there may be forward references that
7765     we can't check here. Set a flag to cause another lookbehind check at the
7766     end. Why not do it all at the end? Because common errors can be picked up
7767     here and the offset of the problem can be shown. */
7768 
7769     if (lookbehind)
7770       {
7771       int fixed_length;
7772       int count = 0;
7773       *code = OP_END;
7774       fixed_length = find_fixedlength(last_branch,  (options & PCRE2_UTF) != 0,
7775         FALSE, cb, NULL, &count);
7776       if (fixed_length == FFL_LATER)
7777         {
7778         cb->check_lookbehind = TRUE;
7779         }
7780       else if (fixed_length < 0)
7781         {
7782         *errorcodeptr = fixed_length_errors[-fixed_length];
7783         *ptrptr = ptr;
7784         return FALSE;
7785         }
7786       else
7787         {
7788         if (fixed_length > cb->max_lookbehind)
7789           cb->max_lookbehind = fixed_length;
7790         PUT(reverse_count, 0, fixed_length);
7791         }
7792       }
7793     }
7794 
7795   /* Reached end of expression, either ')' or end of pattern. In the real
7796   compile phase, go back through the alternative branches and reverse the chain
7797   of offsets, with the field in the BRA item now becoming an offset to the
7798   first alternative. If there are no alternatives, it points to the end of the
7799   group. The length in the terminating ket is always the length of the whole
7800   bracketed item. Return leaving the pointer at the terminating char. */
7801 
7802   if (*ptr != CHAR_VERTICAL_LINE)
7803     {
7804     if (lengthptr == NULL)
7805       {
7806       size_t branch_length = code - last_branch;
7807       do
7808         {
7809         size_t prev_length = GET(last_branch, 1);
7810         PUT(last_branch, 1, branch_length);
7811         branch_length = prev_length;
7812         last_branch -= branch_length;
7813         }
7814       while (branch_length > 0);
7815       }
7816 
7817     /* Fill in the ket */
7818 
7819     *code = OP_KET;
7820     PUT(code, 1, (int)(code - start_bracket));
7821     code += 1 + LINK_SIZE;
7822 
7823     /* If it was a capturing subpattern, check to see if it contained any
7824     recursive back references. If so, we must wrap it in atomic brackets. In
7825     any event, remove the block from the chain. */
7826 
7827     if (capnumber > 0)
7828       {
7829       if (cb->open_caps->flag)
7830         {
7831         memmove(start_bracket + 1 + LINK_SIZE, start_bracket,
7832           CU2BYTES(code - start_bracket));
7833         *start_bracket = OP_ONCE;
7834         code += 1 + LINK_SIZE;
7835         PUT(start_bracket, 1, (int)(code - start_bracket));
7836         *code = OP_KET;
7837         PUT(code, 1, (int)(code - start_bracket));
7838         code += 1 + LINK_SIZE;
7839         length += 2 + 2*LINK_SIZE;
7840         }
7841       cb->open_caps = cb->open_caps->next;
7842       }
7843 
7844     /* Retain the highest bracket number, in case resetting was used. */
7845 
7846     cb->bracount = max_bracount;
7847 
7848     /* Set values to pass back */
7849 
7850     *codeptr = code;
7851     *ptrptr = ptr;
7852     *firstcuptr = firstcu;
7853     *firstcuflagsptr = firstcuflags;
7854     *reqcuptr = reqcu;
7855     *reqcuflagsptr = reqcuflags;
7856     if (lengthptr != NULL)
7857       {
7858       if (OFLOW_MAX - *lengthptr < length)
7859         {
7860         *errorcodeptr = ERR20;
7861         return FALSE;
7862         }
7863       *lengthptr += length;
7864       }
7865     return TRUE;
7866     }
7867 
7868   /* Another branch follows. In the pre-compile phase, we can move the code
7869   pointer back to where it was for the start of the first branch. (That is,
7870   pretend that each branch is the only one.)
7871 
7872   In the real compile phase, insert an ALT node. Its length field points back
7873   to the previous branch while the bracket remains open. At the end the chain
7874   is reversed. It's done like this so that the start of the bracket has a
7875   zero offset until it is closed, making it possible to detect recursion. */
7876 
7877   if (lengthptr != NULL)
7878     {
7879     code = *codeptr + 1 + LINK_SIZE + skipunits;
7880     length += 1 + LINK_SIZE;
7881     }
7882   else
7883     {
7884     *code = OP_ALT;
7885     PUT(code, 1, (int)(code - last_branch));
7886     bc.current_branch = last_branch = code;
7887     code += 1 + LINK_SIZE;
7888     }
7889 
7890   /* Advance past the vertical bar */
7891 
7892   ptr++;
7893   }
7894 /* Control never reaches here */
7895 }
7896 
7897 
7898 
7899 /*************************************************
7900 *          Check for anchored pattern            *
7901 *************************************************/
7902 
7903 /* Try to find out if this is an anchored regular expression. Consider each
7904 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
7905 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
7906 it's anchored. However, if this is a multiline pattern, then only OP_SOD will
7907 be found, because ^ generates OP_CIRCM in that mode.
7908 
7909 We can also consider a regex to be anchored if OP_SOM starts all its branches.
7910 This is the code for \G, which means "match at start of match position, taking
7911 into account the match offset".
7912 
7913 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
7914 because that will try the rest of the pattern at all possible matching points,
7915 so there is no point trying again.... er ....
7916 
7917 .... except when the .* appears inside capturing parentheses, and there is a
7918 subsequent back reference to those parentheses. We haven't enough information
7919 to catch that case precisely.
7920 
7921 At first, the best we could do was to detect when .* was in capturing brackets
7922 and the highest back reference was greater than or equal to that level.
7923 However, by keeping a bitmap of the first 31 back references, we can catch some
7924 of the more common cases more precisely.
7925 
7926 ... A second exception is when the .* appears inside an atomic group, because
7927 this prevents the number of characters it matches from being adjusted.
7928 
7929 Arguments:
7930   code           points to start of the compiled pattern
7931   bracket_map    a bitmap of which brackets we are inside while testing; this
7932                    handles up to substring 31; after that we just have to take
7933                    the less precise approach
7934   cb             points to the compile data block
7935   atomcount      atomic group level
7936 
7937 Returns:     TRUE or FALSE
7938 */
7939 
7940 static BOOL
is_anchored(register PCRE2_SPTR code,unsigned int bracket_map,compile_block * cb,int atomcount)7941 is_anchored(register PCRE2_SPTR code, unsigned int bracket_map,
7942   compile_block *cb, int atomcount)
7943 {
7944 do {
7945    PCRE2_SPTR scode = first_significant_code(
7946      code + PRIV(OP_lengths)[*code], FALSE);
7947    register int op = *scode;
7948 
7949    /* Non-capturing brackets */
7950 
7951    if (op == OP_BRA  || op == OP_BRAPOS ||
7952        op == OP_SBRA || op == OP_SBRAPOS)
7953      {
7954      if (!is_anchored(scode, bracket_map, cb, atomcount)) return FALSE;
7955      }
7956 
7957    /* Capturing brackets */
7958 
7959    else if (op == OP_CBRA  || op == OP_CBRAPOS ||
7960             op == OP_SCBRA || op == OP_SCBRAPOS)
7961      {
7962      int n = GET2(scode, 1+LINK_SIZE);
7963      int new_map = bracket_map | ((n < 32)? (1u << n) : 1);
7964      if (!is_anchored(scode, new_map, cb, atomcount)) return FALSE;
7965      }
7966 
7967    /* Positive forward assertions and conditions */
7968 
7969    else if (op == OP_ASSERT || op == OP_COND)
7970      {
7971      if (!is_anchored(scode, bracket_map, cb, atomcount)) return FALSE;
7972      }
7973 
7974    /* Atomic groups */
7975 
7976    else if (op == OP_ONCE || op == OP_ONCE_NC)
7977      {
7978      if (!is_anchored(scode, bracket_map, cb, atomcount + 1))
7979        return FALSE;
7980      }
7981 
7982    /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
7983    it isn't in brackets that are or may be referenced or inside an atomic
7984    group. There is also an option that disables auto-anchoring. */
7985 
7986    else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
7987              op == OP_TYPEPOSSTAR))
7988      {
7989      if (scode[1] != OP_ALLANY || (bracket_map & cb->backref_map) != 0 ||
7990          atomcount > 0 || cb->had_pruneorskip ||
7991          (cb->external_options & PCRE2_NO_DOTSTAR_ANCHOR) != 0)
7992        return FALSE;
7993      }
7994 
7995    /* Check for explicit anchoring */
7996 
7997    else if (op != OP_SOD && op != OP_SOM && op != OP_CIRC) return FALSE;
7998 
7999    code += GET(code, 1);
8000    }
8001 while (*code == OP_ALT);   /* Loop for each alternative */
8002 return TRUE;
8003 }
8004 
8005 
8006 
8007 /*************************************************
8008 *         Check for starting with ^ or .*        *
8009 *************************************************/
8010 
8011 /* This is called to find out if every branch starts with ^ or .* so that
8012 "first char" processing can be done to speed things up in multiline
8013 matching and for non-DOTALL patterns that start with .* (which must start at
8014 the beginning or after \n). As in the case of is_anchored() (see above), we
8015 have to take account of back references to capturing brackets that contain .*
8016 because in that case we can't make the assumption. Also, the appearance of .*
8017 inside atomic brackets or in a pattern that contains *PRUNE or *SKIP does not
8018 count, because once again the assumption no longer holds.
8019 
8020 Arguments:
8021   code           points to start of the compiled pattern or a group
8022   bracket_map    a bitmap of which brackets we are inside while testing; this
8023                    handles up to substring 31; after that we just have to take
8024                    the less precise approach
8025   cb             points to the compile data
8026   atomcount      atomic group level
8027 
8028 Returns:         TRUE or FALSE
8029 */
8030 
8031 static BOOL
is_startline(PCRE2_SPTR code,unsigned int bracket_map,compile_block * cb,int atomcount)8032 is_startline(PCRE2_SPTR code, unsigned int bracket_map, compile_block *cb,
8033   int atomcount)
8034 {
8035 do {
8036    PCRE2_SPTR scode = first_significant_code(
8037      code + PRIV(OP_lengths)[*code], FALSE);
8038    register int op = *scode;
8039 
8040    /* If we are at the start of a conditional assertion group, *both* the
8041    conditional assertion *and* what follows the condition must satisfy the test
8042    for start of line. Other kinds of condition fail. Note that there may be an
8043    auto-callout at the start of a condition. */
8044 
8045    if (op == OP_COND)
8046      {
8047      scode += 1 + LINK_SIZE;
8048 
8049      if (*scode == OP_CALLOUT) scode += PRIV(OP_lengths)[OP_CALLOUT];
8050        else if (*scode == OP_CALLOUT_STR) scode += GET(scode, 1 + 2*LINK_SIZE);
8051 
8052      switch (*scode)
8053        {
8054        case OP_CREF:
8055        case OP_DNCREF:
8056        case OP_RREF:
8057        case OP_DNRREF:
8058        case OP_FAIL:
8059        case OP_FALSE:
8060        case OP_TRUE:
8061        return FALSE;
8062 
8063        default:     /* Assertion */
8064        if (!is_startline(scode, bracket_map, cb, atomcount)) return FALSE;
8065        do scode += GET(scode, 1); while (*scode == OP_ALT);
8066        scode += 1 + LINK_SIZE;
8067        break;
8068        }
8069      scode = first_significant_code(scode, FALSE);
8070      op = *scode;
8071      }
8072 
8073    /* Non-capturing brackets */
8074 
8075    if (op == OP_BRA  || op == OP_BRAPOS ||
8076        op == OP_SBRA || op == OP_SBRAPOS)
8077      {
8078      if (!is_startline(scode, bracket_map, cb, atomcount)) return FALSE;
8079      }
8080 
8081    /* Capturing brackets */
8082 
8083    else if (op == OP_CBRA  || op == OP_CBRAPOS ||
8084             op == OP_SCBRA || op == OP_SCBRAPOS)
8085      {
8086      int n = GET2(scode, 1+LINK_SIZE);
8087      int new_map = bracket_map | ((n < 32)? (1u << n) : 1);
8088      if (!is_startline(scode, new_map, cb, atomcount)) return FALSE;
8089      }
8090 
8091    /* Positive forward assertions */
8092 
8093    else if (op == OP_ASSERT)
8094      {
8095      if (!is_startline(scode, bracket_map, cb, atomcount)) return FALSE;
8096      }
8097 
8098    /* Atomic brackets */
8099 
8100    else if (op == OP_ONCE || op == OP_ONCE_NC)
8101      {
8102      if (!is_startline(scode, bracket_map, cb, atomcount + 1)) return FALSE;
8103      }
8104 
8105    /* .* means "start at start or after \n" if it isn't in atomic brackets or
8106    brackets that may be referenced, as long as the pattern does not contain
8107    *PRUNE or *SKIP, because these break the feature. Consider, for example,
8108    /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab", i.e. not at the
8109    start of a line. There is also an option that disables this optimization. */
8110 
8111    else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
8112      {
8113      if (scode[1] != OP_ANY || (bracket_map & cb->backref_map) != 0 ||
8114          atomcount > 0 || cb->had_pruneorskip ||
8115          (cb->external_options & PCRE2_NO_DOTSTAR_ANCHOR) != 0)
8116        return FALSE;
8117      }
8118 
8119    /* Check for explicit circumflex; anything else gives a FALSE result. Note
8120    in particular that this includes atomic brackets OP_ONCE and OP_ONCE_NC
8121    because the number of characters matched by .* cannot be adjusted inside
8122    them. */
8123 
8124    else if (op != OP_CIRC && op != OP_CIRCM) return FALSE;
8125 
8126    /* Move on to the next alternative */
8127 
8128    code += GET(code, 1);
8129    }
8130 while (*code == OP_ALT);  /* Loop for each alternative */
8131 return TRUE;
8132 }
8133 
8134 
8135 
8136 /*************************************************
8137 *    Check for asserted fixed first code unit    *
8138 *************************************************/
8139 
8140 /* During compilation, the "first code unit" settings from forward assertions
8141 are discarded, because they can cause conflicts with actual literals that
8142 follow. However, if we end up without a first code unit setting for an
8143 unanchored pattern, it is worth scanning the regex to see if there is an
8144 initial asserted first code unit. If all branches start with the same asserted
8145 code unit, or with a non-conditional bracket all of whose alternatives start
8146 with the same asserted code unit (recurse ad lib), then we return that code
8147 unit, with the flags set to zero or REQ_CASELESS; otherwise return zero with
8148 REQ_NONE in the flags.
8149 
8150 Arguments:
8151   code       points to start of compiled pattern
8152   flags      points to the first code unit flags
8153   inassert   TRUE if in an assertion
8154 
8155 Returns:     the fixed first code unit, or 0 with REQ_NONE in flags
8156 */
8157 
8158 static uint32_t
find_firstassertedcu(PCRE2_SPTR code,int32_t * flags,BOOL inassert)8159 find_firstassertedcu(PCRE2_SPTR code, int32_t *flags, BOOL inassert)
8160 {
8161 register uint32_t c = 0;
8162 int cflags = REQ_NONE;
8163 
8164 *flags = REQ_NONE;
8165 do {
8166    uint32_t d;
8167    int dflags;
8168    int xl = (*code == OP_CBRA || *code == OP_SCBRA ||
8169              *code == OP_CBRAPOS || *code == OP_SCBRAPOS)? IMM2_SIZE:0;
8170    PCRE2_SPTR scode = first_significant_code(code + 1+LINK_SIZE + xl, TRUE);
8171    register PCRE2_UCHAR op = *scode;
8172 
8173    switch(op)
8174      {
8175      default:
8176      return 0;
8177 
8178      case OP_BRA:
8179      case OP_BRAPOS:
8180      case OP_CBRA:
8181      case OP_SCBRA:
8182      case OP_CBRAPOS:
8183      case OP_SCBRAPOS:
8184      case OP_ASSERT:
8185      case OP_ONCE:
8186      case OP_ONCE_NC:
8187      d = find_firstassertedcu(scode, &dflags, op == OP_ASSERT);
8188      if (dflags < 0)
8189        return 0;
8190      if (cflags < 0) { c = d; cflags = dflags; }
8191        else if (c != d || cflags != dflags) return 0;
8192      break;
8193 
8194      case OP_EXACT:
8195      scode += IMM2_SIZE;
8196      /* Fall through */
8197 
8198      case OP_CHAR:
8199      case OP_PLUS:
8200      case OP_MINPLUS:
8201      case OP_POSPLUS:
8202      if (!inassert) return 0;
8203      if (cflags < 0) { c = scode[1]; cflags = 0; }
8204        else if (c != scode[1]) return 0;
8205      break;
8206 
8207      case OP_EXACTI:
8208      scode += IMM2_SIZE;
8209      /* Fall through */
8210 
8211      case OP_CHARI:
8212      case OP_PLUSI:
8213      case OP_MINPLUSI:
8214      case OP_POSPLUSI:
8215      if (!inassert) return 0;
8216      if (cflags < 0) { c = scode[1]; cflags = REQ_CASELESS; }
8217        else if (c != scode[1]) return 0;
8218      break;
8219      }
8220 
8221    code += GET(code, 1);
8222    }
8223 while (*code == OP_ALT);
8224 
8225 *flags = cflags;
8226 return c;
8227 }
8228 
8229 
8230 
8231 /*************************************************
8232 *     Add an entry to the name/number table      *
8233 *************************************************/
8234 
8235 /* This function is called between compiling passes to add an entry to the
8236 name/number table, maintaining alphabetical order. Checking for permitted
8237 and forbidden duplicates has already been done.
8238 
8239 Arguments:
8240   cb           the compile data block
8241   name         the name to add
8242   length       the length of the name
8243   groupno      the group number
8244 
8245 Returns:       nothing
8246 */
8247 
8248 static void
add_name_to_table(compile_block * cb,PCRE2_SPTR name,int length,unsigned int groupno)8249 add_name_to_table(compile_block *cb, PCRE2_SPTR name, int length,
8250   unsigned int groupno)
8251 {
8252 int i;
8253 PCRE2_UCHAR *slot = cb->name_table;
8254 
8255 for (i = 0; i < cb->names_found; i++)
8256   {
8257   int crc = memcmp(name, slot+IMM2_SIZE, CU2BYTES(length));
8258   if (crc == 0 && slot[IMM2_SIZE+length] != 0)
8259     crc = -1; /* Current name is a substring */
8260 
8261   /* Make space in the table and break the loop for an earlier name. For a
8262   duplicate or later name, carry on. We do this for duplicates so that in the
8263   simple case (when ?(| is not used) they are in order of their numbers. In all
8264   cases they are in the order in which they appear in the pattern. */
8265 
8266   if (crc < 0)
8267     {
8268     memmove(slot + cb->name_entry_size, slot,
8269       CU2BYTES((cb->names_found - i) * cb->name_entry_size));
8270     break;
8271     }
8272 
8273   /* Continue the loop for a later or duplicate name */
8274 
8275   slot += cb->name_entry_size;
8276   }
8277 
8278 PUT2(slot, 0, groupno);
8279 memcpy(slot + IMM2_SIZE, name, CU2BYTES(length));
8280 cb->names_found++;
8281 
8282 /* Add a terminating zero and fill the rest of the slot with zeroes so that
8283 the memory is all initialized. Otherwise valgrind moans about uninitialized
8284 memory when saving serialized compiled patterns. */
8285 
8286 memset(slot + IMM2_SIZE + length, 0,
8287   CU2BYTES(cb->name_entry_size - length - IMM2_SIZE));
8288 }
8289 
8290 
8291 
8292 /*************************************************
8293 *     External function to compile a pattern     *
8294 *************************************************/
8295 
8296 /* This function reads a regular expression in the form of a string and returns
8297 a pointer to a block of store holding a compiled version of the expression.
8298 
8299 Arguments:
8300   pattern       the regular expression
8301   patlen        the length of the pattern, or PCRE2_ZERO_TERMINATED
8302   options       option bits
8303   errorptr      pointer to errorcode
8304   erroroffset   pointer to error offset
8305   ccontext      points to a compile context or is NULL
8306 
8307 Returns:        pointer to compiled data block, or NULL on error,
8308                 with errorcode and erroroffset set
8309 */
8310 
8311 PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
pcre2_compile(PCRE2_SPTR pattern,PCRE2_SIZE patlen,uint32_t options,int * errorptr,PCRE2_SIZE * erroroffset,pcre2_compile_context * ccontext)8312 pcre2_compile(PCRE2_SPTR pattern, PCRE2_SIZE patlen, uint32_t options,
8313    int *errorptr, PCRE2_SIZE *erroroffset, pcre2_compile_context *ccontext)
8314 {
8315 BOOL utf;                               /* Set TRUE for UTF mode */
8316 pcre2_real_code *re = NULL;             /* What we will return */
8317 compile_block cb;                       /* "Static" compile-time data */
8318 const uint8_t *tables;                  /* Char tables base pointer */
8319 
8320 PCRE2_UCHAR *code;                      /* Current pointer in compiled code */
8321 PCRE2_SPTR codestart;                   /* Start of compiled code */
8322 PCRE2_SPTR ptr;                         /* Current pointer in pattern */
8323 
8324 size_t length = 1;                      /* Allow or final END opcode */
8325 size_t usedlength;                      /* Actual length used */
8326 size_t re_blocksize;                    /* Size of memory block */
8327 
8328 int32_t firstcuflags, reqcuflags;       /* Type of first/req code unit */
8329 uint32_t firstcu, reqcu;                /* Value of first/req code unit */
8330 uint32_t setflags = 0;                  /* NL and BSR set flags */
8331 
8332 uint32_t skipatstart;                   /* When checking (*UTF) etc */
8333 uint32_t limit_match = UINT32_MAX;      /* Unset match limits */
8334 uint32_t limit_recursion = UINT32_MAX;
8335 
8336 int newline = 0;                        /* Unset; can be set by the pattern */
8337 int bsr = 0;                            /* Unset; can be set by the pattern */
8338 int errorcode = 0;                      /* Initialize to avoid compiler warn */
8339 
8340 /* Comments at the head of this file explain about these variables. */
8341 
8342 PCRE2_UCHAR *copied_pattern = NULL;
8343 PCRE2_UCHAR stack_copied_pattern[COPIED_PATTERN_SIZE];
8344 named_group named_groups[NAMED_GROUP_LIST_SIZE];
8345 
8346 /* The workspace is used in different ways in the different compiling phases.
8347 It needs to be 16-bit aligned for the preliminary group scan, and 32-bit
8348 aligned for the group information cache. */
8349 
8350 uint32_t c32workspace[C32_WORK_SIZE];
8351 PCRE2_UCHAR *cworkspace = (PCRE2_UCHAR *)c32workspace;
8352 
8353 
8354 /* -------------- Check arguments and set up the pattern ----------------- */
8355 
8356 /* There must be error code and offset pointers. */
8357 
8358 if (errorptr == NULL || erroroffset == NULL) return NULL;
8359 *errorptr = ERR0;
8360 *erroroffset = 0;
8361 
8362 /* There must be a pattern! */
8363 
8364 if (pattern == NULL)
8365   {
8366   *errorptr = ERR16;
8367   return NULL;
8368   }
8369 
8370 /* Check that all undefined public option bits are zero. */
8371 
8372 if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0)
8373   {
8374   *errorptr = ERR17;
8375   return NULL;
8376   }
8377 
8378 /* A NULL compile context means "use a default context" */
8379 
8380 if (ccontext == NULL)
8381   ccontext = (pcre2_compile_context *)(&PRIV(default_compile_context));
8382 
8383 /* A zero-terminated pattern is indicated by the special length value
8384 PCRE2_ZERO_TERMINATED. Otherwise, we make a copy of the pattern and add a zero,
8385 to ensure that it is always possible to look one code unit beyond the end of
8386 the pattern's characters. In both cases, check whether the pattern is overlong. */
8387 
8388 if (patlen == PCRE2_ZERO_TERMINATED)
8389   {
8390   patlen = PRIV(strlen)(pattern);
8391   if (patlen > ccontext->max_pattern_length)
8392     {
8393     *errorptr = ERR88;
8394     return NULL;
8395     }
8396   }
8397 else
8398   {
8399   if (patlen > ccontext->max_pattern_length)
8400     {
8401     *errorptr = ERR88;
8402     return NULL;
8403     }
8404   if (patlen < COPIED_PATTERN_SIZE)
8405     copied_pattern = stack_copied_pattern;
8406   else
8407     {
8408     copied_pattern = ccontext->memctl.malloc(CU2BYTES(patlen + 1),
8409       ccontext->memctl.memory_data);
8410     if (copied_pattern == NULL)
8411       {
8412       *errorptr = ERR21;
8413       return NULL;
8414       }
8415     }
8416   memcpy(copied_pattern, pattern, CU2BYTES(patlen));
8417   copied_pattern[patlen] = 0;
8418   pattern = copied_pattern;
8419   }
8420 
8421 /* ------------ Initialize the "static" compile data -------------- */
8422 
8423 tables = (ccontext->tables != NULL)? ccontext->tables : PRIV(default_tables);
8424 
8425 cb.lcc = tables + lcc_offset;          /* Individual */
8426 cb.fcc = tables + fcc_offset;          /*   character */
8427 cb.cbits = tables + cbits_offset;      /*      tables */
8428 cb.ctypes = tables + ctypes_offset;
8429 
8430 cb.assert_depth = 0;
8431 cb.bracount = cb.final_bracount = 0;
8432 cb.cx = ccontext;
8433 cb.dupnames = FALSE;
8434 cb.end_pattern = pattern + patlen;
8435 cb.nestptr[0] = cb.nestptr[1] = NULL;
8436 cb.external_flags = 0;
8437 cb.external_options = options;
8438 cb.groupinfo = c32workspace;
8439 cb.had_recurse = FALSE;
8440 cb.iscondassert = FALSE;
8441 cb.max_lookbehind = 0;
8442 cb.name_entry_size = 0;
8443 cb.name_table = NULL;
8444 cb.named_groups = named_groups;
8445 cb.named_group_list_size = NAMED_GROUP_LIST_SIZE;
8446 cb.names_found = 0;
8447 cb.open_caps = NULL;
8448 cb.parens_depth = 0;
8449 cb.req_varyopt = 0;
8450 cb.start_code = cworkspace;
8451 cb.start_pattern = pattern;
8452 cb.start_workspace = cworkspace;
8453 cb.workspace_size = COMPILE_WORK_SIZE;
8454 
8455 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
8456 references to help in deciding whether (.*) can be treated as anchored or not.
8457 */
8458 
8459 cb.top_backref = 0;
8460 cb.backref_map = 0;
8461 
8462 /* --------------- Start looking at the pattern --------------- */
8463 
8464 /* Check for global one-time option settings at the start of the pattern, and
8465 remember the offset to the actual regex. */
8466 
8467 ptr = pattern;
8468 skipatstart = 0;
8469 
8470 while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
8471        ptr[skipatstart+1] == CHAR_ASTERISK)
8472   {
8473   unsigned int i;
8474   for (i = 0; i < sizeof(pso_list)/sizeof(pso); i++)
8475     {
8476     pso *p = pso_list + i;
8477 
8478     if (PRIV(strncmp_c8)(ptr+skipatstart+2, (char *)(p->name), p->length) == 0)
8479       {
8480       uint32_t c, pp;
8481 
8482       skipatstart += p->length + 2;
8483       switch(p->type)
8484         {
8485         case PSO_OPT:
8486         cb.external_options |= p->value;
8487         break;
8488 
8489         case PSO_FLG:
8490         setflags |= p->value;
8491         break;
8492 
8493         case PSO_NL:
8494         newline = p->value;
8495         setflags |= PCRE2_NL_SET;
8496         break;
8497 
8498         case PSO_BSR:
8499         bsr = p->value;
8500         setflags |= PCRE2_BSR_SET;
8501         break;
8502 
8503         case PSO_LIMM:
8504         case PSO_LIMR:
8505         c = 0;
8506         pp = skipatstart;
8507         if (!IS_DIGIT(ptr[pp]))
8508           {
8509           errorcode = ERR60;
8510           ptr += pp;
8511           goto HAD_ERROR;
8512           }
8513         while (IS_DIGIT(ptr[pp]))
8514           {
8515           if (c > UINT32_MAX / 10 - 1) break;   /* Integer overflow */
8516           c = c*10 + (ptr[pp++] - CHAR_0);
8517           }
8518         if (ptr[pp++] != CHAR_RIGHT_PARENTHESIS)
8519           {
8520           errorcode = ERR60;
8521           ptr += pp;
8522           goto HAD_ERROR;
8523           }
8524         if (p->type == PSO_LIMM) limit_match = c;
8525           else limit_recursion = c;
8526         skipatstart += pp - skipatstart;
8527         break;
8528         }
8529       break;   /* Out of the table scan loop */
8530       }
8531     }
8532   if (i >= sizeof(pso_list)/sizeof(pso)) break;   /* Out of pso loop */
8533   }
8534 
8535 /* End of pattern-start options; advance to start of real regex. */
8536 
8537 ptr += skipatstart;
8538 
8539 /* Can't support UTF or UCP unless PCRE2 has been compiled with UTF support. */
8540 
8541 #ifndef SUPPORT_UNICODE
8542 if ((cb.external_options & (PCRE2_UTF|PCRE2_UCP)) != 0)
8543   {
8544   errorcode = ERR32;
8545   goto HAD_ERROR;
8546   }
8547 #endif
8548 
8549 /* Check UTF. We have the original options in 'options', with that value as
8550 modified by (*UTF) etc in cb.external_options. */
8551 
8552 utf = (cb.external_options & PCRE2_UTF) != 0;
8553 if (utf)
8554   {
8555   if ((options & PCRE2_NEVER_UTF) != 0)
8556     {
8557     errorcode = ERR74;
8558     goto HAD_ERROR;
8559     }
8560   if ((options & PCRE2_NO_UTF_CHECK) == 0 &&
8561        (errorcode = PRIV(valid_utf)(pattern, patlen, erroroffset)) != 0)
8562     goto HAD_UTF_ERROR;
8563   }
8564 
8565 /* Check UCP lockout. */
8566 
8567 if ((cb.external_options & (PCRE2_UCP|PCRE2_NEVER_UCP)) ==
8568     (PCRE2_UCP|PCRE2_NEVER_UCP))
8569   {
8570   errorcode = ERR75;
8571   goto HAD_ERROR;
8572   }
8573 
8574 /* Process the BSR setting. */
8575 
8576 if (bsr == 0) bsr = ccontext->bsr_convention;
8577 
8578 /* Process the newline setting. */
8579 
8580 if (newline == 0) newline = ccontext->newline_convention;
8581 cb.nltype = NLTYPE_FIXED;
8582 switch(newline)
8583   {
8584   case PCRE2_NEWLINE_CR:
8585   cb.nllen = 1;
8586   cb.nl[0] = CHAR_CR;
8587   break;
8588 
8589   case PCRE2_NEWLINE_LF:
8590   cb.nllen = 1;
8591   cb.nl[0] = CHAR_NL;
8592   break;
8593 
8594   case PCRE2_NEWLINE_CRLF:
8595   cb.nllen = 2;
8596   cb.nl[0] = CHAR_CR;
8597   cb.nl[1] = CHAR_NL;
8598   break;
8599 
8600   case PCRE2_NEWLINE_ANY:
8601   cb.nltype = NLTYPE_ANY;
8602   break;
8603 
8604   case PCRE2_NEWLINE_ANYCRLF:
8605   cb.nltype = NLTYPE_ANYCRLF;
8606   break;
8607 
8608   default:
8609   errorcode = ERR56;
8610   goto HAD_ERROR;
8611   }
8612 
8613 /* Before we do anything else, do a pre-scan of the pattern in order to
8614 discover the named groups and their numerical equivalents, so that this
8615 information is always available for the remaining processing. */
8616 
8617 errorcode = scan_for_captures(&ptr, cb.external_options, &cb);
8618 if (errorcode != 0) goto HAD_ERROR;
8619 
8620 /* For obscure debugging this code can be enabled. */
8621 
8622 #if 0
8623   {
8624   int i;
8625   named_group *ng = cb.named_groups;
8626   fprintf(stderr, "+++Captures: %d\n", cb.final_bracount);
8627   for (i = 0; i < cb.names_found; i++, ng++)
8628     {
8629     fprintf(stderr, "+++%3d %.*s\n", ng->number, ng->length, ng->name);
8630     }
8631   }
8632 #endif
8633 
8634 /* Reset current bracket count to zero and current pointer to the start of the
8635 pattern. */
8636 
8637 cb.bracount = 0;
8638 ptr = pattern + skipatstart;
8639 
8640 /* Pretend to compile the pattern while actually just accumulating the amount
8641 of memory required in the 'length' variable. This behaviour is triggered by
8642 passing a non-NULL final argument to compile_regex(). We pass a block of
8643 workspace (cworkspace) for it to compile parts of the pattern into; the
8644 compiled code is discarded when it is no longer needed, so hopefully this
8645 workspace will never overflow, though there is a test for its doing so.
8646 
8647 On error, errorcode will be set non-zero, so we don't need to look at the
8648 result of the function. The initial options have been put into the cb block so
8649 that they can be changed if an option setting is found within the regex right
8650 at the beginning. Bringing initial option settings outside can help speed up
8651 starting point checks. We still have to pass a separate options variable (the
8652 first argument) because that may change as the pattern is processed. */
8653 
8654 code = cworkspace;
8655 *code = OP_BRA;
8656 
8657 (void)compile_regex(cb.external_options, &code, &ptr, &errorcode, FALSE,
8658   FALSE, 0, 0, &firstcu, &firstcuflags, &reqcu, &reqcuflags, NULL,
8659   &cb, &length);
8660 
8661 if (errorcode != 0) goto HAD_ERROR;
8662 if (length > MAX_PATTERN_SIZE)
8663   {
8664   errorcode = ERR20;
8665   goto HAD_ERROR;
8666   }
8667 
8668 /* Compute the size of, and then get and initialize, the data block for storing
8669 the compiled pattern and names table. Integer overflow should no longer be
8670 possible because nowadays we limit the maximum value of cb.names_found and
8671 cb.name_entry_size. */
8672 
8673 re_blocksize = sizeof(pcre2_real_code) +
8674   CU2BYTES(length + cb.names_found * cb.name_entry_size);
8675 re = (pcre2_real_code *)
8676   ccontext->memctl.malloc(re_blocksize, ccontext->memctl.memory_data);
8677 if (re == NULL)
8678   {
8679   errorcode = ERR21;
8680   goto HAD_ERROR;
8681   }
8682 
8683 re->memctl = ccontext->memctl;
8684 re->tables = tables;
8685 re->executable_jit = NULL;
8686 memset(re->start_bitmap, 0, 32 * sizeof(uint8_t));
8687 re->blocksize = re_blocksize;
8688 re->magic_number = MAGIC_NUMBER;
8689 re->compile_options = options;
8690 re->overall_options = cb.external_options;
8691 re->flags = PCRE2_CODE_UNIT_WIDTH/8 | cb.external_flags | setflags;
8692 re->limit_match = limit_match;
8693 re->limit_recursion = limit_recursion;
8694 re->first_codeunit = 0;
8695 re->last_codeunit = 0;
8696 re->bsr_convention = bsr;
8697 re->newline_convention = newline;
8698 re->max_lookbehind = 0;
8699 re->minlength = 0;
8700 re->top_bracket = 0;
8701 re->top_backref = 0;
8702 re->name_entry_size = cb.name_entry_size;
8703 re->name_count = cb.names_found;
8704 
8705 /* The basic block is immediately followed by the name table, and the compiled
8706 code follows after that. */
8707 
8708 codestart = (PCRE2_SPTR)((uint8_t *)re + sizeof(pcre2_real_code)) +
8709   re->name_entry_size * re->name_count;
8710 
8711 /* Workspace is needed to remember information about numbered groups: whether a
8712 group can match an empty string and what its fixed length is. This is done to
8713 avoid the possibility of recursive references causing very long compile times
8714 when checking these features. Unnumbered groups do not have this exposure since
8715 they cannot be referenced. We use an indexed vector for this purpose. If there
8716 are sufficiently few groups, it can be the c32workspace vector, as set up
8717 above. Otherwise we have to get/free a special vector. The vector must be
8718 initialized to zero. */
8719 
8720 if (cb.final_bracount >= C32_WORK_SIZE)
8721   {
8722   cb.groupinfo = ccontext->memctl.malloc(
8723     (cb.final_bracount + 1)*sizeof(uint32_t), ccontext->memctl.memory_data);
8724   if (cb.groupinfo == NULL)
8725     {
8726     errorcode = ERR21;
8727     goto HAD_ERROR;
8728     }
8729   }
8730 memset(cb.groupinfo, 0, (cb.final_bracount + 1) * sizeof(uint32_t));
8731 
8732 /* Update the compile data block for the actual compile. The starting points of
8733 the name/number translation table and of the code are passed around in the
8734 compile data block. The start/end pattern and initial options are already set
8735 from the pre-compile phase, as is the name_entry_size field. Reset the bracket
8736 count and the names_found field. */
8737 
8738 cb.parens_depth = 0;
8739 cb.assert_depth = 0;
8740 cb.bracount = 0;
8741 cb.max_lookbehind = 0;
8742 cb.name_table = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code));
8743 cb.start_code = codestart;
8744 cb.iscondassert = FALSE;
8745 cb.req_varyopt = 0;
8746 cb.had_accept = FALSE;
8747 cb.had_pruneorskip = FALSE;
8748 cb.check_lookbehind = FALSE;
8749 cb.open_caps = NULL;
8750 
8751 /* If any named groups were found, create the name/number table from the list
8752 created in the pre-pass. */
8753 
8754 if (cb.names_found > 0)
8755   {
8756   int i = cb.names_found;
8757   named_group *ng = cb.named_groups;
8758   cb.names_found = 0;
8759   for (; i > 0; i--, ng++)
8760     add_name_to_table(&cb, ng->name, ng->length, ng->number);
8761   }
8762 
8763 /* Set up a starting, non-extracting bracket, then compile the expression. On
8764 error, errorcode will be set non-zero, so we don't need to look at the result
8765 of the function here. */
8766 
8767 ptr = pattern + skipatstart;
8768 code = (PCRE2_UCHAR *)codestart;
8769 *code = OP_BRA;
8770 (void)compile_regex(re->overall_options, &code, &ptr, &errorcode, FALSE, FALSE,
8771    0, 0, &firstcu, &firstcuflags, &reqcu, &reqcuflags, NULL, &cb, NULL);
8772 
8773 re->top_bracket = cb.bracount;
8774 re->top_backref = cb.top_backref;
8775 re->max_lookbehind = cb.max_lookbehind;
8776 
8777 if (cb.had_accept)
8778   {
8779   reqcu = 0;              /* Must disable after (*ACCEPT) */
8780   reqcuflags = REQ_NONE;
8781   }
8782 
8783 /* Fill in the final opcode and check for disastrous overflow. If no overflow,
8784 but the estimated length exceeds the really used length, adjust the value of
8785 re->blocksize, and if valgrind support is configured, mark the extra allocated
8786 memory as unaddressable, so that any out-of-bound reads can be detected. */
8787 
8788 *code++ = OP_END;
8789 usedlength = code - codestart;
8790 if (usedlength > length) errorcode = ERR23; else
8791   {
8792   re->blocksize -= CU2BYTES(length - usedlength);
8793 #ifdef SUPPORT_VALGRIND
8794   VALGRIND_MAKE_MEM_NOACCESS(code, CU2BYTES(length - usedlength));
8795 #endif
8796   }
8797 
8798 /* Scan the pattern for recursion/subroutine calls and convert the group
8799 numbers into offsets. Maintain a small cache so that repeated groups containing
8800 recursions are efficiently handled. */
8801 
8802 #define RSCAN_CACHE_SIZE 8
8803 
8804 if (errorcode == 0 && cb.had_recurse)
8805   {
8806   PCRE2_UCHAR *rcode;
8807   PCRE2_SPTR rgroup;
8808   int ccount = 0;
8809   int start = RSCAN_CACHE_SIZE;
8810   recurse_cache rc[RSCAN_CACHE_SIZE];
8811 
8812   for (rcode = (PCRE2_UCHAR *)find_recurse(codestart, utf);
8813        rcode != NULL;
8814        rcode = (PCRE2_UCHAR *)find_recurse(rcode + 1 + LINK_SIZE, utf))
8815     {
8816     int i, p, recno;
8817 
8818     recno = (int)GET(rcode, 1);
8819     if (recno == 0) rgroup = codestart; else
8820       {
8821       PCRE2_SPTR search_from = codestart;
8822       rgroup = NULL;
8823       for (i = 0, p = start; i < ccount; i++, p = (p + 1) & 7)
8824         {
8825         if (recno == rc[p].recno)
8826           {
8827           rgroup = rc[p].group;
8828           break;
8829           }
8830 
8831         /* Group n+1 must always start to the right of group n, so we can save
8832         search time below when the new group number is greater than any of the
8833         previously found groups. */
8834 
8835         if (recno > rc[p].recno) search_from = rc[p].group;
8836         }
8837 
8838       if (rgroup == NULL)
8839         {
8840         rgroup = PRIV(find_bracket)(search_from, utf, recno);
8841         if (rgroup == NULL)
8842           {
8843           errorcode = ERR53;
8844           break;
8845           }
8846         if (--start < 0) start = RSCAN_CACHE_SIZE - 1;
8847         rc[start].recno = recno;
8848         rc[start].group = rgroup;
8849         if (ccount < RSCAN_CACHE_SIZE) ccount++;
8850         }
8851       }
8852 
8853     PUT(rcode, 1, rgroup - codestart);
8854     }
8855   }
8856 
8857 /* In rare debugging situations we sometimes need to look at the compiled code
8858 at this stage. */
8859 
8860 #ifdef CALL_PRINTINT
8861 pcre2_printint(re, stderr, TRUE);
8862 fprintf(stderr, "Length=%lu Used=%lu\n", length, usedlength);
8863 #endif
8864 
8865 /* After a successful compile, give an error if there's a back reference to a
8866 non-existent capturing subpattern. Then, unless disabled, check whether any
8867 single character iterators can be auto-possessified. The function overwrites
8868 the appropriate opcode values, so the type of the pointer must be cast. NOTE:
8869 the intermediate variable "temp" is used in this code because at least one
8870 compiler gives a warning about loss of "const" attribute if the cast
8871 (PCRE2_UCHAR *)codestart is used directly in the function call. */
8872 
8873 if (errorcode == 0)
8874   {
8875   if (re->top_backref > re->top_bracket) errorcode = ERR15;
8876   else if ((re->overall_options & PCRE2_NO_AUTO_POSSESS) == 0)
8877     {
8878     PCRE2_UCHAR *temp = (PCRE2_UCHAR *)codestart;
8879     if (PRIV(auto_possessify)(temp, utf, &cb) != 0) errorcode = ERR80;
8880     }
8881   }
8882 
8883 /* If there were any lookbehind assertions that contained OP_RECURSE
8884 (recursions or subroutine calls), a flag is set for them to be checked here,
8885 because they may contain forward references. Actual recursions cannot be fixed
8886 length, but subroutine calls can. It is done like this so that those without
8887 OP_RECURSE that are not fixed length get a diagnostic with a useful offset. The
8888 exceptional ones forgo this. We scan the pattern to check that they are fixed
8889 length, and set their lengths. */
8890 
8891 if (errorcode == 0 && cb.check_lookbehind)
8892   {
8893   PCRE2_UCHAR *cc = (PCRE2_UCHAR *)codestart;
8894 
8895   /* Loop, searching for OP_REVERSE items, and process those that do not have
8896   their length set. (Actually, it will also re-process any that have a length
8897   of zero, but that is a pathological case, and it does no harm.) When we find
8898   one, we temporarily terminate the branch it is in while we scan it. Note that
8899   calling find_bracket() with a negative group number returns a pointer to the
8900   OP_REVERSE item, not the actual lookbehind. */
8901 
8902   for (cc = (PCRE2_UCHAR *)PRIV(find_bracket)(codestart, utf, -1);
8903        cc != NULL;
8904        cc = (PCRE2_UCHAR *)PRIV(find_bracket)(cc, utf, -1))
8905     {
8906     if (GET(cc, 1) == 0)
8907       {
8908       int fixed_length;
8909       int count = 0;
8910       PCRE2_UCHAR *be = cc - 1 - LINK_SIZE + GET(cc, -LINK_SIZE);
8911       int end_op = *be;
8912       *be = OP_END;
8913       fixed_length = find_fixedlength(cc, utf, TRUE, &cb, NULL, &count);
8914       *be = end_op;
8915       if (fixed_length < 0)
8916         {
8917         errorcode = fixed_length_errors[-fixed_length];
8918         break;
8919         }
8920       if (fixed_length > cb.max_lookbehind) cb.max_lookbehind = fixed_length;
8921       PUT(cc, 1, fixed_length);
8922       }
8923     cc += 1 + LINK_SIZE;
8924     }
8925 
8926   /* The previous value of the maximum lookbehind was transferred to the
8927   compiled regex block above. We could have updated this value in the loop
8928   above, but keep the two values in step, just in case some later code below
8929   uses the cb value. */
8930 
8931   re->max_lookbehind = cb.max_lookbehind;
8932   }
8933 
8934 /* Failed to compile, or error while post-processing. Earlier errors get here
8935 via the dreaded goto. */
8936 
8937 if (errorcode != 0)
8938   {
8939   HAD_ERROR:
8940   *erroroffset = (int)(ptr - pattern);
8941   HAD_UTF_ERROR:
8942   *errorptr = errorcode;
8943   pcre2_code_free(re);
8944   re = NULL;
8945   goto EXIT;
8946   }
8947 
8948 /* Successful compile. If the anchored option was not passed, set it if
8949 we can determine that the pattern is anchored by virtue of ^ characters or \A
8950 or anything else, such as starting with non-atomic .* when DOTALL is set and
8951 there are no occurrences of *PRUNE or *SKIP (though there is an option to
8952 disable this case). */
8953 
8954 if ((re->overall_options & PCRE2_ANCHORED) == 0 &&
8955      is_anchored(codestart, 0, &cb, 0))
8956   re->overall_options |= PCRE2_ANCHORED;
8957 
8958 /* If the pattern is still not anchored and we do not have a first code unit,
8959 see if there is one that is asserted (these are not saved during the compile
8960 because they can cause conflicts with actual literals that follow). This code
8961 need not be obeyed if PCRE2_NO_START_OPTIMIZE is set, as the data it would
8962 create will not be used. */
8963 
8964 if ((re->overall_options & (PCRE2_ANCHORED|PCRE2_NO_START_OPTIMIZE)) == 0)
8965   {
8966   if (firstcuflags < 0)
8967     firstcu = find_firstassertedcu(codestart, &firstcuflags, FALSE);
8968 
8969   /* Save the data for a first code unit. */
8970 
8971   if (firstcuflags >= 0)
8972     {
8973     re->first_codeunit = firstcu;
8974     re->flags |= PCRE2_FIRSTSET;
8975 
8976     /* Handle caseless first code units. */
8977 
8978     if ((firstcuflags & REQ_CASELESS) != 0)
8979       {
8980       if (firstcu < 128 || (!utf && firstcu < 255))
8981         {
8982         if (cb.fcc[firstcu] != firstcu) re->flags |= PCRE2_FIRSTCASELESS;
8983         }
8984 
8985       /* The first code unit is >= 128 in UTF mode, or >= 255 otherwise. In
8986       8-bit UTF mode, codepoints in the range 128-255 are introductory code
8987       points and cannot have another case. In 16-bit and 32-bit modes, we can
8988       check wide characters when UTF (and therefore UCP) is supported. */
8989 
8990 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
8991       else if (firstcu <= MAX_UTF_CODE_POINT &&
8992                UCD_OTHERCASE(firstcu) != firstcu)
8993         re->flags |= PCRE2_FIRSTCASELESS;
8994 #endif
8995       }
8996     }
8997 
8998   /* When there is no first code unit, see if we can set the PCRE2_STARTLINE
8999   flag. This is helpful for multiline matches when all branches start with ^
9000   and also when all branches start with non-atomic .* for non-DOTALL matches
9001   when *PRUNE and *SKIP are not present. (There is an option that disables this
9002   case.) */
9003 
9004   else if (is_startline(codestart, 0, &cb, 0)) re->flags |= PCRE2_STARTLINE;
9005   }
9006 
9007 /* Handle the "required code unit", if one is set. In the case of an anchored
9008 pattern, do this only if it follows a variable length item in the pattern.
9009 Again, skip this if PCRE2_NO_START_OPTIMIZE is set. */
9010 
9011 if (reqcuflags >= 0 &&
9012      ((re->overall_options & (PCRE2_ANCHORED|PCRE2_NO_START_OPTIMIZE)) == 0 ||
9013       (reqcuflags & REQ_VARY) != 0))
9014   {
9015   re->last_codeunit = reqcu;
9016   re->flags |= PCRE2_LASTSET;
9017 
9018   /* Handle caseless required code units as for first code units (above). */
9019 
9020   if ((reqcuflags & REQ_CASELESS) != 0)
9021     {
9022     if (reqcu < 128 || (!utf && reqcu < 255))
9023       {
9024       if (cb.fcc[reqcu] != reqcu) re->flags |= PCRE2_LASTCASELESS;
9025       }
9026 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
9027     else if (reqcu <= MAX_UTF_CODE_POINT && UCD_OTHERCASE(reqcu) != reqcu)
9028       re->flags |= PCRE2_LASTCASELESS;
9029 #endif
9030     }
9031   }
9032 
9033 /* Check for a pattern that can match an empty string, so that this information
9034 can be provided to applications. */
9035 
9036 do
9037   {
9038   int count = 0;
9039   int rc = could_be_empty_branch(codestart, code, utf, &cb, TRUE, NULL, &count);
9040   if (rc < 0)
9041     {
9042     errorcode = ERR86;
9043     goto HAD_ERROR;
9044     }
9045   if (rc > 0)
9046     {
9047     re->flags |= PCRE2_MATCH_EMPTY;
9048     break;
9049     }
9050   codestart += GET(codestart, 1);
9051   }
9052 while (*codestart == OP_ALT);
9053 
9054 /* Finally, unless PCRE2_NO_START_OPTIMIZE is set, study the compiled pattern
9055 to set up information such as a bitmap of starting code units and a minimum
9056 matching length. */
9057 
9058 if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0 &&
9059     PRIV(study)(re) != 0)
9060   {
9061   errorcode = ERR31;
9062   goto HAD_ERROR;
9063   }
9064 
9065 /* Control ends up here in all cases. If memory was obtained for a
9066 zero-terminated copy of the pattern, remember to free it before returning. Also
9067 free the list of named groups if a larger one had to be obtained, and likewise
9068 the group information vector. */
9069 
9070 EXIT:
9071 if (copied_pattern != stack_copied_pattern)
9072   ccontext->memctl.free(copied_pattern, ccontext->memctl.memory_data);
9073 if (cb.named_group_list_size > NAMED_GROUP_LIST_SIZE)
9074   ccontext->memctl.free((void *)cb.named_groups, ccontext->memctl.memory_data);
9075 if (cb.groupinfo != c32workspace)
9076   ccontext->memctl.free((void *)cb.groupinfo, ccontext->memctl.memory_data);
9077 
9078 return re;    /* Will be NULL after an error */
9079 }
9080 
9081 /* End of pcre2_compile.c */
9082