• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*************************************************
2 *      Perl-Compatible Regular Expressions       *
3 *************************************************/
4 
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7 
8                        Written by Philip Hazel
9      Original API code Copyright (c) 1997-2012 University of Cambridge
10           New API code Copyright (c) 2016-2019 University of Cambridge
11 
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
15 
16     * Redistributions of source code must retain the above copyright notice,
17       this list of conditions and the following disclaimer.
18 
19     * Redistributions in binary form must reproduce the above copyright
20       notice, this list of conditions and the following disclaimer in the
21       documentation and/or other materials provided with the distribution.
22 
23     * Neither the name of the University of Cambridge nor the names of its
24       contributors may be used to endorse or promote products derived from
25       this software without specific prior written permission.
26 
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
39 */
40 
41 
42 #ifdef HAVE_CONFIG_H
43 #include "config.h"
44 #endif
45 
46 #define NLBLOCK cb             /* Block containing newline information */
47 #define PSSTART start_pattern  /* Field containing processed string start */
48 #define PSEND   end_pattern    /* Field containing processed string end */
49 
50 #include "pcre2_internal.h"
51 
52 /* In rare error cases debugging might require calling pcre2_printint(). */
53 
54 #if 0
55 #ifdef EBCDIC
56 #define PRINTABLE(c) ((c) >= 64 && (c) < 255)
57 #else
58 #define PRINTABLE(c) ((c) >= 32 && (c) < 127)
59 #endif
60 #include "pcre2_printint.c"
61 #define DEBUG_CALL_PRINTINT
62 #endif
63 
64 /* Other debugging code can be enabled by these defines. */
65 
66 /* #define DEBUG_SHOW_CAPTURES */
67 /* #define DEBUG_SHOW_PARSED */
68 
69 /* There are a few things that vary with different code unit sizes. Handle them
70 by defining macros in order to minimize #if usage. */
71 
72 #if PCRE2_CODE_UNIT_WIDTH == 8
73 #define STRING_UTFn_RIGHTPAR     STRING_UTF8_RIGHTPAR, 5
74 #define XDIGIT(c)                xdigitab[c]
75 
76 #else  /* Either 16-bit or 32-bit */
77 #define XDIGIT(c)                (MAX_255(c)? xdigitab[c] : 0xff)
78 
79 #if PCRE2_CODE_UNIT_WIDTH == 16
80 #define STRING_UTFn_RIGHTPAR     STRING_UTF16_RIGHTPAR, 6
81 
82 #else  /* 32-bit */
83 #define STRING_UTFn_RIGHTPAR     STRING_UTF32_RIGHTPAR, 6
84 #endif
85 #endif
86 
87 /* Macros to store and retrieve a PCRE2_SIZE value in the parsed pattern, which
88 consists of uint32_t elements. Assume that if uint32_t can't hold it, two of
89 them will be able to (i.e. assume a 64-bit world). */
90 
/* In each macro, s is a PCRE2_SIZE value (or an lvalue that receives one) and
p is a modifiable pointer into the uint32_t parsed pattern vector:

  PUTOFFSET(s,p)       store s at p and advance p past it
  GETOFFSET(s,p)       load s from p and advance p past it
  GETPLUSOFFSET(s,p)   load s from the element(s) following *p, leaving p
                       pointing at the last element that was read
  READPLUSOFFSET(s,p)  load s from the element(s) following *p; p is unchanged
  SKIPOFFSET(p)        advance p over one stored offset
  SIZEOFFSET           number of uint32_t elements occupied by one offset

The parameters are parenthesized so that expression arguments such as "a + b"
expand with the intended precedence. In the two-element form PUTOFFSET
evaluates s twice, so s must not have side effects. */

#if PCRE2_SIZE_MAX <= UINT32_MAX
#define PUTOFFSET(s,p) *(p)++ = (s)
#define GETOFFSET(s,p) (s) = *(p)++
#define GETPLUSOFFSET(s,p) (s) = *(++(p))
#define READPLUSOFFSET(s,p) (s) = (p)[1]
#define SKIPOFFSET(p) (p)++
#define SIZEOFFSET 1
#else
#define PUTOFFSET(s,p) \
  { *(p)++ = (uint32_t)((s) >> 32); *(p)++ = (uint32_t)((s) & 0xffffffff); }
#define GETOFFSET(s,p) \
  { (s) = ((PCRE2_SIZE)(p)[0] << 32) | (PCRE2_SIZE)(p)[1]; (p) += 2; }
#define GETPLUSOFFSET(s,p) \
  { (s) = ((PCRE2_SIZE)(p)[1] << 32) | (PCRE2_SIZE)(p)[2]; (p) += 2; }
#define READPLUSOFFSET(s,p) \
  { (s) = ((PCRE2_SIZE)(p)[1] << 32) | (PCRE2_SIZE)(p)[2]; }
#define SKIPOFFSET(p) (p) += 2
#define SIZEOFFSET 2
#endif
110 
111 /* Macros for manipulating elements of the parsed pattern vector. */
112 
/* META_CODE(x) yields the item code held in the top 16 bits of element x,
META_DATA(x) yields the additional data held in its low 16 bits, and
META_DIFF(x,y) yields the distance between two META codes as a small integer.
The parameters are parenthesized so that expression arguments such as "a | b"
or "a + b" expand with the intended precedence. */

#define META_CODE(x)   ((x) & 0xffff0000u)
#define META_DATA(x)   ((x) & 0x0000ffffu)
#define META_DIFF(x,y) (((x)-(y))>>16)
116 
117 /* Forward declarations of functions, to allow mutual recursion */
118 
119 #ifdef SUPPORT_UNICODE
120 static unsigned int
121   add_list_to_class_internal(uint8_t *, PCRE2_UCHAR **, uint32_t,
122     compile_block *, const uint32_t *, unsigned int);
123 #endif
124 
125 static int
126   compile_regex(uint32_t, PCRE2_UCHAR **, uint32_t **, int *, uint32_t,
127     uint32_t *, int32_t *, uint32_t *, int32_t *, branch_chain *,
128     compile_block *, PCRE2_SIZE *);
129 
130 static int
131   get_branchlength(uint32_t **, int *, int *, parsed_recurse_check *,
132     compile_block *);
133 
134 static BOOL
135   set_lookbehind_lengths(uint32_t **, int *, int *, parsed_recurse_check *,
136     compile_block *);
137 
138 
139 
140 /*************************************************
141 *      Code parameters and static tables         *
142 *************************************************/
143 
144 #define MAX_GROUP_NUMBER   65535u
145 #define MAX_REPEAT_COUNT   65535u
146 #define REPEAT_UNLIMITED   (MAX_REPEAT_COUNT+1)
147 
148 /* COMPILE_WORK_SIZE specifies the size of stack workspace, which is used in
149 different ways in the different pattern scans. The parsing and group-
150 identifying pre-scan uses it to handle nesting, and needs it to be 16-bit
151 aligned for this. Having defined the size in code units, we set up
152 C16_WORK_SIZE as the number of elements in the 16-bit vector.
153 
154 During the first compiling phase, when determining how much memory is required,
155 the regex is partly compiled into this space, but the compiled parts are
156 discarded as soon as they can be, so that hopefully there will never be an
157 overrun. The code does, however, check for an overrun, which can occur for
158 pathological patterns. The size of the workspace depends on LINK_SIZE because
159 the length of compiled items varies with this.
160 
161 In the real compile phase, this workspace is not currently used. */
162 
163 #define COMPILE_WORK_SIZE (3000*LINK_SIZE)   /* Size in code units */
164 
165 #define C16_WORK_SIZE \
166   ((COMPILE_WORK_SIZE * sizeof(PCRE2_UCHAR))/sizeof(uint16_t))
167 
168 /* A uint32_t vector is used for caching information about the size of
169 capturing groups, to improve performance. A default is created on the stack of
170 this size. */
171 
172 #define GROUPINFO_DEFAULT_SIZE 256
173 
174 /* The overrun tests check for a slightly smaller size so that they detect the
175 overrun before it actually does run off the end of the data block. */
176 
177 #define WORK_SIZE_SAFETY_MARGIN (100)
178 
179 /* This value determines the size of the initial vector that is used for
180 remembering named groups during the pre-compile. It is allocated on the stack,
181 but if it is too small, it is expanded, in a similar way to the workspace. The
182 value is the number of slots in the list. */
183 
184 #define NAMED_GROUP_LIST_SIZE  20
185 
186 /* The pre-compiling pass over the pattern creates a parsed pattern in a vector
187 of uint32_t. For short patterns this lives on the stack, with this size. Heap
188 memory is used for longer patterns. */
189 
190 #define PARSED_PATTERN_DEFAULT_SIZE 1024
191 
192 /* Maximum length value to check against when making sure that the variable
193 that holds the compiled pattern length does not overflow. We make it a bit less
194 than INT_MAX to allow for adding in group terminating code units, so that we
195 don't have to check them every time. */
196 
197 #define OFLOW_MAX (INT_MAX - 20)
198 
199 /* Code values for parsed patterns, which are stored in a vector of 32-bit
200 unsigned ints. Values less than META_END are literal data values. The coding
201 for identifying the item is in the top 16-bits, leaving 16 bits for the
202 additional data that some of them need. The META_CODE, META_DATA, and META_DIFF
203 macros are used to manipulate parsed pattern elements.
204 
205 NOTE: When these definitions are changed, the table of extra lengths for each
206 code (meta_extra_lengths, just below) must be updated to remain in step. */
207 
208 #define META_END              0x80000000u  /* End of pattern */
209 
210 #define META_ALT              0x80010000u  /* alternation */
211 #define META_ATOMIC           0x80020000u  /* atomic group */
212 #define META_BACKREF          0x80030000u  /* Back ref */
213 #define META_BACKREF_BYNAME   0x80040000u  /* \k'name' */
214 #define META_BIGVALUE         0x80050000u  /* Next is a literal > META_END */
215 #define META_CALLOUT_NUMBER   0x80060000u  /* (?C with numerical argument */
216 #define META_CALLOUT_STRING   0x80070000u  /* (?C with string argument */
217 #define META_CAPTURE          0x80080000u  /* Capturing parenthesis */
218 #define META_CIRCUMFLEX       0x80090000u  /* ^ metacharacter */
219 #define META_CLASS            0x800a0000u  /* start non-empty class */
220 #define META_CLASS_EMPTY      0x800b0000u  /* empty class */
221 #define META_CLASS_EMPTY_NOT  0x800c0000u  /* negative empty class */
222 #define META_CLASS_END        0x800d0000u  /* end of non-empty class */
223 #define META_CLASS_NOT        0x800e0000u  /* start non-empty negative class */
224 #define META_COND_ASSERT      0x800f0000u  /* (?(?assertion)... */
225 #define META_COND_DEFINE      0x80100000u  /* (?(DEFINE)... */
226 #define META_COND_NAME        0x80110000u  /* (?(<name>)... */
227 #define META_COND_NUMBER      0x80120000u  /* (?(digits)... */
228 #define META_COND_RNAME       0x80130000u  /* (?(R&name)... */
229 #define META_COND_RNUMBER     0x80140000u  /* (?(Rdigits)... */
230 #define META_COND_VERSION     0x80150000u  /* (?(VERSION<op>x.y)... */
231 #define META_DOLLAR           0x80160000u  /* $ metacharacter */
232 #define META_DOT              0x80170000u  /* . metacharacter */
233 #define META_ESCAPE           0x80180000u  /* \d and friends */
234 #define META_KET              0x80190000u  /* closing parenthesis */
235 #define META_NOCAPTURE        0x801a0000u  /* no capture parens */
236 #define META_OPTIONS          0x801b0000u  /* (?i) and friends */
237 #define META_POSIX            0x801c0000u  /* POSIX class item */
238 #define META_POSIX_NEG        0x801d0000u  /* negative POSIX class item */
239 #define META_RANGE_ESCAPED    0x801e0000u  /* range with at least one escape */
240 #define META_RANGE_LITERAL    0x801f0000u  /* range defined literally */
241 #define META_RECURSE          0x80200000u  /* Recursion */
242 #define META_RECURSE_BYNAME   0x80210000u  /* (?&name) */
243 #define META_SCRIPT_RUN       0x80220000u  /* (*script_run:...) */
244 
245 /* These must be kept together to make it easy to check that an assertion
246 is present where expected in a conditional group. */
247 
248 #define META_LOOKAHEAD        0x80230000u  /* (?= */
249 #define META_LOOKAHEADNOT     0x80240000u  /* (?! */
250 #define META_LOOKBEHIND       0x80250000u  /* (?<= */
251 #define META_LOOKBEHINDNOT    0x80260000u  /* (?<! */
252 
253 /* These must be kept in this order, with consecutive values, and the _ARG
254 versions of COMMIT, PRUNE, SKIP, and THEN immediately after their non-argument
255 versions. */
256 
257 #define META_MARK             0x80270000u  /* (*MARK) */
258 #define META_ACCEPT           0x80280000u  /* (*ACCEPT) */
259 #define META_FAIL             0x80290000u  /* (*FAIL) */
260 #define META_COMMIT           0x802a0000u  /* These               */
261 #define META_COMMIT_ARG       0x802b0000u  /*   pairs             */
262 #define META_PRUNE            0x802c0000u  /*     must            */
263 #define META_PRUNE_ARG        0x802d0000u  /*       be            */
264 #define META_SKIP             0x802e0000u  /*         kept        */
265 #define META_SKIP_ARG         0x802f0000u  /*           in        */
266 #define META_THEN             0x80300000u  /*             this    */
267 #define META_THEN_ARG         0x80310000u  /*               order */
268 
269 /* These must be kept in groups of adjacent 3 values, and all together. */
270 
271 #define META_ASTERISK         0x80320000u  /* *  */
272 #define META_ASTERISK_PLUS    0x80330000u  /* *+ */
273 #define META_ASTERISK_QUERY   0x80340000u  /* *? */
274 #define META_PLUS             0x80350000u  /* +  */
275 #define META_PLUS_PLUS        0x80360000u  /* ++ */
276 #define META_PLUS_QUERY       0x80370000u  /* +? */
277 #define META_QUERY            0x80380000u  /* ?  */
278 #define META_QUERY_PLUS       0x80390000u  /* ?+ */
279 #define META_QUERY_QUERY      0x803a0000u  /* ?? */
280 #define META_MINMAX           0x803b0000u  /* {n,m}  repeat */
281 #define META_MINMAX_PLUS      0x803c0000u  /* {n,m}+ repeat */
282 #define META_MINMAX_QUERY     0x803d0000u  /* {n,m}? repeat */
283 
284 #define META_FIRST_QUANTIFIER META_ASTERISK
285 #define META_LAST_QUANTIFIER  META_MINMAX_QUERY
286 
287 /* This is a special "meta code" that is used only to distinguish (*asr: from
288 (*sr: in the table of alphabetic assertions. It is never stored in the parsed
289 pattern because (*asr: is turned into (*sr:(*atomic: at that stage. There is
290 therefore no need for it to have a length entry, so use a high value. */
291 
292 #define META_ATOMIC_SCRIPT_RUN 0x8fff0000u
293 
294 /* Table of extra lengths for each of the meta codes. Must be kept in step with
295 the definitions above. For some items these values are a basic length to which
296 a variable amount has to be added. */
297 
298 static unsigned char meta_extra_lengths[] = {
299   0,             /* META_END */
300   0,             /* META_ALT */
301   0,             /* META_ATOMIC */
302   0,             /* META_BACKREF - more if group is >= 10 */
303   1+SIZEOFFSET,  /* META_BACKREF_BYNAME */
304   1,             /* META_BIGVALUE */
305   3,             /* META_CALLOUT_NUMBER */
306   3+SIZEOFFSET,  /* META_CALLOUT_STRING */
307   0,             /* META_CAPTURE */
308   0,             /* META_CIRCUMFLEX */
309   0,             /* META_CLASS */
310   0,             /* META_CLASS_EMPTY */
311   0,             /* META_CLASS_EMPTY_NOT */
312   0,             /* META_CLASS_END */
313   0,             /* META_CLASS_NOT */
314   0,             /* META_COND_ASSERT */
315   SIZEOFFSET,    /* META_COND_DEFINE */
316   1+SIZEOFFSET,  /* META_COND_NAME */
317   1+SIZEOFFSET,  /* META_COND_NUMBER */
318   1+SIZEOFFSET,  /* META_COND_RNAME */
319   1+SIZEOFFSET,  /* META_COND_RNUMBER */
320   3,             /* META_COND_VERSION */
321   0,             /* META_DOLLAR */
322   0,             /* META_DOT */
323   0,             /* META_ESCAPE - more for ESC_P, ESC_p, ESC_g, ESC_k */
324   0,             /* META_KET */
325   0,             /* META_NOCAPTURE */
326   1,             /* META_OPTIONS */
327   1,             /* META_POSIX */
328   1,             /* META_POSIX_NEG */
329   0,             /* META_RANGE_ESCAPED */
330   0,             /* META_RANGE_LITERAL */
331   SIZEOFFSET,    /* META_RECURSE */
332   1+SIZEOFFSET,  /* META_RECURSE_BYNAME */
333   0,             /* META_SCRIPT_RUN */
334   0,             /* META_LOOKAHEAD */
335   0,             /* META_LOOKAHEADNOT */
336   SIZEOFFSET,    /* META_LOOKBEHIND */
337   SIZEOFFSET,    /* META_LOOKBEHINDNOT */
338   1,             /* META_MARK - plus the string length */
339   0,             /* META_ACCEPT */
340   0,             /* META_FAIL */
341   0,             /* META_COMMIT */
342   1,             /* META_COMMIT_ARG - plus the string length */
343   0,             /* META_PRUNE */
344   1,             /* META_PRUNE_ARG - plus the string length */
345   0,             /* META_SKIP */
346   1,             /* META_SKIP_ARG - plus the string length */
347   0,             /* META_THEN */
348   1,             /* META_THEN_ARG - plus the string length */
349   0,             /* META_ASTERISK */
350   0,             /* META_ASTERISK_PLUS */
351   0,             /* META_ASTERISK_QUERY */
352   0,             /* META_PLUS */
353   0,             /* META_PLUS_PLUS */
354   0,             /* META_PLUS_QUERY */
355   0,             /* META_QUERY */
356   0,             /* META_QUERY_PLUS */
357   0,             /* META_QUERY_QUERY */
358   2,             /* META_MINMAX */
359   2,             /* META_MINMAX_PLUS */
360   2              /* META_MINMAX_QUERY */
361 };
362 
363 /* Types for skipping parts of a parsed pattern. */
364 
365 enum { PSKIP_ALT, PSKIP_CLASS, PSKIP_KET };
366 
367 /* Macro for setting individual bits in class bitmaps. It took some
368 experimenting to figure out how to stop gcc 5.3.0 from warning with
369 -Wconversion. This version gets a warning:
370 
371   #define SETBIT(a,b) a[(b)/8] |= (uint8_t)(1u << ((b)&7))
372 
373 Let's hope the apparently less efficient version isn't actually so bad if the
374 compiler is clever with identical subexpressions. */
375 
/* Set bit number b in the byte-vector bitmap a (eight bits per byte). Both
parameters are parenthesized and both are evaluated more than once, so the
arguments must be free of side effects. */

#define SETBIT(a,b) (a)[(b)/8] = (uint8_t)((a)[(b)/8] | (1u << ((b)&7)))
377 
378 /* Private flags added to firstcu and reqcu. */
379 
380 #define REQ_CASELESS    (1u << 0)       /* Indicates caselessness */
381 #define REQ_VARY        (1u << 1)       /* reqcu followed non-literal item */
382 /* Negative values for the firstcu and reqcu flags */
383 #define REQ_UNSET       (-2)            /* Not yet found anything */
384 #define REQ_NONE        (-1)            /* Found not fixed char */
385 
386 /* These flags are used in the groupinfo vector. */
387 
388 #define GI_SET_FIXED_LENGTH    0x80000000u
389 #define GI_NOT_FIXED_LENGTH    0x40000000u
390 #define GI_FIXED_LENGTH_MASK   0x0000ffffu
391 
392 /* This simple test for a decimal digit works for both ASCII/Unicode and EBCDIC
393 and is fast (a good compiler can turn it into a subtraction and unsigned
394 comparison). */
395 
396 #define IS_DIGIT(x) ((x) >= CHAR_0 && (x) <= CHAR_9)
397 
398 /* Table to identify hex digits. The tables in chartables are dependent on the
399 locale, and may mark arbitrary characters as digits. We want to recognize only
400 0-9, a-f, and A-F as hex digits, which is why we have a private table here. It
401 costs 256 bytes, but it is a lot faster than doing character value tests (at
402 least in some simple cases I timed), and in some applications one wants PCRE2
403 to compile efficiently as well as match efficiently. The value in the table is
404 the binary hex digit value, or 0xff for non-hex digits. */
405 
406 /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
407 UTF-8 mode. */
408 
409 #ifndef EBCDIC
410 static const uint8_t xdigitab[] =
411   {
412   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*   0-  7 */
413   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*   8- 15 */
414   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  16- 23 */
415   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  24- 31 */
416   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*    - '  */
417   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  ( - /  */
418   0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, /*  0 - 7  */
419   0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff, /*  8 - ?  */
420   0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /*  @ - G  */
421   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  H - O  */
422   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  P - W  */
423   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  X - _  */
424   0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /*  ` - g  */
425   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  h - o  */
426   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  p - w  */
427   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  x -127 */
428   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 128-135 */
429   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 136-143 */
430   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 144-151 */
431   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 152-159 */
432   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 160-167 */
433   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 168-175 */
434   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 176-183 */
435   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 184-191 */
436   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 192-199 */
437   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 200-207 */
438   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 208-215 */
439   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 216-223 */
440   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 224-231 */
441   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 232-239 */
442   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 240-247 */
443   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff};/* 248-255 */
444 
445 #else
446 
447 /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
448 
449 static const uint8_t xdigitab[] =
450   {
451   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*   0-  7  0 */
452   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*   8- 15    */
453   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  16- 23 10 */
454   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  24- 31    */
455   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  32- 39 20 */
456   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  40- 47    */
457   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  48- 55 30 */
458   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  56- 63    */
459   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*    - 71 40 */
460   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  72- |     */
461   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  & - 87 50 */
462   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  88- 95    */
463   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  - -103 60 */
464   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 104- ?     */
465   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 112-119 70 */
466   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 120- "     */
467   0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /* 128- g  80 */
468   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  h -143    */
469   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 144- p  90 */
470   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  q -159    */
471   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 160- x  A0 */
472   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  y -175    */
473   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  ^ -183 B0 */
474   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 184-191    */
475   0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /*  { - G  C0 */
476   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  H -207    */
477   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  } - P  D0 */
478   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  Q -223    */
479   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  \ - X  E0 */
480   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  Y -239    */
481   0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, /*  0 - 7  F0 */
482   0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff};/*  8 -255    */
483 #endif  /* EBCDIC */
484 
485 
486 /* Table for handling alphanumeric escaped characters. Positive returns are
487 simple data values; negative values are for special things like \d and so on.
488 Zero means further processing is needed (for things like \x), or the escape is
489 invalid. */
490 
491 /* This is the "normal" table for ASCII systems or for EBCDIC systems running
492 in UTF-8 mode. It runs from '0' to 'z'. */
493 
494 #ifndef EBCDIC
495 #define ESCAPES_FIRST       CHAR_0
496 #define ESCAPES_LAST        CHAR_z
497 #define UPPER_CASE(c)       (c-32)
498 
499 static const short int escapes[] = {
500      0,                       0,
501      0,                       0,
502      0,                       0,
503      0,                       0,
504      0,                       0,
505      CHAR_COLON,              CHAR_SEMICOLON,
506      CHAR_LESS_THAN_SIGN,     CHAR_EQUALS_SIGN,
507      CHAR_GREATER_THAN_SIGN,  CHAR_QUESTION_MARK,
508      CHAR_COMMERCIAL_AT,      -ESC_A,
509      -ESC_B,                  -ESC_C,
510      -ESC_D,                  -ESC_E,
511      0,                       -ESC_G,
512      -ESC_H,                  0,
513      0,                       -ESC_K,
514      0,                       0,
515      -ESC_N,                  0,
516      -ESC_P,                  -ESC_Q,
517      -ESC_R,                  -ESC_S,
518      0,                       0,
519      -ESC_V,                  -ESC_W,
520      -ESC_X,                  0,
521      -ESC_Z,                  CHAR_LEFT_SQUARE_BRACKET,
522      CHAR_BACKSLASH,          CHAR_RIGHT_SQUARE_BRACKET,
523      CHAR_CIRCUMFLEX_ACCENT,  CHAR_UNDERSCORE,
524      CHAR_GRAVE_ACCENT,       CHAR_BEL,
525      -ESC_b,                  0,
526      -ESC_d,                  CHAR_ESC,
527      CHAR_FF,                 0,
528      -ESC_h,                  0,
529      0,                       -ESC_k,
530      0,                       0,
531      CHAR_LF,                 0,
532      -ESC_p,                  0,
533      CHAR_CR,                 -ESC_s,
534      CHAR_HT,                 0,
535      -ESC_v,                  -ESC_w,
536      0,                       0,
537      -ESC_z
538 };
539 
540 #else
541 
542 /* This is the "abnormal" table for EBCDIC systems without UTF-8 support.
543 It runs from 'a' to '9'. For some minimal testing of EBCDIC features, the code
544 is sometimes compiled on an ASCII system. In this case, we must not use CHAR_a
545 because it is defined as 'a', which of course picks up the ASCII value. */
546 
547 #if 'a' == 0x81                    /* Check for a real EBCDIC environment */
548 #define ESCAPES_FIRST       CHAR_a
549 #define ESCAPES_LAST        CHAR_9
550 #define UPPER_CASE(c)       (c+64)
551 #else                              /* Testing in an ASCII environment */
552 #define ESCAPES_FIRST  ((unsigned char)'\x81')   /* EBCDIC 'a' */
553 #define ESCAPES_LAST   ((unsigned char)'\xf9')   /* EBCDIC '9' */
554 #define UPPER_CASE(c)  (c-32)
555 #endif
556 
557 static const short int escapes[] = {
558 /*  80 */         CHAR_BEL, -ESC_b,       0, -ESC_d, CHAR_ESC, CHAR_FF,      0,
559 /*  88 */ -ESC_h,        0,      0,     '{',      0,        0,       0,      0,
560 /*  90 */      0,        0, -ESC_k,       0,      0,  CHAR_LF,       0, -ESC_p,
561 /*  98 */      0,  CHAR_CR,      0,     '}',      0,        0,       0,      0,
562 /*  A0 */      0,      '~', -ESC_s, CHAR_HT,      0,   -ESC_v,  -ESC_w,      0,
563 /*  A8 */      0,   -ESC_z,      0,       0,      0,      '[',       0,      0,
564 /*  B0 */      0,        0,      0,       0,      0,        0,       0,      0,
565 /*  B8 */      0,        0,      0,       0,      0,      ']',     '=',    '-',
566 /*  C0 */    '{',   -ESC_A, -ESC_B,  -ESC_C, -ESC_D,   -ESC_E,       0, -ESC_G,
567 /*  C8 */ -ESC_H,        0,      0,       0,      0,        0,       0,      0,
568 /*  D0 */    '}',        0, -ESC_K,       0,      0,   -ESC_N,       0, -ESC_P,
569 /*  D8 */ -ESC_Q,   -ESC_R,      0,       0,      0,        0,       0,      0,
570 /*  E0 */   '\\',        0, -ESC_S,       0,      0,   -ESC_V,  -ESC_W, -ESC_X,
571 /*  E8 */      0,   -ESC_Z,      0,       0,      0,        0,       0,      0,
572 /*  F0 */      0,        0,      0,       0,      0,        0,       0,      0,
573 /*  F8 */      0,        0
574 };
575 
576 /* We also need a table of characters that may follow \c in an EBCDIC
577 environment for characters 0-31. */
578 
/* Lookup string of 32 characters plus the terminating zero; const-qualified
so that an accidental write is rejected at compile time. */

static const unsigned char ebcdic_escape_c[] = "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_";
580 
581 #endif   /* EBCDIC */
582 
583 
584 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
585 searched linearly. Put all the names into a single string, in order to reduce
586 the number of relocations when a shared library is dynamically linked. The
587 string is built from string macros so that it works in UTF-8 mode on EBCDIC
588 platforms. */
589 
590 typedef struct verbitem {
591   unsigned int len;          /* Length of verb name */
592   uint32_t meta;             /* Base META_ code */
593   int has_arg;               /* Argument requirement */
594 } verbitem;
595 
596 static const char verbnames[] =
597   "\0"                       /* Empty name is a shorthand for MARK */
598   STRING_MARK0
599   STRING_ACCEPT0
600   STRING_F0
601   STRING_FAIL0
602   STRING_COMMIT0
603   STRING_PRUNE0
604   STRING_SKIP0
605   STRING_THEN;
606 
607 static const verbitem verbs[] = {
608   { 0, META_MARK,   +1 },  /* > 0 => must have an argument */
609   { 4, META_MARK,   +1 },
610   { 6, META_ACCEPT, -1 },  /* < 0 => Optional argument, convert to pre-MARK */
611   { 1, META_FAIL,   -1 },
612   { 4, META_FAIL,   -1 },
613   { 6, META_COMMIT,  0 },
614   { 5, META_PRUNE,   0 },  /* Optional argument; bump META code if found */
615   { 4, META_SKIP,    0 },
616   { 4, META_THEN,    0 }
617 };
618 
619 static const int verbcount = sizeof(verbs)/sizeof(verbitem);
620 
621 /* Verb opcodes, indexed by their META code offset from META_MARK. */
622 
623 static const uint32_t verbops[] = {
624   OP_MARK, OP_ACCEPT, OP_FAIL, OP_COMMIT, OP_COMMIT_ARG, OP_PRUNE,
625   OP_PRUNE_ARG, OP_SKIP, OP_SKIP_ARG, OP_THEN, OP_THEN_ARG };
626 
627 /* Table of "alpha assertions" like (*pla:...), similar to the (*VERB) table. */
628 
629 typedef struct alasitem {
630   unsigned int len;          /* Length of name */
631   uint32_t meta;             /* Base META_ code */
632 } alasitem;
633 
634 static const char alasnames[] =
635   STRING_pla0
636   STRING_plb0
637   STRING_nla0
638   STRING_nlb0
639   STRING_positive_lookahead0
640   STRING_positive_lookbehind0
641   STRING_negative_lookahead0
642   STRING_negative_lookbehind0
643   STRING_atomic0
644   STRING_sr0
645   STRING_asr0
646   STRING_script_run0
647   STRING_atomic_script_run;
648 
649 static const alasitem alasmeta[] = {
650   {  3, META_LOOKAHEAD         },
651   {  3, META_LOOKBEHIND        },
652   {  3, META_LOOKAHEADNOT      },
653   {  3, META_LOOKBEHINDNOT     },
654   { 18, META_LOOKAHEAD         },
655   { 19, META_LOOKBEHIND        },
656   { 18, META_LOOKAHEADNOT      },
657   { 19, META_LOOKBEHINDNOT     },
658   {  6, META_ATOMIC            },
659   {  2, META_SCRIPT_RUN        }, /* sr = script run */
660   {  3, META_ATOMIC_SCRIPT_RUN }, /* asr = atomic script run */
661   { 10, META_SCRIPT_RUN        }, /* script run */
662   { 17, META_ATOMIC_SCRIPT_RUN }  /* atomic script run */
663 };
664 
665 static const int alascount = sizeof(alasmeta)/sizeof(alasitem);
666 
667 /* Offsets from OP_STAR for case-independent and negative repeat opcodes. */
668 
669 static uint32_t chartypeoffset[] = {
670   OP_STAR - OP_STAR,    OP_STARI - OP_STAR,
671   OP_NOTSTAR - OP_STAR, OP_NOTSTARI - OP_STAR };
672 
673 /* Tables of names of POSIX character classes and their lengths. The names are
674 now all in a single string, to reduce the number of relocations when a shared
675 library is dynamically loaded. The list of lengths is terminated by a zero
676 length entry. The first three must be alpha, lower, upper, as this is assumed
677 for handling case independence. The indices for graph, print, and punct are
678 needed, so identify them. */
679 
680 static const char posix_names[] =
681   STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
682   STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
683   STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
684   STRING_word0  STRING_xdigit;
685 
/* Length of each POSIX class name, in list order, ending with a zero entry
that marks the end of the list. */

static const uint8_t posix_name_lengths[] = {
  5,   /* alpha */
  5,   /* lower */
  5,   /* upper */
  5,   /* alnum */
  5,   /* ascii */
  5,   /* blank */
  5,   /* cntrl */
  5,   /* digit */
  5,   /* graph */
  5,   /* print */
  5,   /* punct */
  5,   /* space */
  4,   /* word */
  6,   /* xdigit */
  0    /* end of list */
};

/* List indices of the graph, print, and punct classes. */

#define PC_GRAPH  8
#define PC_PRINT  9
#define PC_PUNCT 10
692 
693 /* Table of class bit maps for each POSIX class. Each class is formed from a
694 base map, with an optional addition or removal of another map. Then, for some
695 classes, there is some additional tweaking: for [:blank:] the vertical space
696 characters are removed, and for [:alpha:] and [:alnum:] the underscore
697 character is removed. The triples in the table consist of the base map offset,
698 second map offset or -1 if no second map, and a non-negative value for map
699 addition or a negative value for map subtraction (if there are two maps). The
700 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
701 remove vertical space characters, 2 => remove underscore. */
702 
703 static const int posix_class_maps[] = {
704   cbit_word,  cbit_digit, -2,             /* alpha */
705   cbit_lower, -1,          0,             /* lower */
706   cbit_upper, -1,          0,             /* upper */
707   cbit_word,  -1,          2,             /* alnum - word without underscore */
708   cbit_print, cbit_cntrl,  0,             /* ascii */
709   cbit_space, -1,          1,             /* blank - a GNU extension */
710   cbit_cntrl, -1,          0,             /* cntrl */
711   cbit_digit, -1,          0,             /* digit */
712   cbit_graph, -1,          0,             /* graph */
713   cbit_print, -1,          0,             /* print */
714   cbit_punct, -1,          0,             /* punct */
715   cbit_space, -1,          0,             /* space */
716   cbit_word,  -1,          0,             /* word - a Perl extension */
717   cbit_xdigit,-1,          0              /* xdigit */
718 };
719 
720 #ifdef SUPPORT_UNICODE
721 
722 /* The POSIX class Unicode property substitutes that are used in UCP mode must
723 be in the order of the POSIX class names, defined above. */
724 
725 static int posix_substitutes[] = {
726   PT_GC, ucp_L,     /* alpha */
727   PT_PC, ucp_Ll,    /* lower */
728   PT_PC, ucp_Lu,    /* upper */
729   PT_ALNUM, 0,      /* alnum */
730   -1, 0,            /* ascii, treat as non-UCP */
731   -1, 1,            /* blank, treat as \h */
732   PT_PC, ucp_Cc,    /* cntrl */
733   PT_PC, ucp_Nd,    /* digit */
734   PT_PXGRAPH, 0,    /* graph */
735   PT_PXPRINT, 0,    /* print */
736   PT_PXPUNCT, 0,    /* punct */
737   PT_PXSPACE, 0,    /* space */   /* Xps is POSIX space, but from 8.34 */
738   PT_WORD, 0,       /* word  */   /* Perl and POSIX space are the same */
739   -1, 0             /* xdigit, treat as non-UCP */
740 };
741 #define POSIX_SUBSIZE (sizeof(posix_substitutes) / (2*sizeof(uint32_t)))
742 #endif  /* SUPPORT_UNICODE */
743 
744 /* Masks for checking option settings. When PCRE2_LITERAL is set, only a subset
745 are allowed. */
746 
/* Main option bits that remain valid together with PCRE2_LITERAL. */

#define PUBLIC_LITERAL_COMPILE_OPTIONS \
  (PCRE2_ANCHORED| \
   PCRE2_AUTO_CALLOUT| \
   PCRE2_CASELESS| \
   PCRE2_ENDANCHORED| \
   PCRE2_FIRSTLINE| \
   PCRE2_LITERAL| \
   PCRE2_NO_START_OPTIMIZE| \
   PCRE2_NO_UTF_CHECK| \
   PCRE2_USE_OFFSET_LIMIT| \
   PCRE2_UTF)

/* The literal-compatible bits plus every other main option bit. */

#define PUBLIC_COMPILE_OPTIONS \
  (PUBLIC_LITERAL_COMPILE_OPTIONS| \
   PCRE2_ALLOW_EMPTY_CLASS| \
   PCRE2_ALT_BSUX| \
   PCRE2_ALT_CIRCUMFLEX| \
   PCRE2_ALT_VERBNAMES| \
   PCRE2_DOLLAR_ENDONLY| \
   PCRE2_DOTALL| \
   PCRE2_DUPNAMES| \
   PCRE2_EXTENDED| \
   PCRE2_EXTENDED_MORE| \
   PCRE2_MATCH_UNSET_BACKREF| \
   PCRE2_MULTILINE| \
   PCRE2_NEVER_BACKSLASH_C| \
   PCRE2_NEVER_UCP| \
   PCRE2_NEVER_UTF| \
   PCRE2_NO_AUTO_CAPTURE| \
   PCRE2_NO_AUTO_POSSESS| \
   PCRE2_NO_DOTSTAR_ANCHOR| \
   PCRE2_UCP| \
   PCRE2_UNGREEDY)

/* Extra option bits that remain valid together with PCRE2_LITERAL. */

#define PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS \
  (PCRE2_EXTRA_MATCH_LINE| \
   PCRE2_EXTRA_MATCH_WORD)

/* The literal-compatible extra bits plus every other extra option bit. */

#define PUBLIC_COMPILE_EXTRA_OPTIONS \
  (PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS| \
   PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES| \
   PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL| \
   PCRE2_EXTRA_ESCAPED_CR_IS_LF| \
   PCRE2_EXTRA_ALT_BSUX)
768 
769 /* Compile time error code numbers. They are given names so that they can more
770 easily be tracked. When a new number is added, the tables called eint1 and
771 eint2 in pcre2posix.c may need to be updated, and a new error text must be
772 added to compile_error_texts in pcre2_error.c. */
773 
774 enum { ERR0 = COMPILE_ERROR_BASE,
775        ERR1,  ERR2,  ERR3,  ERR4,  ERR5,  ERR6,  ERR7,  ERR8,  ERR9,  ERR10,
776        ERR11, ERR12, ERR13, ERR14, ERR15, ERR16, ERR17, ERR18, ERR19, ERR20,
777        ERR21, ERR22, ERR23, ERR24, ERR25, ERR26, ERR27, ERR28, ERR29, ERR30,
778        ERR31, ERR32, ERR33, ERR34, ERR35, ERR36, ERR37, ERR38, ERR39, ERR40,
779        ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49, ERR50,
780        ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59, ERR60,
781        ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68, ERR69, ERR70,
782        ERR71, ERR72, ERR73, ERR74, ERR75, ERR76, ERR77, ERR78, ERR79, ERR80,
783        ERR81, ERR82, ERR83, ERR84, ERR85, ERR86, ERR87, ERR88, ERR89, ERR90,
784        ERR91, ERR92, ERR93, ERR94, ERR95, ERR96 };
785 
786 /* This is a table of start-of-pattern options such as (*UTF) and settings such
787 as (*LIMIT_MATCH=nnnn) and (*CRLF). For completeness and backward
788 compatibility, (*UTFn) is supported in the relevant libraries, but (*UTF) is
789 generic and always supported. */
790 
/* Kinds of start-of-pattern item; the kind says how the item's value field
is to be interpreted. */

enum {
  PSO_OPT = 0,   /* Value is an option bit */
  PSO_FLG,       /* Value is a flag bit */
  PSO_NL,        /* Value is a newline type */
  PSO_BSR,       /* Value is a \R type */
  PSO_LIMH,      /* Read integer value for heap limit */
  PSO_LIMM,      /* Read integer value for match limit */
  PSO_LIMD       /* Read integer value for depth limit */
};
798 
/* Describes one start-of-pattern item. */

typedef struct pso {
  const uint8_t *name;    /* Text of the item */
  uint16_t       length;  /* Length of the text */
  uint16_t       type;    /* One of the PSO_ values */
  uint32_t       value;   /* Meaning depends on type */
} pso;
805 
806 /* NB: STRING_UTFn_RIGHTPAR contains the length as well */
807 
808 static pso pso_list[] = {
809   { (uint8_t *)STRING_UTFn_RIGHTPAR,                  PSO_OPT, PCRE2_UTF },
810   { (uint8_t *)STRING_UTF_RIGHTPAR,                4, PSO_OPT, PCRE2_UTF },
811   { (uint8_t *)STRING_UCP_RIGHTPAR,                4, PSO_OPT, PCRE2_UCP },
812   { (uint8_t *)STRING_NOTEMPTY_RIGHTPAR,           9, PSO_FLG, PCRE2_NOTEMPTY_SET },
813   { (uint8_t *)STRING_NOTEMPTY_ATSTART_RIGHTPAR,  17, PSO_FLG, PCRE2_NE_ATST_SET },
814   { (uint8_t *)STRING_NO_AUTO_POSSESS_RIGHTPAR,   16, PSO_OPT, PCRE2_NO_AUTO_POSSESS },
815   { (uint8_t *)STRING_NO_DOTSTAR_ANCHOR_RIGHTPAR, 18, PSO_OPT, PCRE2_NO_DOTSTAR_ANCHOR },
816   { (uint8_t *)STRING_NO_JIT_RIGHTPAR,             7, PSO_FLG, PCRE2_NOJIT },
817   { (uint8_t *)STRING_NO_START_OPT_RIGHTPAR,      13, PSO_OPT, PCRE2_NO_START_OPTIMIZE },
818   { (uint8_t *)STRING_LIMIT_HEAP_EQ,              11, PSO_LIMH, 0 },
819   { (uint8_t *)STRING_LIMIT_MATCH_EQ,             12, PSO_LIMM, 0 },
820   { (uint8_t *)STRING_LIMIT_DEPTH_EQ,             12, PSO_LIMD, 0 },
821   { (uint8_t *)STRING_LIMIT_RECURSION_EQ,         16, PSO_LIMD, 0 },
822   { (uint8_t *)STRING_CR_RIGHTPAR,                 3, PSO_NL,  PCRE2_NEWLINE_CR },
823   { (uint8_t *)STRING_LF_RIGHTPAR,                 3, PSO_NL,  PCRE2_NEWLINE_LF },
824   { (uint8_t *)STRING_CRLF_RIGHTPAR,               5, PSO_NL,  PCRE2_NEWLINE_CRLF },
825   { (uint8_t *)STRING_ANY_RIGHTPAR,                4, PSO_NL,  PCRE2_NEWLINE_ANY },
826   { (uint8_t *)STRING_NUL_RIGHTPAR,                4, PSO_NL,  PCRE2_NEWLINE_NUL },
827   { (uint8_t *)STRING_ANYCRLF_RIGHTPAR,            8, PSO_NL,  PCRE2_NEWLINE_ANYCRLF },
828   { (uint8_t *)STRING_BSR_ANYCRLF_RIGHTPAR,       12, PSO_BSR, PCRE2_BSR_ANYCRLF },
829   { (uint8_t *)STRING_BSR_UNICODE_RIGHTPAR,       12, PSO_BSR, PCRE2_BSR_UNICODE }
830 };
831 
832 /* This table is used when converting repeating opcodes into possessified
833 versions as a result of an explicit possessive quantifier such as ++. A zero
834 value means there is no possessified version - in those cases the item in
835 question must be wrapped in ONCE brackets. The table is truncated at OP_CALLOUT
836 because all relevant opcodes are less than that. */
837 
838 static const uint8_t opcode_possessify[] = {
839   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0 - 15  */
840   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 16 - 31 */
841 
842   0,                       /* NOTI */
843   OP_POSSTAR, 0,           /* STAR, MINSTAR */
844   OP_POSPLUS, 0,           /* PLUS, MINPLUS */
845   OP_POSQUERY, 0,          /* QUERY, MINQUERY */
846   OP_POSUPTO, 0,           /* UPTO, MINUPTO */
847   0,                       /* EXACT */
848   0, 0, 0, 0,              /* POS{STAR,PLUS,QUERY,UPTO} */
849 
850   OP_POSSTARI, 0,          /* STARI, MINSTARI */
851   OP_POSPLUSI, 0,          /* PLUSI, MINPLUSI */
852   OP_POSQUERYI, 0,         /* QUERYI, MINQUERYI */
853   OP_POSUPTOI, 0,          /* UPTOI, MINUPTOI */
854   0,                       /* EXACTI */
855   0, 0, 0, 0,              /* POS{STARI,PLUSI,QUERYI,UPTOI} */
856 
857   OP_NOTPOSSTAR, 0,        /* NOTSTAR, NOTMINSTAR */
858   OP_NOTPOSPLUS, 0,        /* NOTPLUS, NOTMINPLUS */
859   OP_NOTPOSQUERY, 0,       /* NOTQUERY, NOTMINQUERY */
860   OP_NOTPOSUPTO, 0,        /* NOTUPTO, NOTMINUPTO */
861   0,                       /* NOTEXACT */
862   0, 0, 0, 0,              /* NOTPOS{STAR,PLUS,QUERY,UPTO} */
863 
864   OP_NOTPOSSTARI, 0,       /* NOTSTARI, NOTMINSTARI */
865   OP_NOTPOSPLUSI, 0,       /* NOTPLUSI, NOTMINPLUSI */
866   OP_NOTPOSQUERYI, 0,      /* NOTQUERYI, NOTMINQUERYI */
867   OP_NOTPOSUPTOI, 0,       /* NOTUPTOI, NOTMINUPTOI */
868   0,                       /* NOTEXACTI */
869   0, 0, 0, 0,              /* NOTPOS{STARI,PLUSI,QUERYI,UPTOI} */
870 
871   OP_TYPEPOSSTAR, 0,       /* TYPESTAR, TYPEMINSTAR */
872   OP_TYPEPOSPLUS, 0,       /* TYPEPLUS, TYPEMINPLUS */
873   OP_TYPEPOSQUERY, 0,      /* TYPEQUERY, TYPEMINQUERY */
874   OP_TYPEPOSUPTO, 0,       /* TYPEUPTO, TYPEMINUPTO */
875   0,                       /* TYPEEXACT */
876   0, 0, 0, 0,              /* TYPEPOS{STAR,PLUS,QUERY,UPTO} */
877 
878   OP_CRPOSSTAR, 0,         /* CRSTAR, CRMINSTAR */
879   OP_CRPOSPLUS, 0,         /* CRPLUS, CRMINPLUS */
880   OP_CRPOSQUERY, 0,        /* CRQUERY, CRMINQUERY */
881   OP_CRPOSRANGE, 0,        /* CRRANGE, CRMINRANGE */
882   0, 0, 0, 0,              /* CRPOS{STAR,PLUS,QUERY,RANGE} */
883 
884   0, 0, 0,                 /* CLASS, NCLASS, XCLASS */
885   0, 0,                    /* REF, REFI */
886   0, 0,                    /* DNREF, DNREFI */
887   0, 0                     /* RECURSE, CALLOUT */
888 };
889 
890 
891 #ifdef DEBUG_SHOW_PARSED
892 /*************************************************
893 *     Show the parsed pattern for debugging      *
894 *************************************************/
895 
896 /* For debugging the pre-scan, this code, which outputs the parsed data vector,
897 can be enabled. */
898 
static void show_parsed(compile_block *cb)
{
uint32_t *pptr = cb->parsed_pattern;

/* Write one line to stderr for each item in the parsed pattern vector,
stopping after META_END or at the first META code that is not recognized.
Every value handed to fprintf() is cast to the type that its conversion
specifier expects; offsets are of type PCRE2_SIZE and are printed with %zu. */

for (;;)
  {
  int max, min;
  PCRE2_SIZE offset;
  uint32_t i;
  uint32_t length;
  uint32_t meta_arg = META_DATA(*pptr);
  const char *suffix;

  fprintf(stderr, "+++ %02d %.8x ", (int)(pptr - cb->parsed_pattern),
    (unsigned int)*pptr);

  /* Values below META_END are literal data; only the visible ASCII ones are
  echoed as characters. */

  if (*pptr < META_END)
    {
    if (*pptr > 32 && *pptr < 128) fprintf(stderr, "%c", (int)*pptr);
    pptr++;
    }

  else switch (META_CODE(*pptr++))
    {
    default:
    fprintf(stderr, "**** OOPS - unknown META value - giving up ****\n");
    return;

    case META_END:
    fprintf(stderr, "META_END\n");
    return;

    case META_CAPTURE:
    fprintf(stderr, "META_CAPTURE %d", (int)meta_arg);
    break;

    case META_RECURSE:
    GETOFFSET(offset, pptr);
    fprintf(stderr, "META_RECURSE %d %zu", (int)meta_arg, offset);
    break;

    /* For reference numbers below 10 the offset comes from the compile
    block; otherwise it follows in the vector. */

    case META_BACKREF:
    if (meta_arg < 10)
      offset = cb->small_ref_offset[meta_arg];
    else
      GETOFFSET(offset, pptr);
    fprintf(stderr, "META_BACKREF %d %zu", (int)meta_arg, offset);
    break;

    case META_ESCAPE:
    if (meta_arg == ESC_P || meta_arg == ESC_p)
      {
      uint32_t ptype = *pptr >> 16;
      uint32_t pvalue = *pptr++ & 0xffff;
      fprintf(stderr, "META \\%c %d %d", (meta_arg == ESC_P)? 'P':'p',
        (int)ptype, (int)pvalue);
      }
    else
      {
      uint32_t cc;
      /* There's just one escape we might have here that isn't negated in the
      escapes table. */
      if (meta_arg == ESC_g) cc = CHAR_g;
      else for (cc = ESCAPES_FIRST; cc <= ESCAPES_LAST; cc++)
        {
        if (meta_arg == (uint32_t)(-escapes[cc - ESCAPES_FIRST])) break;
        }
      if (cc > ESCAPES_LAST) cc = CHAR_QUESTION_MARK;
      fprintf(stderr, "META \\%c", (int)cc);
      }
    break;

    /* The three {min,max} forms differ only in the character, if any, that
    follows the closing brace. */

    case META_MINMAX:
    suffix = "";
    goto SHOWMINMAX;

    case META_MINMAX_QUERY:
    suffix = "?";
    goto SHOWMINMAX;

    case META_MINMAX_PLUS:
    suffix = "+";
    SHOWMINMAX:
    min = *pptr++;
    max = *pptr++;
    if (max != REPEAT_UNLIMITED)
      fprintf(stderr, "META {%d,%d}%s", min, max, suffix);
    else
      fprintf(stderr, "META {%d,}%s", min, suffix);
    break;

    case META_BIGVALUE:
    fprintf(stderr, "META_BIGVALUE %.8x", (unsigned int)*pptr++);
    break;

    case META_CIRCUMFLEX: fprintf(stderr, "META_CIRCUMFLEX"); break;
    case META_COND_ASSERT: fprintf(stderr, "META_COND_ASSERT"); break;
    case META_DOLLAR: fprintf(stderr, "META_DOLLAR"); break;
    case META_DOT: fprintf(stderr, "META_DOT"); break;
    case META_ASTERISK: fprintf(stderr, "META *"); break;
    case META_ASTERISK_QUERY: fprintf(stderr, "META *?"); break;
    case META_ASTERISK_PLUS: fprintf(stderr, "META *+"); break;
    case META_PLUS: fprintf(stderr, "META +"); break;
    case META_PLUS_QUERY: fprintf(stderr, "META +?"); break;
    case META_PLUS_PLUS: fprintf(stderr, "META ++"); break;
    case META_QUERY: fprintf(stderr, "META ?"); break;
    case META_QUERY_QUERY: fprintf(stderr, "META ??"); break;
    case META_QUERY_PLUS: fprintf(stderr, "META ?+"); break;

    case META_ATOMIC: fprintf(stderr, "META (?>"); break;
    case META_NOCAPTURE: fprintf(stderr, "META (?:"); break;
    case META_LOOKAHEAD: fprintf(stderr, "META (?="); break;
    case META_LOOKAHEADNOT: fprintf(stderr, "META (?!"); break;
    case META_SCRIPT_RUN: fprintf(stderr, "META (*sr:"); break;
    case META_KET: fprintf(stderr, "META )"); break;
    case META_ALT: fprintf(stderr, "META | %d", (int)meta_arg); break;

    case META_CLASS: fprintf(stderr, "META ["); break;
    case META_CLASS_NOT: fprintf(stderr, "META [^"); break;
    case META_CLASS_END: fprintf(stderr, "META ]"); break;
    case META_CLASS_EMPTY: fprintf(stderr, "META []"); break;
    case META_CLASS_EMPTY_NOT: fprintf(stderr, "META [^]"); break;

    case META_RANGE_LITERAL: fprintf(stderr, "META - (literal)"); break;
    case META_RANGE_ESCAPED: fprintf(stderr, "META - (escaped)"); break;

    case META_POSIX:
    fprintf(stderr, "META_POSIX %d", (int)*pptr++);
    break;

    case META_POSIX_NEG:
    fprintf(stderr, "META_POSIX_NEG %d", (int)*pptr++);
    break;

    case META_ACCEPT: fprintf(stderr, "META (*ACCEPT)"); break;
    case META_FAIL: fprintf(stderr, "META (*FAIL)"); break;
    case META_COMMIT: fprintf(stderr, "META (*COMMIT)"); break;
    case META_PRUNE: fprintf(stderr, "META (*PRUNE)"); break;
    case META_SKIP: fprintf(stderr, "META (*SKIP)"); break;
    case META_THEN: fprintf(stderr, "META (*THEN)"); break;

    case META_OPTIONS:
    fprintf(stderr, "META_OPTIONS 0x%02x", (unsigned int)*pptr++);
    break;

    case META_LOOKBEHIND:
    fprintf(stderr, "META (?<= %d offset=", (int)meta_arg);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    case META_LOOKBEHINDNOT:
    fprintf(stderr, "META (?<! %d offset=", (int)meta_arg);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    /* Three data items follow: the third is shown as the callout number and
    the first two as the "next" pair. */

    case META_CALLOUT_NUMBER:
    fprintf(stderr, "META (?C%d) next=%d/%d", (int)pptr[2], (int)pptr[0],
       (int)pptr[1]);
    pptr += 3;
    break;

    case META_CALLOUT_STRING:
      {
      uint32_t patoffset = *pptr++;    /* Offset of next pattern item */
      uint32_t patlength = *pptr++;    /* Length of next pattern item */
      fprintf(stderr, "META (?Cstring) length=%d offset=", (int)*pptr++);
      GETOFFSET(offset, pptr);
      fprintf(stderr, "%zu next=%d/%d", offset, (int)patoffset,
        (int)patlength);
      }
    break;

    case META_RECURSE_BYNAME:
    fprintf(stderr, "META (?(&name) length=%d offset=", (int)*pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    case META_BACKREF_BYNAME:
    fprintf(stderr, "META_BACKREF_BYNAME length=%d offset=", (int)*pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    /* The number is stored after the offset, so it is fetched by index
    before the offset is read, and stepped over afterwards. */

    case META_COND_NUMBER:
    fprintf(stderr, "META_COND_NUMBER %d offset=", (int)pptr[SIZEOFFSET]);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    pptr++;
    break;

    case META_COND_DEFINE:
    fprintf(stderr, "META (?(DEFINE) offset=");
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    case META_COND_VERSION:
    fprintf(stderr, "META (?(VERSION%s", (*pptr++ == 0)? "=" : ">=");
    fprintf(stderr, "%d.", (int)*pptr++);
    fprintf(stderr, "%d)", (int)*pptr++);
    break;

    case META_COND_NAME:
    fprintf(stderr, "META (?(<name>) length=%d offset=", (int)*pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    case META_COND_RNAME:
    fprintf(stderr, "META (?(R&name) length=%d offset=", (int)*pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    /* This is kept as a name, because it might be. */

    case META_COND_RNUMBER:
    fprintf(stderr, "META (?(Rnumber) length=%d offset=", (int)*pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    /* The verbs with arguments share the code that prints the argument: a
    length followed by that many code units. */

    case META_MARK:
    fprintf(stderr, "META (*MARK:");
    goto SHOWARG;

    case META_COMMIT_ARG:
    fprintf(stderr, "META (*COMMIT:");
    goto SHOWARG;

    case META_PRUNE_ARG:
    fprintf(stderr, "META (*PRUNE:");
    goto SHOWARG;

    case META_SKIP_ARG:
    fprintf(stderr, "META (*SKIP:");
    goto SHOWARG;

    case META_THEN_ARG:
    fprintf(stderr, "META (*THEN:");
    SHOWARG:
    length = *pptr++;
    for (i = 0; i < length; i++)
      {
      uint32_t cc = *pptr++;
      if (cc > 32 && cc < 128) fprintf(stderr, "%c", (int)cc);
        else fprintf(stderr, "\\x{%x}", (unsigned int)cc);
      }
    fprintf(stderr, ") length=%u", (unsigned int)length);
    break;
    }
  fprintf(stderr, "\n");
  }
}
1152 #endif  /* DEBUG_SHOW_PARSED */
1153 
1154 
1155 
1156 /*************************************************
1157 *               Copy compiled code               *
1158 *************************************************/
1159 
1160 /* Compiled JIT code cannot be copied, so the new compiled block has no
1161 associated JIT data. */
1162 
1163 PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
pcre2_code_copy(const pcre2_code * code)1164 pcre2_code_copy(const pcre2_code *code)
1165 {
1166 PCRE2_SIZE* ref_count;
1167 pcre2_code *newcode;
1168 
1169 if (code == NULL) return NULL;
1170 newcode = code->memctl.malloc(code->blocksize, code->memctl.memory_data);
1171 if (newcode == NULL) return NULL;
1172 memcpy(newcode, code, code->blocksize);
1173 newcode->executable_jit = NULL;
1174 
1175 /* If the code is one that has been deserialized, increment the reference count
1176 in the decoded tables. */
1177 
1178 if ((code->flags & PCRE2_DEREF_TABLES) != 0)
1179   {
1180   ref_count = (PCRE2_SIZE *)(code->tables + tables_length);
1181   (*ref_count)++;
1182   }
1183 
1184 return newcode;
1185 }
1186 
1187 
1188 
1189 /*************************************************
1190 *     Copy compiled code and character tables    *
1191 *************************************************/
1192 
1193 /* Compiled JIT code cannot be copied, so the new compiled block has no
1194 associated JIT data. This version of code_copy also makes a separate copy of
1195 the character tables. */
1196 
1197 PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
pcre2_code_copy_with_tables(const pcre2_code * code)1198 pcre2_code_copy_with_tables(const pcre2_code *code)
1199 {
1200 PCRE2_SIZE* ref_count;
1201 pcre2_code *newcode;
1202 uint8_t *newtables;
1203 
1204 if (code == NULL) return NULL;
1205 newcode = code->memctl.malloc(code->blocksize, code->memctl.memory_data);
1206 if (newcode == NULL) return NULL;
1207 memcpy(newcode, code, code->blocksize);
1208 newcode->executable_jit = NULL;
1209 
1210 newtables = code->memctl.malloc(tables_length + sizeof(PCRE2_SIZE),
1211   code->memctl.memory_data);
1212 if (newtables == NULL)
1213   {
1214   code->memctl.free((void *)newcode, code->memctl.memory_data);
1215   return NULL;
1216   }
1217 memcpy(newtables, code->tables, tables_length);
1218 ref_count = (PCRE2_SIZE *)(newtables + tables_length);
1219 *ref_count = 1;
1220 
1221 newcode->tables = newtables;
1222 newcode->flags |= PCRE2_DEREF_TABLES;
1223 return newcode;
1224 }
1225 
1226 
1227 
1228 /*************************************************
1229 *               Free compiled code               *
1230 *************************************************/
1231 
1232 PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION
pcre2_code_free(pcre2_code * code)1233 pcre2_code_free(pcre2_code *code)
1234 {
1235 PCRE2_SIZE* ref_count;
1236 
1237 if (code != NULL)
1238   {
1239   if (code->executable_jit != NULL)
1240     PRIV(jit_free)(code->executable_jit, &code->memctl);
1241 
1242   if ((code->flags & PCRE2_DEREF_TABLES) != 0)
1243     {
1244     /* Decoded tables belong to the codes after deserialization, and they must
1245     be freed when there are no more references to them. The *ref_count should
1246     always be > 0. */
1247 
1248     ref_count = (PCRE2_SIZE *)(code->tables + tables_length);
1249     if (*ref_count > 0)
1250       {
1251       (*ref_count)--;
1252       if (*ref_count == 0)
1253         code->memctl.free((void *)code->tables, code->memctl.memory_data);
1254       }
1255     }
1256 
1257   code->memctl.free(code, code->memctl.memory_data);
1258   }
1259 }
1260 
1261 
1262 
1263 /*************************************************
1264 *         Read a number, possibly signed         *
1265 *************************************************/
1266 
1267 /* This function is used to read numbers in the pattern. The initial pointer
1268 must be the sign or first digit of the number. When relative values (introduced
1269 by + or -) are allowed, they are relative group numbers, and the result must be
1270 greater than zero.
1271 
1272 Arguments:
1273   ptrptr      points to the character pointer variable
1274   ptrend      points to the end of the input string
1275   allow_sign  if < 0, sign not allowed; if >= 0, sign is relative to this
1276   max_value   the largest number allowed
1277   max_error   the error to give for an over-large number
1278   intptr      where to put the result
  errorcodeptr  where to put an error code
1280 
1281 Returns:      TRUE  - a number was read
1282               FALSE - errorcode == 0 => no number was found
1283                       errorcode != 0 => an error occurred
1284 */
1285 
1286 static BOOL
read_number(PCRE2_SPTR * ptrptr,PCRE2_SPTR ptrend,int32_t allow_sign,uint32_t max_value,uint32_t max_error,int * intptr,int * errorcodeptr)1287 read_number(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, int32_t allow_sign,
1288   uint32_t max_value, uint32_t max_error, int *intptr, int *errorcodeptr)
1289 {
1290 int sign = 0;
1291 uint32_t n = 0;
1292 PCRE2_SPTR ptr = *ptrptr;
1293 BOOL yield = FALSE;
1294 
1295 *errorcodeptr = 0;
1296 
1297 if (allow_sign >= 0 && ptr < ptrend)
1298   {
1299   if (*ptr == CHAR_PLUS)
1300     {
1301     sign = +1;
1302     max_value -= allow_sign;
1303     ptr++;
1304     }
1305   else if (*ptr == CHAR_MINUS)
1306     {
1307     sign = -1;
1308     ptr++;
1309     }
1310   }
1311 
1312 if (ptr >= ptrend || !IS_DIGIT(*ptr)) return FALSE;
1313 while (ptr < ptrend && IS_DIGIT(*ptr))
1314   {
1315   n = n * 10 + *ptr++ - CHAR_0;
1316   if (n > max_value)
1317     {
1318     *errorcodeptr = max_error;
1319     goto EXIT;
1320     }
1321   }
1322 
1323 if (allow_sign >= 0 && sign != 0)
1324   {
1325   if (n == 0)
1326     {
1327     *errorcodeptr = ERR26;  /* +0 and -0 are not allowed */
1328     goto EXIT;
1329     }
1330 
1331   if (sign > 0) n += allow_sign;
1332   else if ((int)n > allow_sign)
1333     {
1334     *errorcodeptr = ERR15;  /* Non-existent subpattern */
1335     goto EXIT;
1336     }
1337   else n = allow_sign + 1 - n;
1338   }
1339 
1340 yield = TRUE;
1341 
1342 EXIT:
1343 *intptr = n;
1344 *ptrptr = ptr;
1345 return yield;
1346 }
1347 
1348 
1349 
1350 /*************************************************
1351 *         Read repeat counts                     *
1352 *************************************************/
1353 
1354 /* Read an item of the form {n,m} and return the values if non-NULL pointers
1355 are supplied. Repeat counts must be less than 65536 (MAX_REPEAT_COUNT); a
1356 larger value is used for "unlimited". We have to use signed arguments for
1357 read_number() because it is capable of returning a signed value.
1358 
1359 Arguments:
  ptrptr         points to pointer to character after '{'
  ptrend         pointer to end of input
  minp           if not NULL, where to put the min
  maxp           if not NULL, where to put the max; this is REPEAT_UNLIMITED
                 if no max is given
1365   errorcodeptr   points to error code variable
1366 
1367 Returns:         FALSE if not a repeat quantifier, errorcode set zero
1368                  FALSE on error, with errorcode set non-zero
1369                  TRUE on success, with pointer updated to point after '}'
1370 */
1371 
1372 static BOOL
read_repeat_counts(PCRE2_SPTR * ptrptr,PCRE2_SPTR ptrend,uint32_t * minp,uint32_t * maxp,int * errorcodeptr)1373 read_repeat_counts(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *minp,
1374   uint32_t *maxp, int *errorcodeptr)
1375 {
1376 PCRE2_SPTR p = *ptrptr;
1377 BOOL yield = FALSE;
1378 int32_t min = 0;
1379 int32_t max = REPEAT_UNLIMITED; /* This value is larger than MAX_REPEAT_COUNT */
1380 
1381 /* NB read_number() initializes the error code to zero. The only error is for a
1382 number that is too big. */
1383 
1384 if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &min, errorcodeptr))
1385   goto EXIT;
1386 
1387 if (p >= ptrend) goto EXIT;
1388 
1389 if (*p == CHAR_RIGHT_CURLY_BRACKET)
1390   {
1391   p++;
1392   max = min;
1393   }
1394 
1395 else
1396   {
1397   if (*p++ != CHAR_COMMA || p >= ptrend) goto EXIT;
1398   if (*p != CHAR_RIGHT_CURLY_BRACKET)
1399     {
1400     if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &max,
1401         errorcodeptr) || p >= ptrend ||  *p != CHAR_RIGHT_CURLY_BRACKET)
1402       goto EXIT;
1403     if (max < min)
1404       {
1405       *errorcodeptr = ERR4;
1406       goto EXIT;
1407       }
1408     }
1409   p++;
1410   }
1411 
1412 yield = TRUE;
1413 if (minp != NULL) *minp = (uint32_t)min;
1414 if (maxp != NULL) *maxp = (uint32_t)max;
1415 
1416 /* Update the pattern pointer on success, or after an error, but not when
1417 the result is "not a repeat quantifier". */
1418 
1419 EXIT:
1420 if (yield || *errorcodeptr != 0) *ptrptr = p;
1421 return yield;
1422 
1423 
1424 
1425 }
1426 
1427 
1428 
1429 /*************************************************
1430 *            Handle escapes                      *
1431 *************************************************/
1432 
1433 /* This function is called when a \ has been encountered. It either returns a
1434 positive value for a simple escape such as \d, or 0 for a data character, which
1435 is placed in chptr. A backreference to group n is returned as negative n. On
1436 entry, ptr is pointing at the character after \. On exit, it points after the
1437 final code unit of the escape sequence.
1438 
1439 This function is also called from pcre2_substitute() to handle escape sequences
1440 in replacement strings. In this case, the cb argument is NULL, and in the case
1441 of escapes that have further processing, only sequences that define a data
1442 character are recognised. The isclass argument is not relevant; the options
1443 argument is the final value of the compiled pattern's options.
1444 
1445 Arguments:
1446   ptrptr         points to the input position pointer
1447   ptrend         points to the end of the input
1448   chptr          points to a returned data character
1449   errorcodeptr   points to the errorcode variable (containing zero)
  options        the current options bits
  extra_options  the current extra options bits
1451   isclass        TRUE if inside a character class
1452   cb             compile data block or NULL when called from pcre2_substitute()
1453 
1454 Returns:         zero => a data character
1455                  positive => a special escape sequence
1456                  negative => a numerical back reference
1457                  on error, errorcodeptr is set non-zero
1458 */
1459 
/* Decode whatever follows a backslash. The code unit(s) after the backslash
are read starting at *ptrptr; on the normal exit path *ptrptr is advanced past
the escape, *chptr receives the resulting character value, and the escape code
is returned. The early "return 0" error paths do not write *chptr. */
1460 int
PRIV(check_escape)1461 PRIV(check_escape)(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *chptr,
1462   int *errorcodeptr, uint32_t options, uint32_t extra_options, BOOL isclass,
1463   compile_block *cb)
1464 {
1465 BOOL utf = (options & PCRE2_UTF) != 0;
1466 PCRE2_SPTR ptr = *ptrptr;
1467 uint32_t c, cc;
1468 int escape = 0;
1469 int i;
1470 
1471 /* If backslash is at the end of the string, it's an error. */
1472 
1473 if (ptr >= ptrend)
1474   {
1475   *errorcodeptr = ERR1;
1476   return 0;
1477   }
1478 
1479 GETCHARINCTEST(c, ptr);         /* Get character value, increment pointer */
1480 *errorcodeptr = 0;              /* Be optimistic */
1481 
1482 /* Non-alphanumerics are literals, so we just leave the value in c. An initial
1483 value test saves a memory lookup for code points outside the alphanumeric
1484 range. */
1485 
1486 if (c < ESCAPES_FIRST || c > ESCAPES_LAST) {}  /* Definitely literal */
1487 
1488 /* Otherwise, do a table lookup. Non-zero values need little processing here. A
1489 positive value is a literal value for something like \n. A negative value is
1490 the negation of one of the ESC_ macros that is passed back for handling by the
1491 calling function. Some extra checking is needed for \N because only \N{U+dddd}
1492 is supported. If the value is zero, further processing is handled below. */
1493 
1494 else if ((i = escapes[c - ESCAPES_FIRST]) != 0)
1495   {
1496   if (i > 0)
1497     {
1498     c = (uint32_t)i;
1499     if (c == CHAR_CR && (extra_options & PCRE2_EXTRA_ESCAPED_CR_IS_LF) != 0)
1500       c = CHAR_LF;
1501     }
1502   else  /* Negative table entry */
1503     {
1504     escape = -i;                    /* Else return a special escape */
1505     if (cb != NULL && (escape == ESC_P || escape == ESC_p || escape == ESC_X))
1506       cb->external_flags |= PCRE2_HASBKPORX;   /* Note \P, \p, or \X */
1507 
1508     /* Perl supports \N{name} for character names and \N{U+dddd} for numerical
1509     Unicode code points, as well as plain \N for "not newline". PCRE does not
1510     support \N{name}. However, it does support quantification such as \N{2,3},
1511     so if \N{ is not followed by U+dddd we check for a quantifier. */
1512 
1513     if (escape == ESC_N && ptr < ptrend && *ptr == CHAR_LEFT_CURLY_BRACKET)
1514       {
1515       PCRE2_SPTR p = ptr + 1;
1516 
1517       /* \N{U+ can be handled by the \x{ code. However, this construction is
1518       not valid in EBCDIC environments because it specifies a Unicode
1519       character, not a codepoint in the local code. For example \N{U+0041}
1520       must be "A" in all environments. Also, in Perl, \N{U+ forces Unicode
1521       casing semantics for the entire pattern, so allow it only in UTF (i.e.
1522       Unicode) mode. */
1523 
1524       if (ptrend - p > 1 && *p == CHAR_U && p[1] == CHAR_PLUS)
1525         {
1526 #ifdef EBCDIC
1527         *errorcodeptr = ERR93;
1528 #else
1529         if (utf)
1530           {
          /* Leave ptr on the '+': the code at COME_FROM_NU pre-increments it
          before looking for the first hex digit. */
1531           ptr = p + 1;
1532           escape = 0;   /* Not a fancy escape after all */
1533           goto COME_FROM_NU;
1534           }
1535         else *errorcodeptr = ERR93;
1536 #endif
1537         }
1538 
1539       /* Give an error if what follows is not a quantifier, but don't override
1540       an error set by the quantifier reader (e.g. number overflow). */
1541 
1542       else
1543         {
1544         if (!read_repeat_counts(&p, ptrend, NULL, NULL, errorcodeptr) &&
1545              *errorcodeptr == 0)
1546           *errorcodeptr = ERR37;
1547         }
1548       }
1549     }
1550   }
1551 
1552 /* Escapes that need further processing, including those that are unknown, have
1553 a zero entry in the lookup table. When called from pcre2_substitute(), only \c,
1554 \o, and \x are recognized (\u and \U can never appear as they are used for case
1555 forcing). */
1556 
1557 else
1558   {
1559   int s;
1560   PCRE2_SPTR oldptr;
1561   BOOL overflow;
1562   BOOL alt_bsux =
1563     ((options & PCRE2_ALT_BSUX) | (extra_options & PCRE2_EXTRA_ALT_BSUX)) != 0;
1564 
1565   /* Filter calls from pcre2_substitute(). */
1566 
1567   if (cb == NULL)
1568     {
1569     if (c != CHAR_c && c != CHAR_o && c != CHAR_x)
1570       {
1571       *errorcodeptr = ERR3;
1572       return 0;
1573       }
1574     alt_bsux = FALSE;   /* Do not modify \x handling */
1575     }
1576 
1577   switch (c)
1578     {
1579     /* A number of Perl escapes are not handled by PCRE. We give an explicit
1580     error. */
1581 
1582     case CHAR_F:
1583     case CHAR_l:
1584     case CHAR_L:
1585     *errorcodeptr = ERR37;
1586     break;
1587 
1588     /* \u is unrecognized when neither PCRE2_ALT_BSUX nor PCRE2_EXTRA_ALT_BSUX
1589     is set. Otherwise, \u must be followed by exactly four hex digits or, if
1590     PCRE2_EXTRA_ALT_BSUX is set, by any number of hex digits in braces.
1591     Otherwise it is a lowercase u letter. This gives some compatibility with
1592     ECMAScript (aka JavaScript). */
1593 
1594     case CHAR_u:
1595     if (!alt_bsux) *errorcodeptr = ERR37; else
1596       {
1597       uint32_t xc;
1598 
1599       if (ptr >= ptrend) break;
1600       if (*ptr == CHAR_LEFT_CURLY_BRACKET &&
1601           (extra_options & PCRE2_EXTRA_ALT_BSUX) != 0)
1602         {
1603         PCRE2_SPTR hptr = ptr + 1;
1604         cc = 0;
1605 
1606         while (hptr < ptrend && (xc = XDIGIT(*hptr)) != 0xff)
1607           {
1608           if ((cc & 0xf0000000) != 0)  /* Test for 32-bit overflow */
1609             {
1610             *errorcodeptr = ERR77;
1611             ptr = hptr;   /* Show where */
1612             break;        /* *hptr != } will cause another break below */
1613             }
1614           cc = (cc << 4) | xc;
1615           hptr++;
1616           }
1617 
1618         if (hptr == ptr + 1 ||   /* No hex digits */
1619             hptr >= ptrend ||    /* Hit end of input */
1620             *hptr != CHAR_RIGHT_CURLY_BRACKET)  /* No } terminator */
1621           break;         /* Hex escape not recognized */
1622 
1623         c = cc;          /* Accept the code point */
1624         ptr = hptr + 1;
1625         }
1626 
1627       else  /* Must be exactly 4 hex digits */
1628         {
1629         if (ptrend - ptr < 4) break;               /* Less than 4 chars */
1630         if ((cc = XDIGIT(ptr[0])) == 0xff) break;  /* Not a hex digit */
1631         if ((xc = XDIGIT(ptr[1])) == 0xff) break;  /* Not a hex digit */
1632         cc = (cc << 4) | xc;
1633         if ((xc = XDIGIT(ptr[2])) == 0xff) break;  /* Not a hex digit */
1634         cc = (cc << 4) | xc;
1635         if ((xc = XDIGIT(ptr[3])) == 0xff) break;  /* Not a hex digit */
1636         c = (cc << 4) | xc;
1637         ptr += 4;
1638         }
1639 
1640       if (utf)
1641         {
1642         if (c > 0x10ffffU) *errorcodeptr = ERR77;
1643         else
1644           if (c >= 0xd800 && c <= 0xdfff &&
1645               (extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
1646                 *errorcodeptr = ERR73;
1647         }
1648       else if (c > MAX_NON_UTF_CHAR) *errorcodeptr = ERR77;
1649       }
1650     break;
1651 
1652     /* \U is unrecognized unless PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX is set,
1653     in which case it is an upper case letter. */
1654 
1655     case CHAR_U:
1656     if (!alt_bsux) *errorcodeptr = ERR37;
1657     break;
1658 
1659     /* In a character class, \g is just a literal "g". Outside a character
1660     class, \g must be followed by one of a number of specific things:
1661 
1662     (1) A number, either plain or braced. If positive, it is an absolute
1663     backreference. If negative, it is a relative backreference. This is a Perl
1664     5.10 feature.
1665 
1666     (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
1667     is part of Perl's movement towards a unified syntax for back references. As
1668     this is synonymous with \k{name}, we fudge it up by pretending it really
1669     was \k{name}.
1670 
1671     (3) For Oniguruma compatibility we also support \g followed by a name or a
1672     number either in angle brackets or in single quotes. However, these are
1673     (possibly recursive) subroutine calls, _not_ backreferences. We return
1674     the ESC_g code.
1675 
1676     Summary: Return a negative number for a numerical back reference, ESC_k for
1677     a named back reference, and ESC_g for a named or numbered subroutine call.
1678     */
1679 
1680     case CHAR_g:
1681     if (isclass) break;
1682 
    /* cb is dereferenced below. It cannot be NULL here because the filter
    before this switch returns for anything other than \c, \o, and \x when cb
    is NULL. */

1683     if (ptr >= ptrend)
1684       {
1685       *errorcodeptr = ERR57;
1686       break;
1687       }
1688 
1689     if (*ptr == CHAR_LESS_THAN_SIGN || *ptr == CHAR_APOSTROPHE)
1690       {
1691       escape = ESC_g;
1692       break;
1693       }
1694 
1695     /* If there is a brace delimiter, try to read a numerical reference. If
1696     there isn't one, assume we have a name and treat it as \k. */
1697 
1698     if (*ptr == CHAR_LEFT_CURLY_BRACKET)
1699       {
1700       PCRE2_SPTR p = ptr + 1;
1701       if (!read_number(&p, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &s,
1702           errorcodeptr))
1703         {
1704         if (*errorcodeptr == 0) escape = ESC_k;  /* No number found */
1705         break;
1706         }
1707       if (p >= ptrend || *p != CHAR_RIGHT_CURLY_BRACKET)
1708         {
1709         *errorcodeptr = ERR57;
1710         break;
1711         }
1712       ptr = p + 1;
1713       }
1714 
1715     /* Read an undelimited number */
1716 
1717     else
1718       {
1719       if (!read_number(&ptr, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &s,
1720           errorcodeptr))
1721         {
1722         if (*errorcodeptr == 0) *errorcodeptr = ERR57;  /* No number found */
1723         break;
1724         }
1725       }
1726 
1727     if (s <= 0)
1728       {
1729       *errorcodeptr = ERR15;
1730       break;
1731       }
1732 
1733     escape = -s;
1734     break;
1735 
1736     /* The handling of escape sequences consisting of a string of digits
1737     starting with one that is not zero is not straightforward. Perl has changed
1738     over the years. Nowadays \g{} for backreferences and \o{} for octal are
1739     recommended to avoid the ambiguities in the old syntax.
1740 
1741     Outside a character class, the digits are read as a decimal number. If the
1742     number is less than 10, or if there are that many previous extracting left
1743     brackets, it is a back reference. Otherwise, up to three octal digits are
1744     read to form an escaped character code. Thus \123 is likely to be octal 123
1745     (cf \0123, which is octal 012 followed by the literal 3).
1746 
1747     Inside a character class, \ followed by a digit is always either a literal
1748     8 or 9 or an octal number. */
1749 
1750     case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
1751     case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
1752 
1753     if (!isclass)
1754       {
1755       oldptr = ptr;
1756       ptr--;   /* Back to the digit */
1757       if (!read_number(&ptr, ptrend, -1, INT_MAX/10 - 1, ERR61, &s,
1758           errorcodeptr))
1759         break;
1760 
1761       /* \1 to \9 are always back references. \8x and \9x are too; \1x to \7x
1762       are octal escapes if there are not that many previous captures. */
1763 
      /* cb is non-NULL here for the same reason as in the \g case: a NULL cb
      only gets past the filter before the switch for \c, \o, and \x. */

1764       if (s < 10 || oldptr[-1] >= CHAR_8 || s <= (int)cb->bracount)
1765         {
1766         if (s > (int)MAX_GROUP_NUMBER) *errorcodeptr = ERR61;
1767           else escape = -s;     /* Indicates a back reference */
1768         break;
1769         }
1770       ptr = oldptr;      /* Put the pointer back and fall through */
1771       }
1772 
1773     /* Handle a digit following \ when the number is not a back reference, or
1774     we are within a character class. If the first digit is 8 or 9, Perl used to
1775     generate a binary zero and then treat the digit as a following literal. At
1776     least by Perl 5.18 this changed so as not to insert the binary zero. */
1777 
1778     if (c >= CHAR_8) break;
1779 
1780     /* Fall through */
1781 
1782     /* \0 always starts an octal number, but we may drop through to here with a
1783     larger first octal digit. The original code used just to take the least
1784     significant 8 bits of octal numbers (I think this is what early Perls used
1785     to do). Nowadays we allow for larger numbers in UTF-8 mode and 16-bit mode,
1786     but no more than 3 octal digits. */
1787 
1788     case CHAR_0:
1789     c -= CHAR_0;
    /* i is zero on entry to this loop: this branch of the function is reached
    only when the escapes[] lookup stored zero in i, and nothing on the way
    here changes it. The loop therefore reads at most two further digits. */
1790     while(i++ < 2 && ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7)
1791         c = c * 8 + *ptr++ - CHAR_0;
1792 #if PCRE2_CODE_UNIT_WIDTH == 8
1793     if (!utf && c > 0xff) *errorcodeptr = ERR51;
1794 #endif
1795     break;
1796 
1797     /* \o is a relatively new Perl feature, supporting a more general way of
1798     specifying character codes in octal. The only supported form is \o{ddd}. */
1799 
1800     case CHAR_o:
1801     if (ptr >= ptrend || *ptr++ != CHAR_LEFT_CURLY_BRACKET)
1802       {
      /* NOTE(review): when ptr >= ptrend the increment in the condition was
      short-circuited, so this decrement leaves ptr on the 'o' itself rather
      than after it - confirm that is the intended error position. The same
      pattern occurs in the ERR64 and ERR67 branches further down. */
1803       ptr--;
1804       *errorcodeptr = ERR55;
1805       }
1806     else if (ptr >= ptrend || *ptr == CHAR_RIGHT_CURLY_BRACKET)
1807       *errorcodeptr = ERR78;
1808     else
1809       {
1810       c = 0;
1811       overflow = FALSE;
1812       while (ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7)
1813         {
1814         cc = *ptr++;
1815         if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
1816 #if PCRE2_CODE_UNIT_WIDTH == 32
1817         if (c >= 0x20000000l) { overflow = TRUE; break; }
1818 #endif
1819         c = (c << 3) + (cc - CHAR_0);
1820 #if PCRE2_CODE_UNIT_WIDTH == 8
1821         if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1822 #elif PCRE2_CODE_UNIT_WIDTH == 16
1823         if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1824 #elif PCRE2_CODE_UNIT_WIDTH == 32
1825         if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
1826 #endif
1827         }
1828       if (overflow)
1829         {
1830         while (ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7) ptr++;
1831         *errorcodeptr = ERR34;
1832         }
1833       else if (ptr < ptrend && *ptr++ == CHAR_RIGHT_CURLY_BRACKET)
1834         {
1835         if (utf && c >= 0xd800 && c <= 0xdfff &&
1836             (extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
1837           {
1838           ptr--;
1839           *errorcodeptr = ERR73;
1840           }
1841         }
1842       else
1843         {
1844         ptr--;
1845         *errorcodeptr = ERR64;
1846         }
1847       }
1848     break;
1849 
1850     /* When PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX is set, \x must be followed
1851     by two hexadecimal digits. Otherwise it is a lowercase x letter. */
1852 
1853     case CHAR_x:
1854     if (alt_bsux)
1855       {
1856       uint32_t xc;
1857       if (ptrend - ptr < 2) break;               /* Less than 2 characters */
1858       if ((cc = XDIGIT(ptr[0])) == 0xff) break;  /* Not a hex digit */
1859       if ((xc = XDIGIT(ptr[1])) == 0xff) break;  /* Not a hex digit */
1860       c = (cc << 4) | xc;
1861       ptr += 2;
1862       }
1863 
1864     /* Handle \x in Perl's style. \x{ddd} is a character code which can be
1865     greater than 0xff in UTF-8 or non-8bit mode, but only if the ddd are hex
1866     digits. If not, { used to be treated as a data character. However, Perl
1867     seems to read hex digits up to the first non-such, and ignore the rest, so
1868     that, for example \x{zz} matches a binary zero. This seems crazy, so PCRE
1869     now gives an error. */
1870 
1871     else
1872       {
1873       if (ptr < ptrend && *ptr == CHAR_LEFT_CURLY_BRACKET)
1874         {
1875 #ifndef EBCDIC
        /* This label is the target of the goto in the \N{U+ handling near the
        top of the function, which arrives with ptr on the '+'. That jump
        skips the initialization of the locals of this block; c and overflow
        are assigned below before they are read. */
1876         COME_FROM_NU:
1877 #endif
1878         if (++ptr >= ptrend || *ptr == CHAR_RIGHT_CURLY_BRACKET)
1879           {
1880           *errorcodeptr = ERR78;
1881           break;
1882           }
1883         c = 0;
1884         overflow = FALSE;
1885 
1886         while (ptr < ptrend && (cc = XDIGIT(*ptr)) != 0xff)
1887           {
1888           ptr++;
1889           if (c == 0 && cc == 0) continue;   /* Leading zeroes */
1890 #if PCRE2_CODE_UNIT_WIDTH == 32
1891           if (c >= 0x10000000l) { overflow = TRUE; break; }
1892 #endif
1893           c = (c << 4) | cc;
1894           if ((utf && c > 0x10ffffU) || (!utf && c > MAX_NON_UTF_CHAR))
1895             {
1896             overflow = TRUE;
1897             break;
1898             }
1899           }
1900 
1901         if (overflow)
1902           {
1903           while (ptr < ptrend && XDIGIT(*ptr) != 0xff) ptr++;
1904           *errorcodeptr = ERR34;
1905           }
1906         else if (ptr < ptrend && *ptr++ == CHAR_RIGHT_CURLY_BRACKET)
1907           {
1908           if (utf && c >= 0xd800 && c <= 0xdfff &&
1909               (extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
1910             {
1911             ptr--;
1912             *errorcodeptr = ERR73;
1913             }
1914           }
1915 
1916         /* If the sequence of hex digits does not end with '}', give an error.
1917         We used just to recognize this construct and fall through to the normal
1918         \x handling, but nowadays Perl gives an error, which seems much more
1919         sensible, so we do too. */
1920 
1921         else
1922           {
1923           ptr--;
1924           *errorcodeptr = ERR67;
1925           }
1926         }   /* End of \x{} processing */
1927 
1928       /* Read up to two hex digits after \x */
1929 
1930       else
1931         {
1932         c = 0;
1933         if (ptr >= ptrend || (cc = XDIGIT(*ptr)) == 0xff) break;  /* Not a hex digit */
1934         ptr++;
1935         c = cc;
1936         if (ptr >= ptrend || (cc = XDIGIT(*ptr)) == 0xff) break;  /* Not a hex digit */
1937         ptr++;
1938         c = (c << 4) | cc;
1939         }     /* End of \xdd handling */
1940       }       /* End of Perl-style \x handling */
1941     break;
1942 
1943     /* The handling of \c is different in ASCII and EBCDIC environments. In an
1944     ASCII (or Unicode) environment, an error is given if the character
1945     following \c is not a printable ASCII character. Otherwise, the following
1946     character is upper-cased if it is a letter, and after that the 0x40 bit is
1947     flipped. The result is the value of the escape.
1948 
1949     In an EBCDIC environment the handling of \c is compatible with the
1950     specification in the perlebcdic document. The following character must be
1951     a letter or one of small number of special characters. These provide a
1952     means of defining the character values 0-31.
1953 
1954     For testing the EBCDIC handling of \c in an ASCII environment, recognize
1955     the EBCDIC value of 'c' explicitly. */
1956 
1957 #if defined EBCDIC && 'a' != 0x81
1958     case 0x83:
1959 #else
1960     case CHAR_c:
1961 #endif
1962     if (ptr >= ptrend)
1963       {
1964       *errorcodeptr = ERR2;
1965       break;
1966       }
1967     c = *ptr;
1968     if (c >= CHAR_a && c <= CHAR_z) c = UPPER_CASE(c);
1969 
1970     /* Handle \c in an ASCII/Unicode environment. */
1971 
1972 #ifndef EBCDIC    /* ASCII/UTF-8 coding */
1973     if (c < 32 || c > 126)  /* Excludes all non-printable ASCII */
1974       {
1975       *errorcodeptr = ERR68;
1976       break;
1977       }
1978     c ^= 0x40;
1979 
1980     /* Handle \c in an EBCDIC environment. The special case \c? is converted to
1981     255 (0xff) or 95 (0x5f) if other characters suggest we are using the
1982     POSIX-BC encoding. (This is the way Perl indicates that it handles \c?.)
1983     The other valid sequences correspond to a list of specific characters. */
1984 
1985 #else
1986     if (c == CHAR_QUESTION_MARK)
1987       c = ('\\' == 188 && '`' == 74)? 0x5f : 0xff;
1988     else
1989       {
1990       for (i = 0; i < 32; i++)
1991         {
1992         if (c == ebcdic_escape_c[i]) break;
1993         }
1994       if (i < 32) c = i; else *errorcodeptr = ERR68;
1995       }
1996 #endif  /* EBCDIC */
1997 
1998     ptr++;
1999     break;
2000 
2001     /* Any other alphanumeric following \ is an error. Perl gives an error only
2002     if in warning mode, but PCRE doesn't have a warning mode. */
2003 
2004     default:
2005     *errorcodeptr = ERR3;
2006     *ptrptr = ptr - 1;     /* Point to the character at fault */
2007     return 0;
2008     }
2009   }
2010 
2011 /* Set the pointer to the next character before returning. */
2012 
2013 *ptrptr = ptr;
2014 *chptr = c;
2015 return escape;
2016 }
2017 
2018 
2019 
2020 #ifdef SUPPORT_UNICODE
2021 /*************************************************
2022 *               Handle \P and \p                 *
2023 *************************************************/
2024 
2025 /* This function is called after \P or \p has been encountered, provided that
2026 PCRE2 is compiled with support for UTF and Unicode properties. On entry, the
2027 contents of ptrptr are pointing after the P or p. On exit, it is left pointing
2028 after the final code unit of the escape sequence.
2029 
2030 Arguments:
2031   ptrptr         the pattern position pointer
2032   negptr         a boolean that is set TRUE for negation else FALSE
2033   ptypeptr       an unsigned int that is set to the type value
2034   pdataptr       an unsigned int that is set to the detailed property value
2035   errorcodeptr   the error code variable
2036   cb             the compile data
2037 
2038 Returns:         TRUE if the type value was found, or FALSE for an invalid type
2039 */
2040 
2041 static BOOL
get_ucp(PCRE2_SPTR * ptrptr,BOOL * negptr,uint16_t * ptypeptr,uint16_t * pdataptr,int * errorcodeptr,compile_block * cb)2042 get_ucp(PCRE2_SPTR *ptrptr, BOOL *negptr, uint16_t *ptypeptr,
2043   uint16_t *pdataptr, int *errorcodeptr, compile_block *cb)
2044 {
2045 PCRE2_UCHAR c;
2046 PCRE2_SIZE i, bot, top;
2047 PCRE2_SPTR ptr = *ptrptr;
2048 PCRE2_UCHAR name[32];
2049 
2050 if (ptr >= cb->end_pattern) goto ERROR_RETURN;
2051 c = *ptr++;
2052 *negptr = FALSE;
2053 
2054 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
2055 negation. */
2056 
2057 if (c == CHAR_LEFT_CURLY_BRACKET)
2058   {
2059   if (ptr >= cb->end_pattern) goto ERROR_RETURN;
2060   if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
2061     {
2062     *negptr = TRUE;
2063     ptr++;
2064     }
2065   for (i = 0; i < (int)(sizeof(name) / sizeof(PCRE2_UCHAR)) - 1; i++)
2066     {
2067     if (ptr >= cb->end_pattern) goto ERROR_RETURN;
2068     c = *ptr++;
2069     if (c == CHAR_NUL) goto ERROR_RETURN;
2070     if (c == CHAR_RIGHT_CURLY_BRACKET) break;
2071     name[i] = c;
2072     }
2073   if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
2074   name[i] = 0;
2075   }
2076 
2077 /* Otherwise there is just one following character, which must be an ASCII
2078 letter. */
2079 
2080 else if (MAX_255(c) && (cb->ctypes[c] & ctype_letter) != 0)
2081   {
2082   name[0] = c;
2083   name[1] = 0;
2084   }
2085 else goto ERROR_RETURN;
2086 
2087 *ptrptr = ptr;
2088 
2089 /* Search for a recognized property name using binary chop. */
2090 
2091 bot = 0;
2092 top = PRIV(utt_size);
2093 
2094 while (bot < top)
2095   {
2096   int r;
2097   i = (bot + top) >> 1;
2098   r = PRIV(strcmp_c8)(name, PRIV(utt_names) + PRIV(utt)[i].name_offset);
2099   if (r == 0)
2100     {
2101     *ptypeptr = PRIV(utt)[i].type;
2102     *pdataptr = PRIV(utt)[i].value;
2103     return TRUE;
2104     }
2105   if (r > 0) bot = i + 1; else top = i;
2106   }
2107 *errorcodeptr = ERR47;   /* Unrecognized name */
2108 return FALSE;
2109 
2110 ERROR_RETURN:            /* Malformed \P or \p */
2111 *errorcodeptr = ERR46;
2112 *ptrptr = ptr;
2113 return FALSE;
2114 }
2115 #endif
2116 
2117 
2118 
2119 /*************************************************
2120 *           Check for POSIX class syntax         *
2121 *************************************************/
2122 
2123 /* This function is called when the sequence "[:" or "[." or "[=" is
2124 encountered in a character class. It checks whether this is followed by a
2125 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
2126 reach an unescaped ']' without the special preceding character, return FALSE.
2127 
2128 Originally, this function only recognized a sequence of letters between the
2129 terminators, but it seems that Perl recognizes any sequence of characters,
2130 though of course unknown POSIX names are subsequently rejected. Perl gives an
2131 "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
2132 didn't consider this to be a POSIX class. Likewise for [:1234:].
2133 
2134 The problem in trying to be exactly like Perl is in the handling of escapes. We
2135 have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
2136 class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
2137 below handles the special cases \\ and \], but does not try to do any other
2138 escape processing. This makes it different from Perl for cases such as
2139 [:l\ower:] where Perl recognizes it as the POSIX class "lower" but PCRE does
2140 not recognize "l\ower". This is a lesser evil than not diagnosing bad classes
2141 when Perl does, I think.
2142 
2143 A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
2144 It seems that the appearance of a nested POSIX class supersedes an apparent
2145 external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
2146 a digit. This is handled by returning FALSE if the start of a new group with
2147 the same terminator is encountered, since the next closing sequence must close
2148 the nested group, not the outer one.
2149 
2150 In Perl, unescaped square brackets may also appear as part of class names. For
2151 example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
2152 [:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
2153 seem right at all. PCRE does not allow closing square brackets in POSIX class
2154 names.
2155 
2156 Arguments:
2157   ptr      pointer to the character after the initial [ (colon, dot, equals)
2158   ptrend   pointer to the end of the pattern
2159   endptr   where to return a pointer to the terminating ':', '.', or '='
2160 
2161 Returns:   TRUE or FALSE
2162 */
2163 
2164 static BOOL
check_posix_syntax(PCRE2_SPTR ptr,PCRE2_SPTR ptrend,PCRE2_SPTR * endptr)2165 check_posix_syntax(PCRE2_SPTR ptr, PCRE2_SPTR ptrend, PCRE2_SPTR *endptr)
2166 {
2167 PCRE2_UCHAR terminator;  /* Don't combine these lines; the Solaris cc */
2168 terminator = *ptr++;     /* compiler warns about "non-constant" initializer. */
2169 
2170 for (; ptrend - ptr >= 2; ptr++)
2171   {
2172   if (*ptr == CHAR_BACKSLASH &&
2173       (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET || ptr[1] == CHAR_BACKSLASH))
2174     ptr++;
2175 
2176   else if ((*ptr == CHAR_LEFT_SQUARE_BRACKET && ptr[1] == terminator) ||
2177             *ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
2178 
2179   else if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2180     {
2181     *endptr = ptr;
2182     return TRUE;
2183     }
2184   }
2185 
2186 return FALSE;
2187 }
2188 
2189 
2190 
2191 /*************************************************
2192 *          Check POSIX class name                *
2193 *************************************************/
2194 
2195 /* This function is called to check the name given in a POSIX-style class entry
2196 such as [:alnum:].
2197 
2198 Arguments:
2199   ptr        points to the first letter
2200   len        the length of the name
2201 
2202 Returns:     a value representing the name, or -1 if unknown
2203 */
2204 
2205 static int
check_posix_name(PCRE2_SPTR ptr,int len)2206 check_posix_name(PCRE2_SPTR ptr, int len)
2207 {
2208 const char *pn = posix_names;
2209 int yield = 0;
2210 while (posix_name_lengths[yield] != 0)
2211   {
2212   if (len == posix_name_lengths[yield] &&
2213     PRIV(strncmp_c8)(ptr, pn, (unsigned int)len) == 0) return yield;
2214   pn += posix_name_lengths[yield] + 1;
2215   yield++;
2216   }
2217 return -1;
2218 }
2219 
2220 
2221 
2222 /*************************************************
2223 *       Read a subpattern or VERB name           *
2224 *************************************************/
2225 
2226 /* This function is called from parse_regex() below whenever it needs to read
2227 the name of a subpattern or a (*VERB) or an (*alpha_assertion). The initial
2228 pointer must be to the character before the name. If that character is '*' we
2229 are reading a verb or alpha assertion name. The pointer is updated to point
2230 after the name, for a VERB or alpha assertion name, or after the name's
2231 terminator for a subpattern name. Returning both the offset and the name
2232 pointer is redundant information, but some callers use one and some the other,
2233 so it is simplest just to return both.
2234 
2235 Arguments:
2236   ptrptr      points to the character pointer variable
2237   ptrend      points to the end of the input string
2238   utf         true if the input is UTF-encoded
2239   terminator  the terminator of a subpattern name must be this
2240   offsetptr   where to put the offset from the start of the pattern
2241   nameptr     where to put a pointer to the name in the input
2242   namelenptr  where to put the length of the name
2243   errcodeptr  where to put an error code
2244   cb          pointer to the compile data block
2245 
2246 Returns:    TRUE if a name was read
2247             FALSE otherwise, with error code set
2248 */
2249 
2250 static BOOL
read_name(PCRE2_SPTR * ptrptr,PCRE2_SPTR ptrend,BOOL utf,uint32_t terminator,PCRE2_SIZE * offsetptr,PCRE2_SPTR * nameptr,uint32_t * namelenptr,int * errorcodeptr,compile_block * cb)2251 read_name(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, BOOL utf, uint32_t terminator,
2252   PCRE2_SIZE *offsetptr, PCRE2_SPTR *nameptr, uint32_t *namelenptr,
2253   int *errorcodeptr, compile_block *cb)
2254 {
2255 PCRE2_SPTR ptr = *ptrptr;
2256 BOOL is_group = (*ptr != CHAR_ASTERISK);
2257 
2258 if (++ptr >= ptrend)               /* No characters in name */
2259   {
2260   *errorcodeptr = is_group? ERR62: /* Subpattern name expected */
2261                             ERR60; /* Verb not recognized or malformed */
2262   goto FAILED;
2263   }
2264 
2265 *nameptr = ptr;
2266 *offsetptr = (PCRE2_SIZE)(ptr - cb->start_pattern);
2267 
2268 /* In UTF mode, a group name may contain letters and decimal digits as defined
2269 by Unicode properties, and underscores, but must not start with a digit. */
2270 
2271 #ifdef SUPPORT_UNICODE
2272 if (utf && is_group)
2273   {
2274   uint32_t c, type;
2275 
2276   GETCHAR(c, ptr);
2277   type = UCD_CHARTYPE(c);
2278 
2279   if (type == ucp_Nd)
2280     {
2281     *errorcodeptr = ERR44;
2282     goto FAILED;
2283     }
2284 
2285   for(;;)
2286     {
2287     if (type != ucp_Nd && PRIV(ucp_gentype)[type] != ucp_L &&
2288         c != CHAR_UNDERSCORE) break;
2289     ptr++;
2290     FORWARDCHARTEST(ptr, ptrend);
2291     if (ptr >= ptrend) break;
2292     GETCHAR(c, ptr);
2293     type = UCD_CHARTYPE(c);
2294     }
2295   }
2296 else
2297 #else
2298 (void)utf;  /* Avoid compiler warning */
2299 #endif      /* SUPPORT_UNICODE */
2300 
2301 /* Handle non-group names and group names in non-UTF modes. A group name must
2302 not start with a digit. If either of the others start with a digit it just
2303 won't be recognized. */
2304 
2305   {
2306   if (is_group && IS_DIGIT(*ptr))
2307     {
2308     *errorcodeptr = ERR44;
2309     goto FAILED;
2310     }
2311 
2312   while (ptr < ptrend && MAX_255(*ptr) && (cb->ctypes[*ptr] & ctype_word) != 0)
2313     {
2314     ptr++;
2315     }
2316   }
2317 
2318 /* Check name length */
2319 
2320 if (ptr > *nameptr + MAX_NAME_SIZE)
2321   {
2322   *errorcodeptr = ERR48;
2323   goto FAILED;
2324   }
2325 *namelenptr = ptr - *nameptr;
2326 
2327 /* Subpattern names must not be empty, and their terminator is checked here.
2328 (What follows a verb or alpha assertion name is checked separately.) */
2329 
2330 if (is_group)
2331   {
2332   if (ptr == *nameptr)
2333     {
2334     *errorcodeptr = ERR62;   /* Subpattern name expected */
2335     goto FAILED;
2336     }
2337   if (ptr >= ptrend || *ptr != (PCRE2_UCHAR)terminator)
2338     {
2339     *errorcodeptr = ERR42;
2340     goto FAILED;
2341     }
2342   ptr++;
2343   }
2344 
2345 *ptrptr = ptr;
2346 return TRUE;
2347 
2348 FAILED:
2349 *ptrptr = ptr;
2350 return FALSE;
2351 }
2352 
2353 
2354 
2355 /*************************************************
2356 *          Manage callouts at start of cycle     *
2357 *************************************************/
2358 
2359 /* At the start of a new item in parse_regex() we are able to record the
2360 details of the previous item in a prior callout, and also to set up an
2361 automatic callout if enabled. Avoid having two adjacent automatic callouts,
2362 which would otherwise happen for items such as \Q that contribute nothing to
2363 the parsed pattern.
2364 
2365 Arguments:
2366   ptr              current pattern pointer
2367   pcalloutptr      points to a pointer to previous callout, or NULL
2368   auto_callout     TRUE if auto_callouts are enabled
2369   parsed_pattern   the parsed pattern pointer
2370   cb               compile block
2371 
2372 Returns: possibly updated parsed_pattern pointer.
2373 */
2374 
2375 static uint32_t *
manage_callouts(PCRE2_SPTR ptr,uint32_t ** pcalloutptr,BOOL auto_callout,uint32_t * parsed_pattern,compile_block * cb)2376 manage_callouts(PCRE2_SPTR ptr, uint32_t **pcalloutptr, BOOL auto_callout,
2377   uint32_t *parsed_pattern, compile_block *cb)
2378 {
2379 uint32_t *previous_callout = *pcalloutptr;
2380 
2381 if (previous_callout != NULL) previous_callout[2] = (uint32_t)(ptr -
2382   cb->start_pattern - (PCRE2_SIZE)previous_callout[1]);
2383 
2384 if (!auto_callout) previous_callout = NULL; else
2385   {
2386   if (previous_callout == NULL ||
2387       previous_callout != parsed_pattern - 4 ||
2388       previous_callout[3] != 255)
2389     {
2390     previous_callout = parsed_pattern;  /* Set up new automatic callout */
2391     parsed_pattern += 4;
2392     previous_callout[0] = META_CALLOUT_NUMBER;
2393     previous_callout[2] = 0;
2394     previous_callout[3] = 255;
2395     }
2396   previous_callout[1] = (uint32_t)(ptr - cb->start_pattern);
2397   }
2398 
2399 *pcalloutptr = previous_callout;
2400 return parsed_pattern;
2401 }
2402 
2403 
2404 
2405 /*************************************************
2406 *      Parse regex and identify named groups     *
2407 *************************************************/
2408 
2409 /* This function is called first of all. It scans the pattern and does two
2410 things: (1) It identifies capturing groups and makes a table of named capturing
2411 groups so that information about them is fully available to both the compiling
2412 scans. (2) It writes a parsed version of the pattern with comments omitted and
2413 escapes processed into the parsed_pattern vector.
2414 
2415 Arguments:
2416   ptr             points to the start of the pattern
2417   options         compiling dynamic options (may change during the scan)
2418   has_lookbehind  points to a boolean, set TRUE if a lookbehind is found
2419   cb              pointer to the compile data block
2420 
2421 Returns:   zero on success or a non-zero error code, with the
2422              error offset placed in the cb field
2423 */
2424 
2425 /* A structure and some flags for dealing with nested groups. */
2426 
/* One saved record per nesting level. parse_regex() carves these records out
of the compile workspace (it casts the workspace end to nest_save * and rounds
it down to a multiple of sizeof(nest_save)). The field meanings given below
are inferred from the names - NOTE(review): confirm against the code that
pushes and pops these records. */

typedef struct nest_save {
  uint16_t  nest_depth;    /* presumably the nesting depth this record is for */
  uint16_t  reset_group;   /* presumably the group number to go back to */
  uint16_t  max_group;     /* presumably the highest group number seen */
  uint16_t  flags;         /* presumably a combination of the NSF_xxx bits */
  uint32_t  options;       /* presumably the saved parse-time options word */
} nest_save;

/* Single-bit flag values; each is a distinct bit so they can be combined. */

#define NSF_RESET          0x0001u
#define NSF_CONDASSERT     0x0002u
#define NSF_ATOMICSR       0x0004u
2438 
2439 /* Options that are changeable within the pattern must be tracked during
2440 parsing. Some (e.g. PCRE2_EXTENDED) are implemented entirely during parsing,
2441 but all must be tracked so that META_OPTIONS items set the correct values for
2442 the main compiling phase. */
2443 
#define PARSE_TRACKED_OPTIONS (PCRE2_CASELESS|PCRE2_DOTALL|PCRE2_DUPNAMES| \
  PCRE2_EXTENDED|PCRE2_EXTENDED_MORE|PCRE2_MULTILINE|PCRE2_NO_AUTO_CAPTURE| \
  PCRE2_UNGREEDY)
2447 
2448 /* States used for analyzing ranges in character classes. The two OK values
2449 must be last. */
2450 
enum {
  RANGE_NO,          /* State set when processing of a class begins */
  RANGE_STARTED,     /* Tested to fault a hyphen directly before a POSIX class */
  RANGE_OK_ESCAPED,  /* Presumably: previous value was escaped - TODO confirm */
  RANGE_OK_LITERAL   /* Presumably: previous value was literal - TODO confirm */
};
2452 
/* Only in 32-bit mode can there be literals > META_END. A macro encapsulates
the storing of literal values in the parsed pattern. */
2455 
/* PARSED_LITERAL(c, p) stores the literal c at *p, advances p, and sets the
local variable okquantifier (which must be in scope at the point of use) to
TRUE. In 32-bit mode a value >= META_END is preceded by META_BIGVALUE.

Both variants expand to one brace-enclosed compound statement, so that a use
controlled by an unbraced "if" or loop behaves identically at every code unit
width. As with any braced macro, a use followed by a semicolon must not come
directly before an "else". The arguments are parenthesized in the expansion;
note that in the 32-bit variant c is evaluated twice and p may be, so they
should be free of side effects. */

#if PCRE2_CODE_UNIT_WIDTH == 32
#define PARSED_LITERAL(c, p) \
  { \
  if ((c) >= META_END) *(p)++ = META_BIGVALUE; \
  *(p)++ = (c); \
  okquantifier = TRUE; \
  }
#else
#define PARSED_LITERAL(c, p) \
  { \
  *(p)++ = (c); \
  okquantifier = TRUE; \
  }
#endif
2466 
2467 /* Here's the actual function. */
2468 
parse_regex(PCRE2_SPTR ptr,uint32_t options,BOOL * has_lookbehind,compile_block * cb)2469 static int parse_regex(PCRE2_SPTR ptr, uint32_t options, BOOL *has_lookbehind,
2470   compile_block *cb)
2471 {
2472 uint32_t c;
2473 uint32_t delimiter;
2474 uint32_t namelen;
2475 uint32_t class_range_state;
2476 uint32_t *verblengthptr = NULL;     /* Value avoids compiler warning */
2477 uint32_t *previous_callout = NULL;
2478 uint32_t *parsed_pattern = cb->parsed_pattern;
2479 uint32_t *parsed_pattern_end = cb->parsed_pattern_end;
2480 uint32_t meta_quantifier = 0;
2481 uint32_t add_after_mark = 0;
2482 uint32_t extra_options = cb->cx->extra_options;
2483 uint16_t nest_depth = 0;
2484 int after_manual_callout = 0;
2485 int expect_cond_assert = 0;
2486 int errorcode = 0;
2487 int escape;
2488 int i;
2489 BOOL inescq = FALSE;
2490 BOOL inverbname = FALSE;
2491 BOOL utf = (options & PCRE2_UTF) != 0;
2492 BOOL auto_callout = (options & PCRE2_AUTO_CALLOUT) != 0;
2493 BOOL isdupname;
2494 BOOL negate_class;
2495 BOOL okquantifier = FALSE;
2496 PCRE2_SPTR thisptr;
2497 PCRE2_SPTR name;
2498 PCRE2_SPTR ptrend = cb->end_pattern;
2499 PCRE2_SPTR verbnamestart = NULL;    /* Value avoids compiler warning */
2500 named_group *ng;
2501 nest_save *top_nest, *end_nests;
2502 
2503 /* Insert leading items for word and line matching (features provided for the
2504 benefit of pcre2grep). */
2505 
2506 if ((extra_options & PCRE2_EXTRA_MATCH_LINE) != 0)
2507   {
2508   *parsed_pattern++ = META_CIRCUMFLEX;
2509   *parsed_pattern++ = META_NOCAPTURE;
2510   }
2511 else if ((extra_options & PCRE2_EXTRA_MATCH_WORD) != 0)
2512   {
2513   *parsed_pattern++ = META_ESCAPE + ESC_b;
2514   *parsed_pattern++ = META_NOCAPTURE;
2515   }
2516 
2517 /* If the pattern is actually a literal string, process it separately to avoid
2518 cluttering up the main loop. */
2519 
2520 if ((options & PCRE2_LITERAL) != 0)
2521   {
2522   while (ptr < ptrend)
2523     {
2524     if (parsed_pattern >= parsed_pattern_end)
2525       {
2526       errorcode = ERR63;  /* Internal error (parsed pattern overflow) */
2527       goto FAILED;
2528       }
2529     thisptr = ptr;
2530     GETCHARINCTEST(c, ptr);
2531     if (auto_callout)
2532       parsed_pattern = manage_callouts(thisptr, &previous_callout,
2533         auto_callout, parsed_pattern, cb);
2534     PARSED_LITERAL(c, parsed_pattern);
2535     }
2536   goto PARSED_END;
2537   }
2538 
2539 /* Process a real regex which may contain meta-characters. */
2540 
2541 top_nest = NULL;
2542 end_nests = (nest_save *)(cb->start_workspace + cb->workspace_size);
2543 
2544 /* The size of the nest_save structure might not be a factor of the size of the
2545 workspace. Therefore we must round down end_nests so as to correctly avoid
2546 creating a nest_save that spans the end of the workspace. */
2547 
2548 end_nests = (nest_save *)((char *)end_nests -
2549   ((cb->workspace_size * sizeof(PCRE2_UCHAR)) % sizeof(nest_save)));
2550 
2551 /* PCRE2_EXTENDED_MORE implies PCRE2_EXTENDED */
2552 
2553 if ((options & PCRE2_EXTENDED_MORE) != 0) options |= PCRE2_EXTENDED;
2554 
2555 /* Now scan the pattern */
2556 
2557 while (ptr < ptrend)
2558   {
2559   int prev_expect_cond_assert;
2560   uint32_t min_repeat, max_repeat;
2561   uint32_t set, unset, *optset;
2562   uint32_t terminator;
2563   uint32_t prev_meta_quantifier;
2564   BOOL prev_okquantifier;
2565   PCRE2_SPTR tempptr;
2566   PCRE2_SIZE offset;
2567 
2568   if (parsed_pattern >= parsed_pattern_end)
2569     {
2570     errorcode = ERR63;  /* Internal error (parsed pattern overflow) */
2571     goto FAILED;
2572     }
2573 
2574   if (nest_depth > cb->cx->parens_nest_limit)
2575     {
2576     errorcode = ERR19;
2577     goto FAILED;        /* Parentheses too deeply nested */
2578     }
2579 
2580   /* Get next input character, save its position for callout handling. */
2581 
2582   thisptr = ptr;
2583   GETCHARINCTEST(c, ptr);
2584 
2585   /* Copy quoted literals until \E, allowing for the possibility of automatic
2586   callouts, except when processing a (*VERB) "name".  */
2587 
2588   if (inescq)
2589     {
2590     if (c == CHAR_BACKSLASH && ptr < ptrend && *ptr == CHAR_E)
2591       {
2592       inescq = FALSE;
2593       ptr++;   /* Skip E */
2594       }
2595     else
2596       {
2597       if (expect_cond_assert > 0)   /* A literal is not allowed if we are */
2598         {                           /* expecting a conditional assertion, */
2599         ptr--;                      /* but an empty \Q\E sequence is OK.  */
2600         errorcode = ERR28;
2601         goto FAILED;
2602         }
2603       if (!inverbname && after_manual_callout-- <= 0)
2604         parsed_pattern = manage_callouts(thisptr, &previous_callout,
2605           auto_callout, parsed_pattern, cb);
2606       PARSED_LITERAL(c, parsed_pattern);
2607       meta_quantifier = 0;
2608       }
2609     continue;  /* Next character */
2610     }
2611 
2612   /* If we are processing the "name" part of a (*VERB:NAME) item, all
2613   characters up to the closing parenthesis are literals except when
2614   PCRE2_ALT_VERBNAMES is set. That causes backslash interpretation, but only \Q
2615   and \E and escaped characters are allowed (no character types such as \d). If
2616   PCRE2_EXTENDED is also set, we must ignore white space and # comments. Do
2617   this by not entering the special (*VERB:NAME) processing - they are then
2618   picked up below. Note that c is a character, not a code unit, so we must not
2619   use MAX_255 to test its size because MAX_255 tests code units and is assumed
2620   TRUE in 8-bit mode. */
2621 
2622   if (inverbname &&
2623        (
2624         /* EITHER: not both options set */
2625         ((options & (PCRE2_EXTENDED | PCRE2_ALT_VERBNAMES)) !=
2626                     (PCRE2_EXTENDED | PCRE2_ALT_VERBNAMES)) ||
2627 #ifdef SUPPORT_UNICODE
2628         /* OR: character > 255 AND not Unicode Pattern White Space */
2629         (c > 255 && (c|1) != 0x200f && (c|1) != 0x2029) ||
2630 #endif
2631         /* OR: not a # comment or isspace() white space */
2632         (c < 256 && c != CHAR_NUMBER_SIGN && (cb->ctypes[c] & ctype_space) == 0
2633 #ifdef SUPPORT_UNICODE
2634         /* and not CHAR_NEL when Unicode is supported */
2635           && c != CHAR_NEL
2636 #endif
2637        )))
2638     {
2639     PCRE2_SIZE verbnamelength;
2640 
2641     switch(c)
2642       {
2643       default:
2644       PARSED_LITERAL(c, parsed_pattern);
2645       break;
2646 
2647       case CHAR_RIGHT_PARENTHESIS:
2648       inverbname = FALSE;
2649       okquantifier = FALSE;   /* Was probably set by literals */
2650       /* This is the length in characters */
2651       verbnamelength = (PCRE2_SIZE)(parsed_pattern - verblengthptr - 1);
2652       /* But the limit on the length is in code units */
2653       if (ptr - verbnamestart - 1 > (int)MAX_MARK)
2654         {
2655         ptr--;
2656         errorcode = ERR76;
2657         goto FAILED;
2658         }
2659       *verblengthptr = (uint32_t)verbnamelength;
2660 
2661       /* If this name was on a verb such as (*ACCEPT) which does not continue,
2662       a (*MARK) was generated for the name. We now add the original verb as the
2663       next item. */
2664 
2665       if (add_after_mark != 0)
2666         {
2667         *parsed_pattern++ = add_after_mark;
2668         add_after_mark = 0;
2669         }
2670       break;
2671 
2672       case CHAR_BACKSLASH:
2673       if ((options & PCRE2_ALT_VERBNAMES) != 0)
2674         {
2675         escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
2676           cb->cx->extra_options, FALSE, cb);
2677         if (errorcode != 0) goto FAILED;
2678         }
2679       else escape = 0;   /* Treat all as literal */
2680 
2681       switch(escape)
2682         {
2683         case 0:
2684         PARSED_LITERAL(c, parsed_pattern);
2685         break;
2686 
2687         case ESC_Q:
2688         inescq = TRUE;
2689         break;
2690 
2691         case ESC_E:           /* Ignore */
2692         break;
2693 
2694         default:
2695         errorcode = ERR40;    /* Invalid in verb name */
2696         goto FAILED;
2697         }
2698       }
2699     continue;   /* Next character in pattern */
2700     }
2701 
2702   /* Not a verb name character. At this point we must process everything that
2703   must not change the quantification state. This is mainly comments, but we
2704   handle \Q and \E here as well, so that an item such as A\Q\E+ is treated as
2705   A+, as in Perl. An isolated \E is ignored. */
2706 
2707   if (c == CHAR_BACKSLASH && ptr < ptrend)
2708     {
2709     if (*ptr == CHAR_Q || *ptr == CHAR_E)
2710       {
2711       inescq = *ptr == CHAR_Q;
2712       ptr++;
2713       continue;
2714       }
2715     }
2716 
2717   /* Skip over whitespace and # comments in extended mode. Note that c is a
2718   character, not a code unit, so we must not use MAX_255 to test its size
2719   because MAX_255 tests code units and is assumed TRUE in 8-bit mode. The
2720   whitespace characters are those designated as "Pattern White Space" by
2721   Unicode, which are the isspace() characters plus CHAR_NEL (newline), which is
2722   U+0085 in Unicode, plus U+200E, U+200F, U+2028, and U+2029. These are a
2723   subset of space characters that match \h and \v. */
2724 
2725   if ((options & PCRE2_EXTENDED) != 0)
2726     {
2727     if (c < 256 && (cb->ctypes[c] & ctype_space) != 0) continue;
2728 #ifdef SUPPORT_UNICODE
2729     if (c == CHAR_NEL || (c|1) == 0x200f || (c|1) == 0x2029) continue;
2730 #endif
2731     if (c == CHAR_NUMBER_SIGN)
2732       {
2733       while (ptr < ptrend)
2734         {
2735         if (IS_NEWLINE(ptr))      /* For non-fixed-length newline cases, */
2736           {                       /* IS_NEWLINE sets cb->nllen. */
2737           ptr += cb->nllen;
2738           break;
2739           }
2740         ptr++;
2741 #ifdef SUPPORT_UNICODE
2742         if (utf) FORWARDCHARTEST(ptr, ptrend);
2743 #endif
2744         }
2745       continue;  /* Next character in pattern */
2746       }
2747     }
2748 
2749   /* Skip over bracketed comments */
2750 
2751   if (c == CHAR_LEFT_PARENTHESIS && ptrend - ptr >= 2 &&
2752       ptr[0] == CHAR_QUESTION_MARK && ptr[1] == CHAR_NUMBER_SIGN)
2753     {
2754     while (++ptr < ptrend && *ptr != CHAR_RIGHT_PARENTHESIS);
2755     if (ptr >= ptrend)
2756       {
2757       errorcode = ERR18;  /* A special error for missing ) in a comment */
2758       goto FAILED;        /* to make it easier to debug. */
2759       }
2760     ptr++;
2761     continue;  /* Next character in pattern */
2762     }
2763 
2764   /* If the next item is not a quantifier, fill in length of any previous
2765   callout and create an auto callout if required. */
2766 
2767   if (c != CHAR_ASTERISK && c != CHAR_PLUS && c != CHAR_QUESTION_MARK &&
2768        (c != CHAR_LEFT_CURLY_BRACKET ||
2769          (tempptr = ptr,
2770          !read_repeat_counts(&tempptr, ptrend, NULL, NULL, &errorcode))))
2771     {
2772     if (after_manual_callout-- <= 0)
2773       parsed_pattern = manage_callouts(thisptr, &previous_callout, auto_callout,
2774         parsed_pattern, cb);
2775     }
2776 
2777   /* If expect_cond_assert is 2, we have just passed (?( and are expecting an
2778   assertion, possibly preceded by a callout. If the value is 1, we have just
2779   had the callout and expect an assertion. There must be at least 3 more
2780   characters in all cases. When expect_cond_assert is 2, we know that the
2781   current character is an opening parenthesis, as otherwise we wouldn't be
2782   here. However, when it is 1, we need to check, and it's easiest just to check
2783   always. Note that expect_cond_assert may be negative, since all callouts just
2784   decrement it. */
2785 
2786   if (expect_cond_assert > 0)
2787     {
2788     BOOL ok = c == CHAR_LEFT_PARENTHESIS && ptrend - ptr >= 3 &&
2789               (ptr[0] == CHAR_QUESTION_MARK || ptr[0] == CHAR_ASTERISK);
2790     if (ok)
2791       {
2792       if (ptr[0] == CHAR_ASTERISK)  /* New alpha assertion format, possibly */
2793         {
2794         ok = MAX_255(ptr[1]) && (cb->ctypes[ptr[1]] & ctype_lcletter) != 0;
2795         }
2796       else switch(ptr[1])  /* Traditional symbolic format */
2797         {
2798         case CHAR_C:
2799         ok = expect_cond_assert == 2;
2800         break;
2801 
2802         case CHAR_EQUALS_SIGN:
2803         case CHAR_EXCLAMATION_MARK:
2804         break;
2805 
2806         case CHAR_LESS_THAN_SIGN:
2807         ok = ptr[2] == CHAR_EQUALS_SIGN || ptr[2] == CHAR_EXCLAMATION_MARK;
2808         break;
2809 
2810         default:
2811         ok = FALSE;
2812         }
2813       }
2814 
2815     if (!ok)
2816       {
2817       ptr--;   /* Adjust error offset */
2818       errorcode = ERR28;
2819       goto FAILED;
2820       }
2821     }
2822 
2823   /* Remember whether we are expecting a conditional assertion, and set the
2824   default for this item. */
2825 
2826   prev_expect_cond_assert = expect_cond_assert;
2827   expect_cond_assert = 0;
2828 
2829   /* Remember quantification status for the previous significant item, then set
2830   default for this item. */
2831 
2832   prev_okquantifier = okquantifier;
2833   prev_meta_quantifier = meta_quantifier;
2834   okquantifier = FALSE;
2835   meta_quantifier = 0;
2836 
2837   /* If the previous significant item was a quantifier, adjust the parsed code
2838   if there is a following modifier. The base meta value is always followed by
2839   the PLUS and QUERY values, in that order. We do this here rather than after
2840   reading a quantifier so that intervening comments and /x whitespace can be
2841   ignored without having to replicate code. */
2842 
2843   if (prev_meta_quantifier != 0 && (c == CHAR_QUESTION_MARK || c == CHAR_PLUS))
2844     {
2845     parsed_pattern[(prev_meta_quantifier == META_MINMAX)? -3 : -1] =
2846       prev_meta_quantifier + ((c == CHAR_QUESTION_MARK)?
2847         0x00020000u : 0x00010000u);
2848     continue;  /* Next character in pattern */
2849     }
2850 
2851 
2852   /* Process the next item in the main part of a pattern. */
2853 
2854   switch(c)
2855     {
2856     default:              /* Non-special character */
2857     PARSED_LITERAL(c, parsed_pattern);
2858     break;
2859 
2860 
2861     /* ---- Escape sequence ---- */
2862 
2863     case CHAR_BACKSLASH:
2864     tempptr = ptr;
2865     escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
2866       cb->cx->extra_options, FALSE, cb);
2867     if (errorcode != 0)
2868       {
2869       ESCAPE_FAILED:
2870       if ((extra_options & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) == 0)
2871         goto FAILED;
2872       ptr = tempptr;
2873       if (ptr >= ptrend) c = CHAR_BACKSLASH; else
2874         {
2875         GETCHARINCTEST(c, ptr);   /* Get character value, increment pointer */
2876         }
2877       escape = 0;                 /* Treat as literal character */
2878       }
2879 
2880     /* The escape was a data escape or literal character. */
2881 
2882     if (escape == 0)
2883       {
2884       PARSED_LITERAL(c, parsed_pattern);
2885       }
2886 
    /* The escape was a back (or forward) reference. We keep the offset in
    order to give a more useful diagnostic for a bad forward reference. For
    references to groups numbered less than 10 we can't use more than two items
    in parsed_pattern because they may be just two characters in the input (and
    in a 64-bit world an offset may need two elements). So for them, the offset
    of the first occurrence is held in a special vector. */
2893 
2894     else if (escape < 0)
2895       {
2896       offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 1);
2897       escape = -escape;
2898       *parsed_pattern++ = META_BACKREF | (uint32_t)escape;
2899       if (escape < 10)
2900         {
2901         if (cb->small_ref_offset[escape] == PCRE2_UNSET)
2902           cb->small_ref_offset[escape] = offset;
2903         }
2904       else
2905         {
2906         PUTOFFSET(offset, parsed_pattern);
2907         }
2908       okquantifier = TRUE;
2909       }
2910 
2911     /* The escape was a character class such as \d etc. or other special
2912     escape indicator such as \A or \X. Most of them generate just a single
2913     parsed item, but \P and \p are followed by a 16-bit type and a 16-bit
2914     value. They are supported only when Unicode is available. The type and
2915     value are packed into a single 32-bit value so that the whole sequences
2916     uses only two elements in the parsed_vector. This is because the same
2917     coding is used if \d (for example) is turned into \p{Nd} when PCRE2_UCP is
2918     set.
2919 
2920     There are also some cases where the escape sequence is followed by a name:
2921     \k{name}, \k<name>, and \k'name' are backreferences by name, and \g<name>
2922     and \g'name' are subroutine calls by name; \g{name} is a synonym for
2923     \k{name}. Note that \g<number> and \g'number' are handled by check_escape()
2924     and returned as a negative value (handled above). A name is coded as an
2925     offset into the pattern and a length. */
2926 
2927     else switch (escape)
2928       {
2929       case ESC_C:
2930 #ifdef NEVER_BACKSLASH_C
2931       errorcode = ERR85;
2932       goto ESCAPE_FAILED;
2933 #else
2934       if ((options & PCRE2_NEVER_BACKSLASH_C) != 0)
2935         {
2936         errorcode = ERR83;
2937         goto ESCAPE_FAILED;
2938         }
2939 #endif
2940       okquantifier = TRUE;
2941       *parsed_pattern++ = META_ESCAPE + escape;
2942       break;
2943 
2944       case ESC_X:
2945 #ifndef SUPPORT_UNICODE
2946       errorcode = ERR45;   /* Supported only with Unicode support */
2947       goto ESCAPE_FAILED;
2948 #endif
2949       case ESC_H:
2950       case ESC_h:
2951       case ESC_N:
2952       case ESC_R:
2953       case ESC_V:
2954       case ESC_v:
2955       okquantifier = TRUE;
2956       *parsed_pattern++ = META_ESCAPE + escape;
2957       break;
2958 
2959       default:  /* \A, \B, \b, \G, \K, \Z, \z cannot be quantified. */
2960       *parsed_pattern++ = META_ESCAPE + escape;
2961       break;
2962 
2963       /* Escapes that change in UCP mode. Note that PCRE2_UCP will never be set
2964       without Unicode support because it is checked when pcre2_compile() is
2965       called. */
2966 
2967       case ESC_d:
2968       case ESC_D:
2969       case ESC_s:
2970       case ESC_S:
2971       case ESC_w:
2972       case ESC_W:
2973       okquantifier = TRUE;
2974       if ((options & PCRE2_UCP) == 0)
2975         {
2976         *parsed_pattern++ = META_ESCAPE + escape;
2977         }
2978       else
2979         {
2980         *parsed_pattern++ = META_ESCAPE +
2981           ((escape == ESC_d || escape == ESC_s || escape == ESC_w)?
2982             ESC_p : ESC_P);
2983         switch(escape)
2984           {
2985           case ESC_d:
2986           case ESC_D:
2987           *parsed_pattern++ = (PT_PC << 16) | ucp_Nd;
2988           break;
2989 
2990           case ESC_s:
2991           case ESC_S:
2992           *parsed_pattern++ = PT_SPACE << 16;
2993           break;
2994 
2995           case ESC_w:
2996           case ESC_W:
2997           *parsed_pattern++ = PT_WORD << 16;
2998           break;
2999           }
3000         }
3001       break;
3002 
3003       /* Unicode property matching */
3004 
3005       case ESC_P:
3006       case ESC_p:
3007 #ifdef SUPPORT_UNICODE
3008         {
3009         BOOL negated;
3010         uint16_t ptype = 0, pdata = 0;
3011         if (!get_ucp(&ptr, &negated, &ptype, &pdata, &errorcode, cb))
3012           goto ESCAPE_FAILED;
3013         if (negated) escape = (escape == ESC_P)? ESC_p : ESC_P;
3014         *parsed_pattern++ = META_ESCAPE + escape;
3015         *parsed_pattern++ = (ptype << 16) | pdata;
3016         okquantifier = TRUE;
3017         }
3018 #else
3019       errorcode = ERR45;
3020       goto ESCAPE_FAILED;
3021 #endif
3022       break;  /* End \P and \p */
3023 
      /* When \g is used with quotes or angle brackets as delimiters, it is a
      numerical or named subroutine call, and control comes here. When used
      with brace delimiters it is a numerical back reference and does not come
      here because check_escape() returns it directly as a reference. \k is
      always a named back reference. */
3029 
3030       case ESC_g:
3031       case ESC_k:
3032       if (ptr >= ptrend || (*ptr != CHAR_LEFT_CURLY_BRACKET &&
3033           *ptr != CHAR_LESS_THAN_SIGN && *ptr != CHAR_APOSTROPHE))
3034         {
3035         errorcode = (escape == ESC_g)? ERR57 : ERR69;
3036         goto ESCAPE_FAILED;
3037         }
3038       terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
3039         CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
3040         CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
3041 
3042       /* For a non-braced \g, check for a numerical recursion. */
3043 
3044       if (escape == ESC_g && terminator != CHAR_RIGHT_CURLY_BRACKET)
3045         {
3046         PCRE2_SPTR p = ptr + 1;
3047 
3048         if (read_number(&p, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &i,
3049             &errorcode))
3050           {
3051           if (p >= ptrend || *p != terminator)
3052             {
3053             errorcode = ERR57;
3054             goto ESCAPE_FAILED;
3055             }
3056           ptr = p;
3057           goto SET_RECURSION;
3058           }
3059         if (errorcode != 0) goto ESCAPE_FAILED;
3060         }
3061 
3062       /* Not a numerical recursion */
3063 
3064       if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,
3065           &errorcode, cb)) goto ESCAPE_FAILED;
3066 
3067       /* \k and \g when used with braces are back references, whereas \g used
3068       with quotes or angle brackets is a recursion */
3069 
3070       *parsed_pattern++ =
3071         (escape == ESC_k || terminator == CHAR_RIGHT_CURLY_BRACKET)?
3072           META_BACKREF_BYNAME : META_RECURSE_BYNAME;
3073       *parsed_pattern++ = namelen;
3074 
3075       PUTOFFSET(offset, parsed_pattern);
3076       okquantifier = TRUE;
3077       break;  /* End special escape processing */
3078       }
3079     break;    /* End escape sequence processing */
3080 
3081 
3082     /* ---- Single-character special items ---- */
3083 
3084     case CHAR_CIRCUMFLEX_ACCENT:
3085     *parsed_pattern++ = META_CIRCUMFLEX;
3086     break;
3087 
3088     case CHAR_DOLLAR_SIGN:
3089     *parsed_pattern++ = META_DOLLAR;
3090     break;
3091 
3092     case CHAR_DOT:
3093     *parsed_pattern++ = META_DOT;
3094     okquantifier = TRUE;
3095     break;
3096 
3097 
3098     /* ---- Single-character quantifiers ---- */
3099 
3100     case CHAR_ASTERISK:
3101     meta_quantifier = META_ASTERISK;
3102     goto CHECK_QUANTIFIER;
3103 
3104     case CHAR_PLUS:
3105     meta_quantifier = META_PLUS;
3106     goto CHECK_QUANTIFIER;
3107 
3108     case CHAR_QUESTION_MARK:
3109     meta_quantifier = META_QUERY;
3110     goto CHECK_QUANTIFIER;
3111 
3112 
3113     /* ---- Potential {n,m} quantifier ---- */
3114 
3115     case CHAR_LEFT_CURLY_BRACKET:
3116     if (!read_repeat_counts(&ptr, ptrend, &min_repeat, &max_repeat,
3117         &errorcode))
3118       {
3119       if (errorcode != 0) goto FAILED;     /* Error in quantifier. */
3120       PARSED_LITERAL(c, parsed_pattern);   /* Not a quantifier */
3121       break;                               /* No more quantifier processing */
3122       }
3123     meta_quantifier = META_MINMAX;
3124     /* Fall through */
3125 
3126 
3127     /* ---- Quantifier post-processing ---- */
3128 
3129     /* Check that a quantifier is allowed after the previous item. */
3130 
3131     CHECK_QUANTIFIER:
3132     if (!prev_okquantifier)
3133       {
3134       errorcode = ERR9;
3135       goto FAILED_BACK;
3136       }
3137 
3138     /* Now we can put the quantifier into the parsed pattern vector. At this
3139     stage, we have only the basic quantifier. The check for a following + or ?
3140     modifier happens at the top of the loop, after any intervening comments
3141     have been removed. */
3142 
3143     *parsed_pattern++ = meta_quantifier;
3144     if (c == CHAR_LEFT_CURLY_BRACKET)
3145       {
3146       *parsed_pattern++ = min_repeat;
3147       *parsed_pattern++ = max_repeat;
3148       }
3149     break;
3150 
3151 
3152     /* ---- Character class ---- */
3153 
3154     case CHAR_LEFT_SQUARE_BRACKET:
3155     okquantifier = TRUE;
3156 
3157     /* In another (POSIX) regex library, the ugly syntax [[:<:]] and [[:>:]] is
3158     used for "start of word" and "end of word". As these are otherwise illegal
3159     sequences, we don't break anything by recognizing them. They are replaced
3160     by \b(?=\w) and \b(?<=\w) respectively. Sequences like [a[:<:]] are
3161     erroneous and are handled by the normal code below. */
3162 
3163     if (ptrend - ptr >= 6 &&
3164          (PRIV(strncmp_c8)(ptr, STRING_WEIRD_STARTWORD, 6) == 0 ||
3165           PRIV(strncmp_c8)(ptr, STRING_WEIRD_ENDWORD, 6) == 0))
3166       {
3167       *parsed_pattern++ = META_ESCAPE + ESC_b;
3168 
3169       if (ptr[2] == CHAR_LESS_THAN_SIGN)
3170         {
3171         *parsed_pattern++ = META_LOOKAHEAD;
3172         }
3173       else
3174         {
3175         *parsed_pattern++ = META_LOOKBEHIND;
3176         *has_lookbehind = TRUE;
3177 
3178         /* The offset is used only for the "non-fixed length" error; this won't
3179         occur here, so just store zero. */
3180 
3181         PUTOFFSET((PCRE2_SIZE)0, parsed_pattern);
3182         }
3183 
3184       if ((options & PCRE2_UCP) == 0)
3185         *parsed_pattern++ = META_ESCAPE + ESC_w;
3186       else
3187         {
3188         *parsed_pattern++ = META_ESCAPE + ESC_p;
3189         *parsed_pattern++ = PT_WORD << 16;
3190         }
3191       *parsed_pattern++ = META_KET;
3192       ptr += 6;
3193       break;
3194       }
3195 
3196     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
3197     they are encountered at the top level, so we'll do that too. */
3198 
3199     if (ptr < ptrend && (*ptr == CHAR_COLON || *ptr == CHAR_DOT ||
3200          *ptr == CHAR_EQUALS_SIGN) &&
3201         check_posix_syntax(ptr, ptrend, &tempptr))
3202       {
3203       errorcode = (*ptr-- == CHAR_COLON)? ERR12 : ERR13;
3204       goto FAILED;
3205       }
3206 
3207     /* Process a regular character class. If the first character is '^', set
3208     the negation flag. If the first few characters (either before or after ^)
3209     are \Q\E or \E or space or tab in extended-more mode, we skip them too.
3210     This makes for compatibility with Perl. */
3211 
3212     negate_class = FALSE;
3213     while (ptr < ptrend)
3214       {
3215       GETCHARINCTEST(c, ptr);
3216       if (c == CHAR_BACKSLASH)
3217         {
3218         if (ptr < ptrend && *ptr == CHAR_E) ptr++;
3219         else if (ptrend - ptr >= 3 &&
3220              PRIV(strncmp_c8)(ptr, STR_Q STR_BACKSLASH STR_E, 3) == 0)
3221           ptr += 3;
3222         else
3223           break;
3224         }
3225       else if ((options & PCRE2_EXTENDED_MORE) != 0 &&
3226                (c == CHAR_SPACE || c == CHAR_HT))  /* Note: just these two */
3227         continue;
3228       else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
3229         negate_class = TRUE;
3230       else break;
3231       }
3232 
3233     /* Now the real contents of the class; c has the first "real" character.
3234     Empty classes are permitted only if the option is set. */
3235 
3236     if (c == CHAR_RIGHT_SQUARE_BRACKET &&
3237         (cb->external_options & PCRE2_ALLOW_EMPTY_CLASS) != 0)
3238       {
3239       *parsed_pattern++ = negate_class? META_CLASS_EMPTY_NOT : META_CLASS_EMPTY;
3240       break;  /* End of class processing */
3241       }
3242 
3243     /* Process a non-empty class. */
3244 
3245     *parsed_pattern++ = negate_class? META_CLASS_NOT : META_CLASS;
3246     class_range_state = RANGE_NO;
3247 
3248     /* In an EBCDIC environment, Perl treats alphabetic ranges specially
3249     because there are holes in the encoding, and simply using the range A-Z
3250     (for example) would include the characters in the holes. This applies only
3251     to ranges where both values are literal; [\xC1-\xE9] is different to [A-Z]
3252     in this respect. In order to accommodate this, we keep track of whether
3253     character values are literal or not, and a state variable for handling
3254     ranges. */
3255 
3256     /* Loop for the contents of the class */
3257 
3258     for (;;)
3259       {
3260       BOOL char_is_literal = TRUE;
3261 
3262       /* Inside \Q...\E everything is literal except \E */
3263 
3264       if (inescq)
3265         {
3266         if (c == CHAR_BACKSLASH && ptr < ptrend && *ptr == CHAR_E)
3267           {
3268           inescq = FALSE;                   /* Reset literal state */
3269           ptr++;                            /* Skip the 'E' */
3270           goto CLASS_CONTINUE;
3271           }
3272         goto CLASS_LITERAL;
3273         }
3274 
3275       /* Skip over space and tab (only) in extended-more mode. */
3276 
3277       if ((options & PCRE2_EXTENDED_MORE) != 0 &&
3278           (c == CHAR_SPACE || c == CHAR_HT))
3279         goto CLASS_CONTINUE;
3280 
3281       /* Handle POSIX class names. Perl allows a negation extension of the
3282       form [:^name:]. A square bracket that doesn't match the syntax is
3283       treated as a literal. We also recognize the POSIX constructions
3284       [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
3285       5.6 and 5.8 do. */
3286 
3287       if (c == CHAR_LEFT_SQUARE_BRACKET &&
3288           ptrend - ptr >= 3 &&
3289           (*ptr == CHAR_COLON || *ptr == CHAR_DOT ||
3290            *ptr == CHAR_EQUALS_SIGN) &&
3291           check_posix_syntax(ptr, ptrend, &tempptr))
3292         {
3293         BOOL posix_negate = FALSE;
3294         int posix_class;
3295 
3296         /* Perl treats a hyphen before a POSIX class as a literal, not the
3297         start of a range. However, it gives a warning in its warning mode. PCRE
3298         does not have a warning mode, so we give an error, because this is
3299         likely an error on the user's part. */
3300 
3301         if (class_range_state == RANGE_STARTED)
3302           {
3303           errorcode = ERR50;
3304           goto FAILED;
3305           }
3306 
3307         if (*ptr != CHAR_COLON)
3308           {
3309           errorcode = ERR13;
3310           goto FAILED_BACK;
3311           }
3312 
3313         if (*(++ptr) == CHAR_CIRCUMFLEX_ACCENT)
3314           {
3315           posix_negate = TRUE;
3316           ptr++;
3317           }
3318 
3319         posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
3320         if (posix_class < 0)
3321           {
3322           errorcode = ERR30;
3323           goto FAILED;
3324           }
3325         ptr = tempptr + 2;
3326 
3327         /* Perl treats a hyphen after a POSIX class as a literal, not the
3328         start of a range. However, it gives a warning in its warning mode
3329         unless the hyphen is the last character in the class. PCRE does not
3330         have a warning mode, so we give an error, because this is likely an
3331         error on the user's part. */
3332 
3333         if (ptr < ptrend - 1 && *ptr == CHAR_MINUS &&
3334             ptr[1] != CHAR_RIGHT_SQUARE_BRACKET)
3335           {
3336           errorcode = ERR50;
3337           goto FAILED;
3338           }
3339 
3340         /* Set "a hyphen is not the start of a range" for the -] case, and also
3341         in case the POSIX class is followed by \E or \Q\E (possibly repeated -
3342         fuzzers do that kind of thing) and *then* a hyphen. This causes that
3343         hyphen to be treated as a literal. I don't think it's worth setting up
3344         special apparatus to do otherwise. */
3345 
3346         class_range_state = RANGE_NO;
3347 
3348         /* When PCRE2_UCP is set, some of the POSIX classes are converted to
3349         use Unicode properties \p or \P or, in one case, \h or \H. The
3350         substitutes table has two values per class, containing the type and
3351         value of a \p or \P item. The special cases are specified with a
3352         negative type: a non-zero value causes \h or \H to be used, and a zero
3353         value falls through to behave like a non-UCP POSIX class. */
3354 
3355 #ifdef SUPPORT_UNICODE
3356         if ((options & PCRE2_UCP) != 0)
3357           {
3358           int ptype = posix_substitutes[2*posix_class];
3359           int pvalue = posix_substitutes[2*posix_class + 1];
3360           if (ptype >= 0)
3361             {
3362             *parsed_pattern++ = META_ESCAPE + (posix_negate? ESC_P : ESC_p);
3363             *parsed_pattern++ = (ptype << 16) | pvalue;
3364             goto CLASS_CONTINUE;
3365             }
3366 
3367           if (pvalue != 0)
3368             {
3369             *parsed_pattern++ = META_ESCAPE + (posix_negate? ESC_H : ESC_h);
3370             goto CLASS_CONTINUE;
3371             }
3372 
3373           /* Fall through */
3374           }
3375 #endif  /* SUPPORT_UNICODE */
3376 
3377         /* Non-UCP POSIX class */
3378 
3379         *parsed_pattern++ = posix_negate? META_POSIX_NEG : META_POSIX;
3380         *parsed_pattern++ = posix_class;
3381         }
3382 
3383       /* Handle potential start of range */
3384 
3385       else if (c == CHAR_MINUS && class_range_state >= RANGE_OK_ESCAPED)
3386         {
3387         *parsed_pattern++ = (class_range_state == RANGE_OK_LITERAL)?
3388           META_RANGE_LITERAL : META_RANGE_ESCAPED;
3389         class_range_state = RANGE_STARTED;
3390         }
3391 
3392       /* Handle a literal character */
3393 
3394       else if (c != CHAR_BACKSLASH)
3395         {
3396         CLASS_LITERAL:
3397         if (class_range_state == RANGE_STARTED)
3398           {
3399           if (c == parsed_pattern[-2])       /* Optimize one-char range */
3400             parsed_pattern--;
3401           else if (parsed_pattern[-2] > c)   /* Check range is in order */
3402             {
3403             errorcode = ERR8;
3404             goto FAILED_BACK;
3405             }
3406           else
3407             {
3408             if (!char_is_literal && parsed_pattern[-1] == META_RANGE_LITERAL)
3409               parsed_pattern[-1] = META_RANGE_ESCAPED;
3410             PARSED_LITERAL(c, parsed_pattern);
3411             }
3412           class_range_state = RANGE_NO;
3413           }
3414         else  /* Potential start of range */
3415           {
3416           class_range_state = char_is_literal?
3417             RANGE_OK_LITERAL : RANGE_OK_ESCAPED;
3418           PARSED_LITERAL(c, parsed_pattern);
3419           }
3420         }
3421 
3422       /* Handle escapes in a class */
3423 
3424       else
3425         {
3426         tempptr = ptr;
3427         escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
3428           cb->cx->extra_options, TRUE, cb);
3429 
3430         if (errorcode != 0)
3431           {
3432           if ((extra_options & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) == 0)
3433             goto FAILED;
3434           ptr = tempptr;
3435           if (ptr >= ptrend) c = CHAR_BACKSLASH; else
3436             {
3437             GETCHARINCTEST(c, ptr);   /* Get character value, increment pointer */
3438             }
3439           escape = 0;                 /* Treat as literal character */
3440           }
3441 
3442         switch(escape)
3443           {
3444           case 0:  /* Escaped character code point is in c */
3445           char_is_literal = FALSE;
3446           goto CLASS_LITERAL;
3447 
3448           case ESC_b:
3449           c = CHAR_BS;    /* \b is backspace in a class */
3450           char_is_literal = FALSE;
3451           goto CLASS_LITERAL;
3452 
3453           case ESC_Q:
3454           inescq = TRUE;  /* Enter literal mode */
3455           goto CLASS_CONTINUE;
3456 
3457           case ESC_E:     /* Ignore orphan \E */
3458           goto CLASS_CONTINUE;
3459 
3460           case ESC_B:     /* Always an error in a class */
3461           case ESC_R:
3462           case ESC_X:
3463           errorcode = ERR7;
3464           ptr--;
3465           goto FAILED;
3466           }
3467 
3468         /* The second part of a range can be a single-character escape
3469         sequence (detected above), but not any of the other escapes. Perl
3470         treats a hyphen as a literal in such circumstances. However, in Perl's
3471         warning mode, a warning is given, so PCRE now faults it, as it is
3472         almost certainly a mistake on the user's part. */
3473 
3474         if (class_range_state == RANGE_STARTED)
3475           {
3476           errorcode = ERR50;
3477           goto FAILED;  /* Not CLASS_ESCAPE_FAILED; always an error */
3478           }
3479 
3480         /* Of the remaining escapes, only those that define characters are
3481         allowed in a class. None may start a range. */
3482 
3483         class_range_state = RANGE_NO;
3484         switch(escape)
3485           {
3486           case ESC_N:
3487           errorcode = ERR71;
3488           goto FAILED;
3489 
3490           case ESC_H:
3491           case ESC_h:
3492           case ESC_V:
3493           case ESC_v:
3494           *parsed_pattern++ = META_ESCAPE + escape;
3495           break;
3496 
3497           /* These escapes are converted to Unicode property tests when
3498           PCRE2_UCP is set. */
3499 
3500           case ESC_d:
3501           case ESC_D:
3502           case ESC_s:
3503           case ESC_S:
3504           case ESC_w:
3505           case ESC_W:
3506           if ((options & PCRE2_UCP) == 0)
3507             {
3508             *parsed_pattern++ = META_ESCAPE + escape;
3509             }
3510           else
3511             {
3512             *parsed_pattern++ = META_ESCAPE +
3513               ((escape == ESC_d || escape == ESC_s || escape == ESC_w)?
3514                 ESC_p : ESC_P);
3515             switch(escape)
3516               {
3517               case ESC_d:
3518               case ESC_D:
3519               *parsed_pattern++ = (PT_PC << 16) | ucp_Nd;
3520               break;
3521 
3522               case ESC_s:
3523               case ESC_S:
3524               *parsed_pattern++ = PT_SPACE << 16;
3525               break;
3526 
3527               case ESC_w:
3528               case ESC_W:
3529               *parsed_pattern++ = PT_WORD << 16;
3530               break;
3531               }
3532             }
3533           break;
3534 
3535           /* Explicit Unicode property matching */
3536 
3537           case ESC_P:
3538           case ESC_p:
3539 #ifdef SUPPORT_UNICODE
3540             {
3541             BOOL negated;
3542             uint16_t ptype = 0, pdata = 0;
3543             if (!get_ucp(&ptr, &negated, &ptype, &pdata, &errorcode, cb))
3544               goto FAILED;
3545             if (negated) escape = (escape == ESC_P)? ESC_p : ESC_P;
3546             *parsed_pattern++ = META_ESCAPE + escape;
3547             *parsed_pattern++ = (ptype << 16) | pdata;
3548             }
3549 #else
3550           errorcode = ERR45;
3551           goto FAILED;
3552 #endif
3553           break;  /* End \P and \p */
3554 
3555           default:    /* All others are not allowed in a class */
3556           errorcode = ERR7;
3557           ptr--;
3558           goto FAILED;
3559           }
3560 
3561         /* Perl gives a warning unless a following hyphen is the last character
3562         in the class. PCRE throws an error. */
3563 
3564         if (ptr < ptrend - 1 && *ptr == CHAR_MINUS &&
3565             ptr[1] != CHAR_RIGHT_SQUARE_BRACKET)
3566           {
3567           errorcode = ERR50;
3568           goto FAILED;
3569           }
3570         }
3571 
3572       /* Proceed to next thing in the class. */
3573 
3574       CLASS_CONTINUE:
3575       if (ptr >= ptrend)
3576         {
3577         errorcode = ERR6;  /* Missing terminating ']' */
3578         goto FAILED;
3579         }
3580       GETCHARINCTEST(c, ptr);
3581       if (c == CHAR_RIGHT_SQUARE_BRACKET && !inescq) break;
3582       }     /* End of class-processing loop */
3583 
3584     if (class_range_state == RANGE_STARTED)
3585       {
3586       parsed_pattern[-1] = CHAR_MINUS;
3587       class_range_state = RANGE_NO;
3588       }
3589 
3590     *parsed_pattern++ = META_CLASS_END;
3591     break;  /* End of character class */
3592 
3593 
3594     /* ---- Opening parenthesis ---- */
3595 
3596     case CHAR_LEFT_PARENTHESIS:
3597     if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
3598 
3599     /* If ( is not followed by ? it is either a capture or a special verb or an
3600     alpha assertion. */
3601 
3602     if (*ptr != CHAR_QUESTION_MARK)
3603       {
3604       const char *vn;
3605 
3606       /* Handle capturing brackets (or non-capturing if auto-capture is turned
3607       off). */
3608 
3609       if (*ptr != CHAR_ASTERISK)
3610         {
3611         nest_depth++;
3612         if ((options & PCRE2_NO_AUTO_CAPTURE) == 0)
3613           {
3614           cb->bracount++;
3615           *parsed_pattern++ = META_CAPTURE | cb->bracount;
3616           }
3617         else *parsed_pattern++ = META_NOCAPTURE;
3618         }
3619 
3620       /* Do nothing for (* followed by end of pattern or ) so it gives a "bad
3621       quantifier" error rather than "(*MARK) must have an argument". */
3622 
3623       else if (ptrend - ptr <= 1 || (c = ptr[1]) == CHAR_RIGHT_PARENTHESIS)
3624         break;
3625 
3626       /* Handle "alpha assertions" such as (*pla:...). Most of these are
3627       synonyms for the historical symbolic assertions, but the script run ones
3628       are new. They are distinguished by starting with a lower case letter.
3629       The test below is a ctypes table lookup for ctype_lcletter, guarded by
3630       CHMAX_255(c), rather than a comparison against the ends of the alphabet. */
3631 
3632       else if (CHMAX_255(c) && (cb->ctypes[c] & ctype_lcletter) != 0)
3633         {
3634         uint32_t meta;
3635 
3636         vn = alasnames;
3637         if (!read_name(&ptr, ptrend, utf, 0, &offset, &name, &namelen,
3638           &errorcode, cb)) goto FAILED;
3639         if (ptr >= ptrend || *ptr != CHAR_COLON)
3640           {
3641           errorcode = ERR95;  /* Malformed */
3642           goto FAILED;
3643           }
3644 
3645         /* Scan the table of alpha assertion names */
3646 
3647         for (i = 0; i < alascount; i++)
3648           {
3649           if (namelen == alasmeta[i].len &&
3650               PRIV(strncmp_c8)(name, vn, namelen) == 0)
3651             break;
3652           vn += alasmeta[i].len + 1;
3653           }
3654 
3655         if (i >= alascount)
3656           {
3657           errorcode = ERR95;  /* Alpha assertion not recognized */
3658           goto FAILED;
3659           }
3660 
3661         /* Check for expecting an assertion condition. If so, only lookaround
3662         assertions are valid. */
3663 
3664         meta = alasmeta[i].meta;
3665         if (prev_expect_cond_assert > 0 &&
3666             (meta < META_LOOKAHEAD || meta > META_LOOKBEHINDNOT))
3667           {
3668           errorcode = ERR28;  /* Assertion expected */
3669           goto FAILED;
3670           }
3671 
3672         /* The lookaround alphabetic synonyms can be almost entirely handled by
3673         jumping to the code that handles the traditional symbolic forms. */
3674 
3675         switch(meta)
3676           {
3677           default:
3678           errorcode = ERR89;  /* Unknown code; should never occur because */
3679           goto FAILED;        /* the meta values come from a table above. */
3680 
3681           case META_ATOMIC:
3682           goto ATOMIC_GROUP;
3683 
3684           case META_LOOKAHEAD:
3685           goto POSITIVE_LOOK_AHEAD;
3686 
3687           case META_LOOKAHEADNOT:
3688           goto NEGATIVE_LOOK_AHEAD;
3689 
3690           case META_LOOKBEHIND:
3691           case META_LOOKBEHINDNOT:
3692           *parsed_pattern++ = meta;
3693           ptr--;
3694           goto POST_LOOKBEHIND;
3695 
3696           /* The script run facilities are handled here. Unicode support is
3697           required (give an error if not, as this is a security issue). Always
3698           record a META_SCRIPT_RUN item. Then, for the atomic version, insert
3699           META_ATOMIC and remember that we need two META_KETs at the end. */
3700 
3701           case META_SCRIPT_RUN:
3702           case META_ATOMIC_SCRIPT_RUN:
3703 #ifdef SUPPORT_UNICODE
3704           *parsed_pattern++ = META_SCRIPT_RUN;
3705           nest_depth++;
3706           ptr++;
3707           if (meta == META_ATOMIC_SCRIPT_RUN)
3708             {
3709             *parsed_pattern++ = META_ATOMIC;
3710             if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
3711             else if (++top_nest >= end_nests)
3712               {
3713               errorcode = ERR84;
3714               goto FAILED;
3715               }
3716             top_nest->nest_depth = nest_depth;
3717             top_nest->flags = NSF_ATOMICSR;
3718             top_nest->options = options & PARSE_TRACKED_OPTIONS;
3719             }
3720           break;
3721 #else  /* SUPPORT_UNICODE */
3722           errorcode = ERR96;
3723           goto FAILED;
3724 #endif
3725           }
3726         }
3727 
3728 
3729       /* ---- Handle (*VERB) and (*VERB:NAME) ---- */
3730 
3731       else
3732         {
3733         vn = verbnames;
3734         if (!read_name(&ptr, ptrend, utf, 0, &offset, &name, &namelen,
3735           &errorcode, cb)) goto FAILED;
3736         if (ptr >= ptrend || (*ptr != CHAR_COLON &&
3737                               *ptr != CHAR_RIGHT_PARENTHESIS))
3738           {
3739           errorcode = ERR60;  /* Malformed */
3740           goto FAILED;
3741           }
3742 
3743         /* Scan the table of verb names */
3744 
3745         for (i = 0; i < verbcount; i++)
3746           {
3747           if (namelen == verbs[i].len &&
3748               PRIV(strncmp_c8)(name, vn, namelen) == 0)
3749             break;
3750           vn += verbs[i].len + 1;
3751           }
3752 
3753         if (i >= verbcount)
3754           {
3755           errorcode = ERR60;  /* Verb not recognized */
3756           goto FAILED;
3757           }
3758 
3759         /* An empty argument is treated as no argument. */
3760 
3761         if (*ptr == CHAR_COLON && ptr + 1 < ptrend &&
3762              ptr[1] == CHAR_RIGHT_PARENTHESIS)
3763           ptr++;    /* Advance to the closing parens */
3764 
3765         /* Check for mandatory non-empty argument; this is (*MARK) */
3766 
3767         if (verbs[i].has_arg > 0 && *ptr != CHAR_COLON)
3768           {
3769           errorcode = ERR66;
3770           goto FAILED;
3771           }
3772 
3773         /* It appears that Perl allows any characters whatsoever, other than a
3774         closing parenthesis, to appear in arguments ("names"), so we no longer
3775         insist on letters, digits, and underscores. Perl does not, however, do
3776         any interpretation within arguments, and has no means of including a
3777         closing parenthesis. PCRE supports escape processing but only when it
3778         is requested by an option. We set inverbname TRUE here, and let the
3779         main loop take care of this so that escape and \x processing is done by
3780         the main code above. */
3781 
3782         if (*ptr++ == CHAR_COLON)   /* Skip past : or ) */
3783           {
3784           /* Some optional arguments can be treated as a preceding (*MARK) */
3785 
3786           if (verbs[i].has_arg < 0)
3787             {
3788             add_after_mark = verbs[i].meta;
3789             *parsed_pattern++ = META_MARK;
3790             }
3791 
3792           /* The remaining verbs with arguments (except *MARK) need a different
3793           opcode. */
3794 
3795           else
3796             {
3797             *parsed_pattern++ = verbs[i].meta +
3798               ((verbs[i].meta != META_MARK)? 0x00010000u:0);
3799             }
3800 
3801           /* Set up for reading the name in the main loop. */
3802 
3803           verblengthptr = parsed_pattern++;
3804           verbnamestart = ptr;
3805           inverbname = TRUE;
3806           }
3807         else  /* No verb "name" argument */
3808           {
3809           *parsed_pattern++ = verbs[i].meta;
3810           }
3811         }     /* End of (*VERB) handling */
3812       break;  /* Done with this parenthesis */
3813       }       /* End of groups that don't start with (? */
3814 
3815 
3816     /* ---- Items starting (? ---- */
3817 
3818     /* The type of item is determined by what follows (?. Handle (?| and option
3819     changes under "default" because both need a new block on the nest stack.
3820     Comments starting with (?# are handled above. Note that there is some
3821     ambiguity about the sequence (?- because if a digit follows it's a relative
3822     recursion or subroutine call whereas otherwise it's an option unsetting. */
3823 
3824     if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
3825 
3826     switch(*ptr)
3827       {
3828       default:
3829       if (*ptr == CHAR_MINUS && ptrend - ptr > 1 && IS_DIGIT(ptr[1]))
3830         goto RECURSION_BYNUMBER;  /* The + case is handled by CHAR_PLUS */
3831 
3832       /* We now have either (?| or a (possibly empty) option setting,
3833       optionally followed by a non-capturing group. */
3834 
3835       nest_depth++;
3836       if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
3837       else if (++top_nest >= end_nests)
3838         {
3839         errorcode = ERR84;
3840         goto FAILED;
3841         }
3842       top_nest->nest_depth = nest_depth;
3843       top_nest->flags = 0;
3844       top_nest->options = options & PARSE_TRACKED_OPTIONS;
3845 
3846       /* Start of non-capturing group that resets the capture count for each
3847       branch. */
3848 
3849       if (*ptr == CHAR_VERTICAL_LINE)
3850         {
3851         top_nest->reset_group = (uint16_t)cb->bracount;
3852         top_nest->max_group = (uint16_t)cb->bracount;
3853         top_nest->flags |= NSF_RESET;
3854         cb->external_flags |= PCRE2_DUPCAPUSED;
3855         *parsed_pattern++ = META_NOCAPTURE;
3856         ptr++;
3857         }
3858 
3859       /* Scan for options imnsxJU to be set or unset. */
3860 
3861       else
3862         {
3863         BOOL hyphenok = TRUE;
3864         uint32_t oldoptions = options;
3865 
3866         top_nest->reset_group = 0;
3867         top_nest->max_group = 0;
3868         set = unset = 0;
3869         optset = &set;
3870 
3871         /* ^ at the start unsets imnsx and disables the subsequent use of - */
3872 
3873         if (ptr < ptrend && *ptr == CHAR_CIRCUMFLEX_ACCENT)
3874           {
3875           options &= ~(PCRE2_CASELESS|PCRE2_MULTILINE|PCRE2_NO_AUTO_CAPTURE|
3876                        PCRE2_DOTALL|PCRE2_EXTENDED|PCRE2_EXTENDED_MORE);
3877           hyphenok = FALSE;
3878           ptr++;
3879           }
3880 
3881         while (ptr < ptrend && *ptr != CHAR_RIGHT_PARENTHESIS &&
3882                                *ptr != CHAR_COLON)
3883           {
3884           switch (*ptr++)
3885             {
3886             case CHAR_MINUS:
3887             if (!hyphenok)
3888               {
3889               errorcode = ERR94;
3890               ptr--;  /* Correct the offset */
3891               goto FAILED;
3892               }
3893             optset = &unset;
3894             hyphenok = FALSE;
3895             break;
3896 
3897             case CHAR_J:  /* Record that it changed in the external options */
3898             *optset |= PCRE2_DUPNAMES;
3899             cb->external_flags |= PCRE2_JCHANGED;
3900             break;
3901 
3902             case CHAR_i: *optset |= PCRE2_CASELESS; break;
3903             case CHAR_m: *optset |= PCRE2_MULTILINE; break;
3904             case CHAR_n: *optset |= PCRE2_NO_AUTO_CAPTURE; break;
3905             case CHAR_s: *optset |= PCRE2_DOTALL; break;
3906             case CHAR_U: *optset |= PCRE2_UNGREEDY; break;
3907 
3908             /* If x is immediately followed by another x, PCRE2_EXTENDED_MORE is
3908             added to the same set/unset mask as PCRE2_EXTENDED. */
3909 
3910             case CHAR_x:
3911             *optset |= PCRE2_EXTENDED;
3912             if (ptr < ptrend && *ptr == CHAR_x)
3913               {
3914               *optset |= PCRE2_EXTENDED_MORE;
3915               ptr++;
3916               }
3917             break;
3918 
3919             default:
3920             errorcode = ERR11;
3921             ptr--;    /* Correct the offset */
3922             goto FAILED;
3923             }
3924           }
3925 
3926         /* If we are setting extended without extended-more, ensure that any
3927         existing extended-more gets unset. Also, unsetting extended must also
3928         unset extended-more. */
3929 
3930         if ((set & (PCRE2_EXTENDED|PCRE2_EXTENDED_MORE)) == PCRE2_EXTENDED ||
3931             (unset & PCRE2_EXTENDED) != 0)
3932           unset |= PCRE2_EXTENDED_MORE;
3933 
3934         options = (options | set) & (~unset);
3935 
3936         /* If the options ended with ')' this is not the start of a nested
3937         group with option changes, so the options change at this level.
3938         In this case, if the previous level set up a nest block, discard the
3939         one we have just created. Otherwise adjust it for the previous level.
3940         If the options ended with ':' we are starting a non-capturing group,
3941         possibly with an options setting. */
3942 
3943         if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
3944         if (*ptr++ == CHAR_RIGHT_PARENTHESIS)
3945           {
3946           nest_depth--;  /* This is not a nested group after all. */
3947           if (top_nest > (nest_save *)(cb->start_workspace) &&
3948               (top_nest-1)->nest_depth == nest_depth) top_nest--;
3949           else top_nest->nest_depth = nest_depth;
3950           }
3951         else *parsed_pattern++ = META_NOCAPTURE;
3952 
3953         /* If nothing changed, no need to record. */
3954 
3955         if (options != oldoptions)
3956           {
3957           *parsed_pattern++ = META_OPTIONS;
3958           *parsed_pattern++ = options;
3959           }
3960         }     /* End options processing */
3961       break;  /* End default case after (? */
3962 
3963 
3964       /* ---- Python syntax support ---- */
3965 
3966       case CHAR_P:
3967       if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
3968 
3969       /* (?P<name> is the same as (?<name>, which defines a named group. */
3970 
3971       if (*ptr == CHAR_LESS_THAN_SIGN)
3972         {
3973         terminator = CHAR_GREATER_THAN_SIGN;
3974         goto DEFINE_NAME;
3975         }
3976 
3977       /* (?P>name) is the same as (?&name), which is a recursion or subroutine
3978       call. */
3979 
3980       if (*ptr == CHAR_GREATER_THAN_SIGN) goto RECURSE_BY_NAME;
3981 
3982       /* (?P=name) is the same as \k<name>, a back reference by name. Anything
3983       else after (?P is an error. */
3984 
3985       if (*ptr != CHAR_EQUALS_SIGN)
3986         {
3987         errorcode = ERR41;
3988         goto FAILED;
3989         }
3990       if (!read_name(&ptr, ptrend, utf, CHAR_RIGHT_PARENTHESIS, &offset, &name,
3991           &namelen, &errorcode, cb)) goto FAILED;
3992       *parsed_pattern++ = META_BACKREF_BYNAME;
3993       *parsed_pattern++ = namelen;
3994       PUTOFFSET(offset, parsed_pattern);
3995       okquantifier = TRUE;
3996       break;   /* End of (?P processing */
3997 
3998 
3999       /* ---- Recursion/subroutine calls by number ---- */
4000 
4001       case CHAR_R:
4002       i = 0;         /* (?R) == (?R0) */
4003       ptr++;
4004       if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4005         {
4006         errorcode = ERR58;
4007         goto FAILED;
4008         }
4009       goto SET_RECURSION;
4010 
4011       /* An item starting (?- followed by a digit comes here via the "default"
4012       case because (?- followed by a non-digit is an options setting. */
4013 
4014       case CHAR_PLUS:
4015       if (ptrend - ptr < 2 || !IS_DIGIT(ptr[1]))
4016         {
4017         errorcode = ERR29;   /* Missing number */
4018         goto FAILED;
4019         }
4020       /* Fall through */
4021 
4022       case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
4023       case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
4024       RECURSION_BYNUMBER:
4025       if (!read_number(&ptr, ptrend,
4026           (IS_DIGIT(*ptr))? -1:(int)(cb->bracount), /* + and - are relative */
4027           MAX_GROUP_NUMBER, ERR61,
4028           &i, &errorcode)) goto FAILED;
4029       if (i < 0)  /* NB (?0) is permitted */
4030         {
4031         errorcode = ERR15;   /* Unknown group */
4032         goto FAILED_BACK;
4033         }
4034       if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4035         goto UNCLOSED_PARENTHESIS;
4036 
4037       SET_RECURSION:
4038       *parsed_pattern++ = META_RECURSE | (uint32_t)i;
4039       offset = (PCRE2_SIZE)(ptr - cb->start_pattern);
4040       ptr++;
4041       PUTOFFSET(offset, parsed_pattern);
4042       okquantifier = TRUE;
4043       break;  /* End of recursive call by number handling */
4044 
4045 
4046       /* ---- Recursion/subroutine calls by name ---- */
4047 
4048       case CHAR_AMPERSAND:
4049       RECURSE_BY_NAME:
4050       if (!read_name(&ptr, ptrend, utf, CHAR_RIGHT_PARENTHESIS, &offset, &name,
4051           &namelen, &errorcode, cb)) goto FAILED;
4052       *parsed_pattern++ = META_RECURSE_BYNAME;
4053       *parsed_pattern++ = namelen;
4054       PUTOFFSET(offset, parsed_pattern);
4055       okquantifier = TRUE;
4056       break;
4057 
4058       /* ---- Callout with numerical or string argument ---- */
4059 
4060       case CHAR_C:
4061       if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4062 
4063       /* If the previous item was a condition starting (?(? an assertion,
4064       optionally preceded by a callout, is expected. This is checked later on,
4065       during actual compilation. However we need to identify this kind of
4066       assertion in this pass because it must not be qualified. The value of
4067       expect_cond_assert is set to 2 after (?(? is processed. We decrement it
4068       for a callout - still leaving a positive value that identifies the
4069       assertion. Multiple callouts or any other items will make it zero or
4070       less, which doesn't matter because they will cause an error later. */
4071 
4072       expect_cond_assert = prev_expect_cond_assert - 1;
4073 
4074       /* If previous_callout is not NULL, it means this follows a previous
4075       callout. If it was a manual callout, do nothing; this means its "length
4076       of next pattern item" field will remain zero. If it was an automatic
4077       callout, abolish it. */
4078 
4079       if (previous_callout != NULL && (options & PCRE2_AUTO_CALLOUT) != 0 &&
4080           previous_callout == parsed_pattern - 4 &&
4081           parsed_pattern[-1] == 255)
4082         parsed_pattern = previous_callout;
4083 
4084       /* Save for updating next pattern item length, and skip one item before
4085       completing. */
4086 
4087       previous_callout = parsed_pattern;
4088       after_manual_callout = 1;
4089 
4090       /* Handle a string argument; specific delimiter is required. */
4091 
4092       if (*ptr != CHAR_RIGHT_PARENTHESIS && !IS_DIGIT(*ptr))
4093         {
4094         PCRE2_SIZE calloutlength;
4095         PCRE2_SPTR startptr = ptr;
4096 
4097         delimiter = 0;
4098         for (i = 0; PRIV(callout_start_delims)[i] != 0; i++)
4099           {
4100           if (*ptr == PRIV(callout_start_delims)[i])
4101             {
4102             delimiter = PRIV(callout_end_delims)[i];
4103             break;
4104             }
4105           }
4106         if (delimiter == 0)
4107           {
4108           errorcode = ERR82;
4109           goto FAILED;
4110           }
4111 
4112         *parsed_pattern = META_CALLOUT_STRING;
4113         parsed_pattern += 3;   /* Skip pattern info */
4114 
4115         for (;;)
4116           {
4117           if (++ptr >= ptrend)
4118             {
4119             errorcode = ERR81;
4120             ptr = startptr;   /* To give a more useful message */
4121             goto FAILED;
4122             }
4123           if (*ptr == delimiter && (++ptr >= ptrend || *ptr != delimiter))
4124             break;
4125           }
4126 
4127         calloutlength = (PCRE2_SIZE)(ptr - startptr);
4128         if (calloutlength > UINT32_MAX)
4129           {
4130           errorcode = ERR72;
4131           goto FAILED;
4132           }
4133         *parsed_pattern++ = (uint32_t)calloutlength;
4134         offset = (PCRE2_SIZE)(startptr - cb->start_pattern);
4135         PUTOFFSET(offset, parsed_pattern);
4136         }
4137 
4138       /* Handle a callout with an optional numerical argument, which must be
4139       less than or equal to 255. A missing argument gives 0. */
4140 
4141       else
4142         {
4143         int n = 0;
4144         *parsed_pattern = META_CALLOUT_NUMBER;     /* Numerical callout */
4145         parsed_pattern += 3;                       /* Skip pattern info */
4146         while (ptr < ptrend && IS_DIGIT(*ptr))
4147           {
4148           n = n * 10 + *ptr++ - CHAR_0;
4149           if (n > 255)
4150             {
4151             errorcode = ERR38;
4152             goto FAILED;
4153             }
4154           }
4155         *parsed_pattern++ = n;
4156         }
4157 
4158       /* Both formats must have a closing parenthesis */
4159 
4160       if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4161         {
4162         errorcode = ERR39;
4163         goto FAILED;
4164         }
4165       ptr++;
4166 
4167       /* Remember the offset to the next item in the pattern, and set a default
4168       length. This should get updated after the next item is read. */
4169 
4170       previous_callout[1] = (uint32_t)(ptr - cb->start_pattern);
4171       previous_callout[2] = 0;
4172       break;                  /* End callout */
4173 
4174 
4175       /* ---- Conditional group ---- */
4176 
4177       /* A condition can be an assertion, a number (referring to a numbered
4178       group's having been set), a name (referring to a named group), or 'R',
4179       referring to overall recursion. R<digits> and R&name are also permitted
4180       for recursion state tests. Numbers may be preceded by + or - to specify a
4181       relative group number.
4182 
4183       There are several syntaxes for testing a named group: (?(name)) is used
4184       by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
4185 
4186       There are two unfortunate ambiguities. 'R' can be the recursive thing or
4187       the name 'R' (and similarly for 'R' followed by digits). 'DEFINE' can be
4188       the Perl DEFINE feature or the Python named test. We look for a name
4189       first; if not found, we try the other case.
4190 
4191       For compatibility with auto-callouts, we allow a callout to be specified
4192       before a condition that is an assertion. */
4193 
4194       case CHAR_LEFT_PARENTHESIS:
4195       if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4196       nest_depth++;
4197 
4198       /* If the next character is ? or * there must be an assertion next
4199       (optionally preceded by a callout). We do not check this here, but
4200       instead we set expect_cond_assert to 2. If this is still greater than
4201       zero (callouts decrement it) when the next assertion is read, it will be
4202       marked as a condition that must not be repeated. A value greater than
4203       zero also causes checking that an assertion (possibly with callout)
4204       follows. */
4205 
4206       if (*ptr == CHAR_QUESTION_MARK || *ptr == CHAR_ASTERISK)
4207         {
4208         *parsed_pattern++ = META_COND_ASSERT;
4209         ptr--;   /* Pull pointer back to the opening parenthesis. */
4210         expect_cond_assert = 2;
4211         break;  /* End of conditional */
4212         }
4213 
4214       /* Handle (?([+-]number)... */
4215 
4216       if (read_number(&ptr, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &i,
4217           &errorcode))
4218         {
4219         if (i <= 0)
4220           {
4221           errorcode = ERR15;
4222           goto FAILED;
4223           }
4224         *parsed_pattern++ = META_COND_NUMBER;
4225         offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 2);
4226         PUTOFFSET(offset, parsed_pattern);
4227         *parsed_pattern++ = i;
4228         }
4229       else if (errorcode != 0) goto FAILED;   /* Number too big */
4230 
4231       /* No number found. Handle the special case (?(VERSION[>]=n.m)... */
4232 
4233       else if (ptrend - ptr >= 10 &&
4234                PRIV(strncmp_c8)(ptr, STRING_VERSION, 7) == 0 &&
4235                ptr[7] != CHAR_RIGHT_PARENTHESIS)
4236         {
4237         uint32_t ge = 0;
4238         int major = 0;
4239         int minor = 0;
4240 
4241         ptr += 7;
4242         if (*ptr == CHAR_GREATER_THAN_SIGN)
4243           {
4244           ge = 1;
4245           ptr++;
4246           }
4247 
4248         /* NOTE: cannot write IS_DIGIT(*(++ptr)) here because IS_DIGIT
4249         references its argument twice. */
4250 
4251         if (*ptr != CHAR_EQUALS_SIGN || (ptr++, !IS_DIGIT(*ptr)))
4252           goto BAD_VERSION_CONDITION;
4253 
4254         if (!read_number(&ptr, ptrend, -1, 1000, ERR79, &major, &errorcode))
4255           goto FAILED;
4256 
4257         if (ptr >= ptrend) goto BAD_VERSION_CONDITION;
4258         if (*ptr == CHAR_DOT)
4259           {
4260           if (++ptr >= ptrend || !IS_DIGIT(*ptr)) goto BAD_VERSION_CONDITION;
4261           minor = (*ptr++ - CHAR_0) * 10;
4262           if (IS_DIGIT(*ptr)) minor += *ptr++ - CHAR_0;
4263           if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4264             goto BAD_VERSION_CONDITION;
4265           }
4266 
4267         *parsed_pattern++ = META_COND_VERSION;
4268         *parsed_pattern++ = ge;
4269         *parsed_pattern++ = major;
4270         *parsed_pattern++ = minor;
4271         }
4272 
4273       /* All the remaining cases now require us to read a name. We cannot at
4274       this stage distinguish ambiguous cases such as (?(R12) which might be a
4275       recursion test by number or a name, because the named groups have not yet
4276       all been identified. Those cases are treated as names, but given a
4277       different META code. */
4278 
4279       else
4280         {
4281         BOOL was_r_ampersand = FALSE;
4282 
4283         if (*ptr == CHAR_R && ptrend - ptr > 1 && ptr[1] == CHAR_AMPERSAND)
4284           {
4285           terminator = CHAR_RIGHT_PARENTHESIS;
4286           was_r_ampersand = TRUE;
4287           ptr++;
4288           }
4289         else if (*ptr == CHAR_LESS_THAN_SIGN)
4290           terminator = CHAR_GREATER_THAN_SIGN;
4291         else if (*ptr == CHAR_APOSTROPHE)
4292           terminator = CHAR_APOSTROPHE;
4293         else
4294           {
4295           terminator = CHAR_RIGHT_PARENTHESIS;
4296           ptr--;   /* Point to char before name */
4297           }
4298         if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,
4299             &errorcode, cb)) goto FAILED;
4300 
4301         /* Handle (?(R&name) */
4302 
4303         if (was_r_ampersand)
4304           {
4305           *parsed_pattern = META_COND_RNAME;
4306           ptr--;   /* Back to closing parens */
4307           }
4308 
4309         /* Handle (?(name). If the name is "DEFINE" we identify it with a
4310         special code. Likewise if the name consists of R followed only by
4311         digits. Otherwise, handle it like a quoted name. */
4312 
4313         else if (terminator == CHAR_RIGHT_PARENTHESIS)
4314           {
4315           if (namelen == 6 && PRIV(strncmp_c8)(name, STRING_DEFINE, 6) == 0)
4316             *parsed_pattern = META_COND_DEFINE;
4317           else
4318             {
4319             for (i = 1; i < (int)namelen; i++)
4320               if (!IS_DIGIT(name[i])) break;
4321             *parsed_pattern = (*name == CHAR_R && i >= (int)namelen)?
4322               META_COND_RNUMBER : META_COND_NAME;
4323             }
4324           ptr--;   /* Back to closing parens */
4325           }
4326 
4327         /* Handle (?('name') or (?(<name>) */
4328 
4329         else *parsed_pattern = META_COND_NAME;
4330 
4331         /* All these cases except DEFINE end with the name length and offset;
4332         DEFINE just has an offset (for the "too many branches" error). */
4333 
4334         if (*parsed_pattern++ != META_COND_DEFINE) *parsed_pattern++ = namelen;
4335         PUTOFFSET(offset, parsed_pattern);
4336         }  /* End cases that read a name */
4337 
4338       /* Check the closing parenthesis of the condition */
4339 
4340       if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4341         {
4342         errorcode = ERR24;
4343         goto FAILED;
4344         }
4345       ptr++;
4346       break;  /* End of condition processing */
4347 
4348 
4349       /* ---- Atomic group ---- */
4350 
4351       case CHAR_GREATER_THAN_SIGN:
4352       ATOMIC_GROUP:                          /* Come from (*atomic: */
4353       *parsed_pattern++ = META_ATOMIC;
4354       nest_depth++;
4355       ptr++;
4356       break;
4357 
4358 
4359       /* ---- Lookahead assertions ---- */
4360 
4361       case CHAR_EQUALS_SIGN:
4362       POSITIVE_LOOK_AHEAD:                   /* Come from (*pla: */
4363       *parsed_pattern++ = META_LOOKAHEAD;
4364       ptr++;
4365       goto POST_ASSERTION;
4366 
4367       case CHAR_EXCLAMATION_MARK:
4368       NEGATIVE_LOOK_AHEAD:                   /* Come from (*nla: */
4369       *parsed_pattern++ = META_LOOKAHEADNOT;
4370       ptr++;
4371       goto POST_ASSERTION;
4372 
4373 
4374       /* ---- Lookbehind assertions ---- */
4375 
4376       /* (?< followed by = or ! is a lookbehind assertion. Otherwise (?< is the
4377       start of the name of a capturing group. */
4378 
4379       case CHAR_LESS_THAN_SIGN:
4380       if (ptrend - ptr <= 1 ||
4381          (ptr[1] != CHAR_EQUALS_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK))
4382         {
4383         terminator = CHAR_GREATER_THAN_SIGN;
4384         goto DEFINE_NAME;
4385         }
4386       *parsed_pattern++ = (ptr[1] == CHAR_EQUALS_SIGN)?
4387         META_LOOKBEHIND : META_LOOKBEHINDNOT;
4388 
4389       POST_LOOKBEHIND:              /* Come from (*plb: and (*nlb: */
4390       *has_lookbehind = TRUE;
4391       offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 2);
4392       PUTOFFSET(offset, parsed_pattern);
4393       ptr += 2;
4394       /* Fall through */
4395 
4396       /* If the previous item was a condition starting (?(? an assertion,
4397       optionally preceded by a callout, is expected. This is checked later on,
4398       during actual compilation. However we need to identify this kind of
4399       assertion in this pass because it must not be qualified. The value of
4400       expect_cond_assert is set to 2 after (?(? is processed. We decrement it
4401       for a callout - still leaving a positive value that identifies the
4402       assertion. Multiple callouts or any other items will make it zero or
4403       less, which doesn't matter because they will cause an error later. */
4404 
4405       POST_ASSERTION:
4406       nest_depth++;
4407       if (prev_expect_cond_assert > 0)
4408         {
4409         if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
4410         else if (++top_nest >= end_nests)
4411           {
4412           errorcode = ERR84;
4413           goto FAILED;
4414           }
4415         top_nest->nest_depth = nest_depth;
4416         top_nest->flags = NSF_CONDASSERT;
4417         top_nest->options = options & PARSE_TRACKED_OPTIONS;
4418         }
4419       break;
4420 
4421 
4422       /* ---- Define a named group ---- */
4423 
4424       /* A named group may be defined as (?'name') or (?<name>). In the latter
4425       case we jump to DEFINE_NAME from the disambiguation of (?< above with the
4426       terminator set to '>'. */
4427 
4428       case CHAR_APOSTROPHE:
4429       terminator = CHAR_APOSTROPHE;    /* Terminator */
4430 
4431       DEFINE_NAME:
4432       if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,
4433           &errorcode, cb)) goto FAILED;
4434 
4435       /* We have a name for this capturing group. It is also assigned a number,
4436       which is its primary means of identification. */
4437 
4438       cb->bracount++;
4439       *parsed_pattern++ = META_CAPTURE | cb->bracount;
4440       nest_depth++;
4441 
4442       /* Check not too many names */
4443 
4444       if (cb->names_found >= MAX_NAME_COUNT)
4445         {
4446         errorcode = ERR49;
4447         goto FAILED;
4448         }
4449 
4450       /* Adjust the entry size to accommodate the longest name found. */
4451 
4452       if (namelen + IMM2_SIZE + 1 > cb->name_entry_size)
4453         cb->name_entry_size = (uint16_t)(namelen + IMM2_SIZE + 1);
4454 
4455       /* Scan the list to check for duplicates. For duplicate names, if the
4456       number is the same, break the loop, which causes the name to be
4457       discarded; otherwise, if DUPNAMES is not set, give an error.
4458       If it is set, allow the name with a different number, but continue
4459       scanning in case this is a duplicate with the same number. For
4460       non-duplicate names, give an error if the number is duplicated. */
4461 
4462       isdupname = FALSE;
4463       ng = cb->named_groups;
4464       for (i = 0; i < cb->names_found; i++, ng++)
4465         {
4466         if (namelen == ng->length &&
4467             PRIV(strncmp)(name, ng->name, (PCRE2_SIZE)namelen) == 0)
4468           {
4469           if (ng->number == cb->bracount) break;
4470           if ((options & PCRE2_DUPNAMES) == 0)
4471             {
4472             errorcode = ERR43;
4473             goto FAILED;
4474             }
4475           isdupname = ng->isdup = TRUE;     /* Mark as a duplicate */
4476           cb->dupnames = TRUE;              /* Duplicate names exist */
4477           }
4478         else if (ng->number == cb->bracount)
4479           {
4480           errorcode = ERR65;
4481           goto FAILED;
4482           }
4483         }
4484 
4485       if (i < cb->names_found) break;   /* Ignore duplicate with same number */
4486 
4487       /* Increase the list size if necessary */
4488 
4489       if (cb->names_found >= cb->named_group_list_size)
4490         {
4491         uint32_t newsize = cb->named_group_list_size * 2;
4492         named_group *newspace =
4493           cb->cx->memctl.malloc(newsize * sizeof(named_group),
4494           cb->cx->memctl.memory_data);
4495         if (newspace == NULL)
4496           {
4497           errorcode = ERR21;
4498           goto FAILED;
4499           }
4500 
4501         memcpy(newspace, cb->named_groups,
4502           cb->named_group_list_size * sizeof(named_group));
4503         if (cb->named_group_list_size > NAMED_GROUP_LIST_SIZE)
4504           cb->cx->memctl.free((void *)cb->named_groups,
4505           cb->cx->memctl.memory_data);
4506         cb->named_groups = newspace;
4507         cb->named_group_list_size = newsize;
4508         }
4509 
4510       /* Add this name to the list */
4511 
4512       cb->named_groups[cb->names_found].name = name;
4513       cb->named_groups[cb->names_found].length = (uint16_t)namelen;
4514       cb->named_groups[cb->names_found].number = cb->bracount;
4515       cb->named_groups[cb->names_found].isdup = (uint16_t)isdupname;
4516       cb->names_found++;
4517       break;
4518       }        /* End of (? switch */
4519     break;     /* End of ( handling */
4520 
4521 
4522     /* ---- Branch terminators ---- */
4523 
4524     /* Alternation: reset the capture count if we are in a (?| group. */
4525 
4526     case CHAR_VERTICAL_LINE:
4527     if (top_nest != NULL && top_nest->nest_depth == nest_depth &&
4528         (top_nest->flags & NSF_RESET) != 0)
4529       {
4530       if (cb->bracount > top_nest->max_group)
4531         top_nest->max_group = (uint16_t)cb->bracount;
4532       cb->bracount = top_nest->reset_group;
4533       }
4534     *parsed_pattern++ = META_ALT;
4535     break;
4536 
4537     /* End of group; reset the capture count to the maximum if we are in a (?|
4538     group and/or reset the options that are tracked during parsing. Disallow
4539     quantifier for a condition that is an assertion. */
4540 
4541     case CHAR_RIGHT_PARENTHESIS:
4542     okquantifier = TRUE;
4543     if (top_nest != NULL && top_nest->nest_depth == nest_depth)
4544       {
4545       options = (options & ~PARSE_TRACKED_OPTIONS) | top_nest->options;
4546       if ((top_nest->flags & NSF_RESET) != 0 &&
4547           top_nest->max_group > cb->bracount)
4548         cb->bracount = top_nest->max_group;
4549       if ((top_nest->flags & NSF_CONDASSERT) != 0)
4550         okquantifier = FALSE;
4551 
4552       if ((top_nest->flags & NSF_ATOMICSR) != 0)
4553         {
4554         *parsed_pattern++ = META_KET;
4555         }
4556 
4557 
4558 
4559       if (top_nest == (nest_save *)(cb->start_workspace)) top_nest = NULL;
4560         else top_nest--;
4561       }
4562     if (nest_depth == 0)    /* Unmatched closing parenthesis */
4563       {
4564       errorcode = ERR22;
4565       goto FAILED_BACK;
4566       }
4567     nest_depth--;
4568     *parsed_pattern++ = META_KET;
4569     break;
4570     }  /* End of switch on pattern character */
4571   }    /* End of main character scan loop */
4572 
4573 /* End of pattern reached. Check for missing ) at the end of a verb name. */
4574 
4575 if (inverbname && ptr >= ptrend)
4576   {
4577   errorcode = ERR60;
4578   goto FAILED;
4579   }
4580 
4581 /* Manage callout for the final item */
4582 
4583 PARSED_END:
4584 parsed_pattern = manage_callouts(ptr, &previous_callout, auto_callout,
4585   parsed_pattern, cb);
4586 
4587 /* Insert trailing items for word and line matching (features provided for the
4588 benefit of pcre2grep). */
4589 
4590 if ((extra_options & PCRE2_EXTRA_MATCH_LINE) != 0)
4591   {
4592   *parsed_pattern++ = META_KET;
4593   *parsed_pattern++ = META_DOLLAR;
4594   }
4595 else if ((extra_options & PCRE2_EXTRA_MATCH_WORD) != 0)
4596   {
4597   *parsed_pattern++ = META_KET;
4598   *parsed_pattern++ = META_ESCAPE + ESC_b;
4599   }
4600 
4601 /* Terminate the parsed pattern, then return success if all groups are closed.
4602 Otherwise we have unclosed parentheses. */
4603 
4604 if (parsed_pattern >= parsed_pattern_end)
4605   {
4606   errorcode = ERR63;  /* Internal error (parsed pattern overflow) */
4607   goto FAILED;
4608   }
4609 
4610 *parsed_pattern = META_END;
4611 if (nest_depth == 0) return 0;
4612 
4613 UNCLOSED_PARENTHESIS:
4614 errorcode = ERR14;
4615 
4616 /* Come here for all failures. */
4617 
4618 FAILED:
4619 cb->erroroffset = (PCRE2_SIZE)(ptr - cb->start_pattern);
4620 return errorcode;
4621 
4622 /* Some errors need to indicate the previous character. */
4623 
4624 FAILED_BACK:
4625 ptr--;
4626 goto FAILED;
4627 
4628 /* This failure happens several times. */
4629 
4630 BAD_VERSION_CONDITION:
4631 errorcode = ERR79;
4632 goto FAILED;
4633 }
4634 
4635 
4636 
4637 /*************************************************
4638 *       Find first significant opcode            *
4639 *************************************************/
4640 
4641 /* This is called by several functions that scan a compiled expression looking
4642 for a fixed first character, or an anchoring opcode etc. It skips over things
4643 that do not influence this. For some calls, it makes sense to skip negative
4644 forward and all backward assertions, and also the \b assertion; for others it
4645 does not.
4646 
4647 Arguments:
4648   code         pointer to the start of the group
4649   skipassert   TRUE if certain assertions are to be skipped
4650 
4651 Returns:       pointer to the first significant opcode
4652 */
4653 
4654 static const PCRE2_UCHAR*
first_significant_code(PCRE2_SPTR code,BOOL skipassert)4655 first_significant_code(PCRE2_SPTR code, BOOL skipassert)
4656 {
4657 for (;;)
4658   {
4659   switch ((int)*code)
4660     {
4661     case OP_ASSERT_NOT:
4662     case OP_ASSERTBACK:
4663     case OP_ASSERTBACK_NOT:
4664     if (!skipassert) return code;
4665     do code += GET(code, 1); while (*code == OP_ALT);
4666     code += PRIV(OP_lengths)[*code];
4667     break;
4668 
4669     case OP_WORD_BOUNDARY:
4670     case OP_NOT_WORD_BOUNDARY:
4671     if (!skipassert) return code;
4672     /* Fall through */
4673 
4674     case OP_CALLOUT:
4675     case OP_CREF:
4676     case OP_DNCREF:
4677     case OP_RREF:
4678     case OP_DNRREF:
4679     case OP_FALSE:
4680     case OP_TRUE:
4681     code += PRIV(OP_lengths)[*code];
4682     break;
4683 
4684     case OP_CALLOUT_STR:
4685     code += GET(code, 1 + 2*LINK_SIZE);
4686     break;
4687 
4688     case OP_SKIPZERO:
4689     code += 2 + GET(code, 2) + LINK_SIZE;
4690     break;
4691 
4692     case OP_COND:
4693     case OP_SCOND:
4694     if (code[1+LINK_SIZE] != OP_FALSE ||   /* Not DEFINE */
4695         code[GET(code, 1)] != OP_KET)      /* More than one branch */
4696       return code;
4697     code += GET(code, 1) + 1 + LINK_SIZE;
4698     break;
4699 
4700     case OP_MARK:
4701     case OP_COMMIT_ARG:
4702     case OP_PRUNE_ARG:
4703     case OP_SKIP_ARG:
4704     case OP_THEN_ARG:
4705     code += code[1] + PRIV(OP_lengths)[*code];
4706     break;
4707 
4708     default:
4709     return code;
4710     }
4711   }
4712 /* Control never reaches here */
4713 }
4714 
4715 
4716 
#ifdef SUPPORT_UNICODE
/*************************************************
*           Get othercase range                  *
*************************************************/

/* Given the current position within a class range (UCP mode) and the range's
last character, locate the next run of consecutive characters whose single
"other case" values are themselves consecutive, and report that run of other
case values. The position is advanced past whatever was consumed, so repeated
calls walk the whole input range. A character that belongs to a caseless set
(more than one other case) is reported individually with a distinct return
value.

Arguments:
  cptr        points to starting character value; updated
  d           end value
  ocptr       where to put start of othercase range
  odptr       where to put end of othercase range

Yield:        -1 when no more
               0 when a range is returned
              >0 the CASESET offset for char with multiple other cases
                in this case, ocptr contains the original
*/

static int
get_othercase_range(uint32_t *cptr, uint32_t d, uint32_t *ocptr,
  uint32_t *odptr)
{
uint32_t ch = *cptr;
uint32_t first_other = 0;
uint32_t expected;

/* Advance to the first character that has any other case at all. One that is
a member of a caseless set is handed back straight away, on its own. */

while (ch <= d)
  {
  unsigned int caseset = UCD_CASESET(ch);

  if (caseset != 0)
    {
    *ocptr = ch;        /* Character that has the set */
    *cptr = ch + 1;     /* Rest of input range */
    return (int)caseset;
    }

  first_other = UCD_OTHERCASE(ch);
  if (first_other != ch) break;
  ch++;
  }

if (ch > d) return -1;  /* Nothing left in the input range */

/* ch has exactly one other case. Keep going while each following character is
not in a caseless set and its other case is the next value in sequence; stop at
the end of the input range or at the first character that breaks the run. */

*ocptr = first_other;
expected = first_other + 1;

for (ch++; ch <= d; ch++)
  {
  if (UCD_CASESET(ch) != 0 || UCD_OTHERCASE(ch) != expected) break;
  expected++;
  }

*odptr = expected - 1;  /* Last value in the othercase run */
*cptr = ch;             /* Rest of input range */
return 0;
}
#endif  /* SUPPORT_UNICODE */
4780 
4781 
4782 
4783 /*************************************************
4784 * Add a character or range to a class (internal) *
4785 *************************************************/
4786 
4787 /* This function packages up the logic of adding a character or range of
4788 characters to a class. The character values in the arguments will be within the
4789 valid values for the current mode (8-bit, 16-bit, UTF, etc). This function is
4790 called only from within the "add to class" group of functions, some of which
4791 are recursive and mutually recursive. The external entry point is
4792 add_to_class().
4793 
4794 Arguments:
4795   classbits     the bit map for characters < 256
4796   uchardptr     points to the pointer for extra data
4797   options       the options word
4798   cb            compile data
4799   start         start of range character
4800   end           end of range character
4801 
4802 Returns:        the number of < 256 characters added
4803                 the pointer to extra data is updated
4804 */
4805 
4806 static unsigned int
add_to_class_internal(uint8_t * classbits,PCRE2_UCHAR ** uchardptr,uint32_t options,compile_block * cb,uint32_t start,uint32_t end)4807 add_to_class_internal(uint8_t *classbits, PCRE2_UCHAR **uchardptr,
4808   uint32_t options, compile_block *cb, uint32_t start, uint32_t end)
4809 {
4810 uint32_t c;
4811 uint32_t classbits_end = (end <= 0xff ? end : 0xff);
4812 unsigned int n8 = 0;
4813 
4814 /* If caseless matching is required, scan the range and process alternate
4815 cases. In Unicode, there are 8-bit characters that have alternate cases that
4816 are greater than 255 and vice-versa. Sometimes we can just extend the original
4817 range. */
4818 
4819 if ((options & PCRE2_CASELESS) != 0)
4820   {
4821 #ifdef SUPPORT_UNICODE
4822   if ((options & PCRE2_UTF) != 0)
4823     {
4824     int rc;
4825     uint32_t oc, od;
4826 
4827     options &= ~PCRE2_CASELESS;   /* Remove for recursive calls */
4828     c = start;
4829 
4830     while ((rc = get_othercase_range(&c, end, &oc, &od)) >= 0)
4831       {
4832       /* Handle a single character that has more than one other case. */
4833 
4834       if (rc > 0) n8 += add_list_to_class_internal(classbits, uchardptr, options, cb,
4835         PRIV(ucd_caseless_sets) + rc, oc);
4836 
4837       /* Do nothing if the other case range is within the original range. */
4838 
4839       else if (oc >= cb->class_range_start && od <= cb->class_range_end) continue;
4840 
4841       /* Extend the original range if there is overlap, noting that if oc < c, we
4842       can't have od > end because a subrange is always shorter than the basic
4843       range. Otherwise, use a recursive call to add the additional range. */
4844 
4845       else if (oc < start && od >= start - 1) start = oc; /* Extend downwards */
4846       else if (od > end && oc <= end + 1)
4847         {
4848         end = od;       /* Extend upwards */
4849         if (end > classbits_end) classbits_end = (end <= 0xff ? end : 0xff);
4850         }
4851       else n8 += add_to_class_internal(classbits, uchardptr, options, cb, oc, od);
4852       }
4853     }
4854   else
4855 #endif  /* SUPPORT_UNICODE */
4856 
4857   /* Not UTF mode */
4858 
4859   for (c = start; c <= classbits_end; c++)
4860     {
4861     SETBIT(classbits, cb->fcc[c]);
4862     n8++;
4863     }
4864   }
4865 
4866 /* Now handle the originally supplied range. Adjust the final value according
4867 to the bit length - this means that the same lists of (e.g.) horizontal spaces
4868 can be used in all cases. */
4869 
4870 if ((options & PCRE2_UTF) == 0 && end > MAX_NON_UTF_CHAR)
4871   end = MAX_NON_UTF_CHAR;
4872 
4873 if (start > cb->class_range_start && end < cb->class_range_end) return n8;
4874 
4875 /* Use the bitmap for characters < 256. Otherwise use extra data.*/
4876 
4877 for (c = start; c <= classbits_end; c++)
4878   {
4879   /* Regardless of start, c will always be <= 255. */
4880   SETBIT(classbits, c);
4881   n8++;
4882   }
4883 
4884 #ifdef SUPPORT_WIDE_CHARS
4885 if (start <= 0xff) start = 0xff + 1;
4886 
4887 if (end >= start)
4888   {
4889   PCRE2_UCHAR *uchardata = *uchardptr;
4890 
4891 #ifdef SUPPORT_UNICODE
4892   if ((options & PCRE2_UTF) != 0)
4893     {
4894     if (start < end)
4895       {
4896       *uchardata++ = XCL_RANGE;
4897       uchardata += PRIV(ord2utf)(start, uchardata);
4898       uchardata += PRIV(ord2utf)(end, uchardata);
4899       }
4900     else if (start == end)
4901       {
4902       *uchardata++ = XCL_SINGLE;
4903       uchardata += PRIV(ord2utf)(start, uchardata);
4904       }
4905     }
4906   else
4907 #endif  /* SUPPORT_UNICODE */
4908 
4909   /* Without UTF support, character values are constrained by the bit length,
4910   and can only be > 256 for 16-bit and 32-bit libraries. */
4911 
4912 #if PCRE2_CODE_UNIT_WIDTH == 8
4913     {}
4914 #else
4915   if (start < end)
4916     {
4917     *uchardata++ = XCL_RANGE;
4918     *uchardata++ = start;
4919     *uchardata++ = end;
4920     }
4921   else if (start == end)
4922     {
4923     *uchardata++ = XCL_SINGLE;
4924     *uchardata++ = start;
4925     }
4926 #endif  /* PCRE2_CODE_UNIT_WIDTH == 8 */
4927   *uchardptr = uchardata;   /* Updata extra data pointer */
4928   }
4929 #else  /* SUPPORT_WIDE_CHARS */
4930   (void)uchardptr;          /* Avoid compiler warning */
4931 #endif /* SUPPORT_WIDE_CHARS */
4932 
4933 return n8;    /* Number of 8-bit characters */
4934 }
4935 
4936 
4937 
#ifdef SUPPORT_UNICODE
/*************************************************
* Add a list of characters to a class (internal) *
*************************************************/

/* Add every member of a list of case-equivalent characters to a class, in UTF
mode. Runs of consecutive values in the list are passed on as a single range.
The only caller is add_to_class_internal(), and the two functions are mutually
recursive.

Arguments:
  classbits     the bit map for characters < 256
  uchardptr     points to the pointer for extra data
  options       the options word
  cb            contains pointers to tables etc.
  p             points to row of 32-bit values, terminated by NOTACHAR
  except        character to omit; this is used when adding lists of
                  case-equivalent characters to avoid including the one we
                  already know about

Returns:        the number of < 256 characters added
                the pointer to extra data is updated
*/

static unsigned int
add_list_to_class_internal(uint8_t *classbits, PCRE2_UCHAR **uchardptr,
  uint32_t options, compile_block *cb, const uint32_t *p, unsigned int except)
{
unsigned int total = 0;

for (; *p < NOTACHAR; p++)
  {
  const uint32_t *last = p;

  /* The excluded character is stepped over on its own. */

  if (*p == except) continue;

  /* Move "last" to the final entry of the run of consecutive values that
  begins at p, then add the run as one range. */

  while (last[1] == last[0] + 1) last++;
  total += add_to_class_internal(classbits, uchardptr, options, cb, *p, *last);

  p = last;   /* The loop increment then moves past the whole run */
  }

return total;
}
#endif
4979 
4980 
4981 
4982 /*************************************************
4983 *   External entry point for add range to class  *
4984 *************************************************/
4985 
4986 /* This function sets the overall range so that the internal functions can try
4987 to avoid duplication when handling case-independence.
4988 
4989 Arguments:
4990   classbits     the bit map for characters < 256
4991   uchardptr     points to the pointer for extra data
4992   options       the options word
4993   cb            compile data
4994   start         start of range character
4995   end           end of range character
4996 
4997 Returns:        the number of < 256 characters added
4998                 the pointer to extra data is updated
4999 */
5000 
5001 static unsigned int
add_to_class(uint8_t * classbits,PCRE2_UCHAR ** uchardptr,uint32_t options,compile_block * cb,uint32_t start,uint32_t end)5002 add_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, uint32_t options,
5003   compile_block *cb, uint32_t start, uint32_t end)
5004 {
5005 cb->class_range_start = start;
5006 cb->class_range_end = end;
5007 return add_to_class_internal(classbits, uchardptr, options, cb, start, end);
5008 }
5009 
5010 
5011 /*************************************************
5012 *   External entry point for add list to class   *
5013 *************************************************/
5014 
5015 /* This function is used for adding a list of horizontal or vertical whitespace
5016 characters to a class. The list must be in order so that ranges of characters
5017 can be detected and handled appropriately. This function sets the overall range
5018 so that the internal functions can try to avoid duplication when handling
5019 case-independence.
5020 
5021 Arguments:
5022   classbits     the bit map for characters < 256
5023   uchardptr     points to the pointer for extra data
5024   options       the options word
5025   cb            contains pointers to tables etc.
5026   p             points to row of 32-bit values, terminated by NOTACHAR
5027   except        character to omit; this is used when adding lists of
5028                   case-equivalent characters to avoid including the one we
5029                   already know about
5030 
5031 Returns:        the number of < 256 characters added
5032                 the pointer to extra data is updated
5033 */
5034 
5035 static unsigned int
add_list_to_class(uint8_t * classbits,PCRE2_UCHAR ** uchardptr,uint32_t options,compile_block * cb,const uint32_t * p,unsigned int except)5036 add_list_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, uint32_t options,
5037   compile_block *cb, const uint32_t *p, unsigned int except)
5038 {
5039 unsigned int n8 = 0;
5040 while (p[0] < NOTACHAR)
5041   {
5042   unsigned int n = 0;
5043   if (p[0] != except)
5044     {
5045     while(p[n+1] == p[0] + n + 1) n++;
5046     cb->class_range_start = p[0];
5047     cb->class_range_end = p[n];
5048     n8 += add_to_class_internal(classbits, uchardptr, options, cb, p[0], p[n]);
5049     }
5050   p += n + 1;
5051   }
5052 return n8;
5053 }
5054 
5055 
5056 
5057 /*************************************************
5058 *    Add characters not in a list to a class     *
5059 *************************************************/
5060 
5061 /* This function is used for adding the complement of a list of horizontal or
5062 vertical whitespace to a class. The list must be in order.
5063 
5064 Arguments:
5065   classbits     the bit map for characters < 256
5066   uchardptr     points to the pointer for extra data
5067   options       the options word
5068   cb            contains pointers to tables etc.
5069   p             points to row of 32-bit values, terminated by NOTACHAR
5070 
5071 Returns:        the number of < 256 characters added
5072                 the pointer to extra data is updated
5073 */
5074 
5075 static unsigned int
add_not_list_to_class(uint8_t * classbits,PCRE2_UCHAR ** uchardptr,uint32_t options,compile_block * cb,const uint32_t * p)5076 add_not_list_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr,
5077   uint32_t options, compile_block *cb, const uint32_t *p)
5078 {
5079 BOOL utf = (options & PCRE2_UTF) != 0;
5080 unsigned int n8 = 0;
5081 if (p[0] > 0)
5082   n8 += add_to_class(classbits, uchardptr, options, cb, 0, p[0] - 1);
5083 while (p[0] < NOTACHAR)
5084   {
5085   while (p[1] == p[0] + 1) p++;
5086   n8 += add_to_class(classbits, uchardptr, options, cb, p[0] + 1,
5087     (p[1] == NOTACHAR) ? (utf ? 0x10ffffu : 0xffffffffu) : p[1] - 1);
5088   p++;
5089   }
5090 return n8;
5091 }
5092 
5093 
5094 
5095 /*************************************************
5096 *    Find details of duplicate group names       *
5097 *************************************************/
5098 
5099 /* This is called from compile_branch() when it needs to know the index and
5100 count of duplicates in the names table when processing named backreferences,
5101 either directly, or as conditions.
5102 
5103 Arguments:
5104   name          points to the name
5105   length        the length of the name
5106   indexptr      where to put the index
5107   countptr      where to put the count of duplicates
5108   errorcodeptr  where to put an error code
5109   cb            the compile block
5110 
5111 Returns:        TRUE if OK, FALSE if not, error code set
5112 */
5113 
5114 static BOOL
find_dupname_details(PCRE2_SPTR name,uint32_t length,int * indexptr,int * countptr,int * errorcodeptr,compile_block * cb)5115 find_dupname_details(PCRE2_SPTR name, uint32_t length, int *indexptr,
5116   int *countptr, int *errorcodeptr, compile_block *cb)
5117 {
5118 uint32_t i, groupnumber;
5119 int count;
5120 PCRE2_UCHAR *slot = cb->name_table;
5121 
5122 /* Find the first entry in the table */
5123 
5124 for (i = 0; i < cb->names_found; i++)
5125   {
5126   if (PRIV(strncmp)(name, slot+IMM2_SIZE, length) == 0 &&
5127       slot[IMM2_SIZE+length] == 0) break;
5128   slot += cb->name_entry_size;
5129   }
5130 
5131 /* This should not occur, because this function is called only when we know we
5132 have duplicate names. Give an internal error. */
5133 
5134 if (i >= cb->names_found)
5135   {
5136   *errorcodeptr = ERR53;
5137   cb->erroroffset = name - cb->start_pattern;
5138   return FALSE;
5139   }
5140 
5141 /* Record the index and then see how many duplicates there are, updating the
5142 backref map and maximum back reference as we do. */
5143 
5144 *indexptr = i;
5145 count = 0;
5146 
5147 for (;;)
5148   {
5149   count++;
5150   groupnumber = GET2(slot,0);
5151   cb->backref_map |= (groupnumber < 32)? (1u << groupnumber) : 1;
5152   if (groupnumber > cb->top_backref) cb->top_backref = groupnumber;
5153   if (++i >= cb->names_found) break;
5154   slot += cb->name_entry_size;
5155   if (PRIV(strncmp)(name, slot+IMM2_SIZE, length) != 0 ||
5156     (slot+IMM2_SIZE)[length] != 0) break;
5157   }
5158 
5159 *countptr = count;
5160 return TRUE;
5161 }
5162 
5163 
5164 
5165 /*************************************************
5166 *           Compile one branch                   *
5167 *************************************************/
5168 
5169 /* Scan the parsed pattern, compiling it into a vector of PCRE2_UCHAR. If
5170 the options are changed during the branch, the pointer is used to change the
5171 external options bits. This function is used during the pre-compile phase when
5172 we are trying to find out the amount of memory needed, as well as during the
5173 real compile phase. The value of lengthptr distinguishes the two phases.
5174 
5175 Arguments:
5176   optionsptr        pointer to the option bits
5177   codeptr           points to the pointer to the current code point
5178   pptrptr           points to the current parsed pattern pointer
5179   errorcodeptr      points to error code variable
5180   firstcuptr        place to put the first required code unit
5181   firstcuflagsptr   place to put the first code unit flags, or a negative number
5182   reqcuptr          place to put the last required code unit
5183   reqcuflagsptr     place to put the last required code unit flags, or a negative number
5184   bcptr             points to current branch chain
5185   cb                contains pointers to tables etc.
5186   lengthptr         NULL during the real compile phase
5187                     points to length accumulator during pre-compile phase
5188 
5189 Returns:            0 There's been an error, *errorcodeptr is non-zero
5190                    +1 Success, this branch must match at least one character
5191                    -1 Success, this branch may match an empty string
5192 */
5193 
5194 static int
compile_branch(uint32_t * optionsptr,PCRE2_UCHAR ** codeptr,uint32_t ** pptrptr,int * errorcodeptr,uint32_t * firstcuptr,int32_t * firstcuflagsptr,uint32_t * reqcuptr,int32_t * reqcuflagsptr,branch_chain * bcptr,compile_block * cb,PCRE2_SIZE * lengthptr)5195 compile_branch(uint32_t *optionsptr, PCRE2_UCHAR **codeptr, uint32_t **pptrptr,
5196   int *errorcodeptr, uint32_t *firstcuptr, int32_t *firstcuflagsptr,
5197   uint32_t *reqcuptr, int32_t *reqcuflagsptr, branch_chain *bcptr,
5198   compile_block *cb, PCRE2_SIZE *lengthptr)
5199 {
5200 int bravalue = 0;
5201 int okreturn = -1;
5202 int group_return = 0;
5203 uint32_t repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
5204 uint32_t greedy_default, greedy_non_default;
5205 uint32_t repeat_type, op_type;
5206 uint32_t options = *optionsptr;               /* May change dynamically */
5207 uint32_t firstcu, reqcu;
5208 uint32_t zeroreqcu, zerofirstcu;
5209 uint32_t escape;
5210 uint32_t *pptr = *pptrptr;
5211 uint32_t meta, meta_arg;
5212 int32_t firstcuflags, reqcuflags;
5213 int32_t zeroreqcuflags, zerofirstcuflags;
5214 int32_t req_caseopt, reqvary, tempreqvary;
5215 PCRE2_SIZE offset = 0;
5216 PCRE2_SIZE length_prevgroup = 0;
5217 PCRE2_UCHAR *code = *codeptr;
5218 PCRE2_UCHAR *last_code = code;
5219 PCRE2_UCHAR *orig_code = code;
5220 PCRE2_UCHAR *tempcode;
5221 PCRE2_UCHAR *previous = NULL;
5222 PCRE2_UCHAR op_previous;
5223 BOOL groupsetfirstcu = FALSE;
5224 BOOL matched_char = FALSE;
5225 BOOL previous_matched_char = FALSE;
5226 const uint8_t *cbits = cb->cbits;
5227 uint8_t classbits[32];
5228 
5229 /* We can fish out the UTF setting once and for all into a BOOL, but we must
5230 not do this for other options (e.g. PCRE2_EXTENDED) because they may change
5231 dynamically as we process the pattern. */
5232 
5233 #ifdef SUPPORT_UNICODE
5234 BOOL utf = (options & PCRE2_UTF) != 0;
5235 #else  /* No UTF support */
5236 BOOL utf = FALSE;
5237 #endif
5238 
5239 /* Helper variables for OP_XCLASS opcode (for characters > 255). We define
5240 class_uchardata always so that it can be passed to add_to_class() always,
5241 though it will not be used in non-UTF 8-bit cases. This avoids having to supply
5242 alternative calls for the different cases. */
5243 
5244 PCRE2_UCHAR *class_uchardata;
5245 #ifdef SUPPORT_WIDE_CHARS
5246 BOOL xclass;
5247 PCRE2_UCHAR *class_uchardata_base;
5248 #endif
5249 
5250 /* Set up the default and non-default settings for greediness */
5251 
5252 greedy_default = ((options & PCRE2_UNGREEDY) != 0);
5253 greedy_non_default = greedy_default ^ 1;
5254 
5255 /* Initialize no first unit, no required unit. REQ_UNSET means "no char
5256 matching encountered yet". It gets changed to REQ_NONE if we hit something that
5257 matches a non-fixed first unit; reqcu just remains unset if we never find one.
5258 
5259 When we hit a repeat whose minimum is zero, we may have to adjust these values
5260 to take the zero repeat into account. This is implemented by setting them to
5261 zerofirstcu and zeroreqcu when such a repeat is encountered. The individual
5262 item types that can be repeated set these backoff variables appropriately. */
5263 
5264 firstcu = reqcu = zerofirstcu = zeroreqcu = 0;
5265 firstcuflags = reqcuflags = zerofirstcuflags = zeroreqcuflags = REQ_UNSET;
5266 
5267 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
5268 according to the current setting of the caseless flag. The REQ_CASELESS value
5269 leaves the lower 28 bits empty. It is added into the firstcu or reqcu variables
5270 to record the case status of the value. This is used only for ASCII characters.
5271 */
5272 
5273 req_caseopt = ((options & PCRE2_CASELESS) != 0)? REQ_CASELESS:0;
5274 
5275 /* Switch on next META item until the end of the branch */
5276 
5277 for (;; pptr++)
5278   {
5279 #ifdef SUPPORT_WIDE_CHARS
5280   BOOL xclass_has_prop;
5281 #endif
5282   BOOL negate_class;
5283   BOOL should_flip_negation;
5284   BOOL match_all_or_no_wide_chars;
5285   BOOL possessive_quantifier;
5286   BOOL note_group_empty;
5287   int class_has_8bitchar;
5288   int i;
5289   uint32_t mclength;
5290   uint32_t skipunits;
5291   uint32_t subreqcu, subfirstcu;
5292   uint32_t groupnumber;
5293   uint32_t verbarglen, verbculen;
5294   int32_t subreqcuflags, subfirstcuflags;  /* Must be signed */
5295   open_capitem *oc;
5296   PCRE2_UCHAR mcbuffer[8];
5297 
5298   /* Get next META item in the pattern and its potential argument. */
5299 
5300   meta = META_CODE(*pptr);
5301   meta_arg = META_DATA(*pptr);
5302 
5303   /* If we are in the pre-compile phase, accumulate the length used for the
5304   previous cycle of this loop, unless the next item is a quantifier. */
5305 
5306   if (lengthptr != NULL)
5307     {
5308     if (code > cb->start_workspace + cb->workspace_size -
5309         WORK_SIZE_SAFETY_MARGIN)                       /* Check for overrun */
5310       {
5311       *errorcodeptr = (code >= cb->start_workspace + cb->workspace_size)?
5312         ERR52 : ERR86;
5313       return 0;
5314       }
5315 
5316     /* There is at least one situation where code goes backwards: this is the
5317     case of a zero quantifier after a class (e.g. [ab]{0}). When the quantifier
5318     is processed, the whole class is eliminated. However, it is created first,
5319     so we have to allow memory for it. Therefore, don't ever reduce the length
5320     at this point. */
5321 
5322     if (code < last_code) code = last_code;
5323 
5324     /* If the next thing is not a quantifier, we add the length of the previous
5325     item into the total, and reset the code pointer to the start of the
5326     workspace. Otherwise leave the previous item available to be quantified. */
5327 
5328     if (meta < META_ASTERISK || meta > META_MINMAX_QUERY)
5329       {
5330       if (OFLOW_MAX - *lengthptr < (PCRE2_SIZE)(code - orig_code))
5331         {
5332         *errorcodeptr = ERR20;   /* Integer overflow */
5333         return 0;
5334         }
5335       *lengthptr += (PCRE2_SIZE)(code - orig_code);
5336       if (*lengthptr > MAX_PATTERN_SIZE)
5337         {
5338         *errorcodeptr = ERR20;   /* Pattern is too large */
5339         return 0;
5340         }
5341       code = orig_code;
5342       }
5343 
5344     /* Remember where this code item starts so we can catch the "backwards"
5345     case above next time round. */
5346 
5347     last_code = code;
5348     }
5349 
5350   /* Process the next parsed pattern item. If it is not a quantifier, remember
5351   where it starts so that it can be quantified when a quantifier follows.
5352   Checking for the legality of quantifiers happens in parse_regex(), except for
5353   a quantifier after an assertion that is a condition. */
5354 
5355   if (meta < META_ASTERISK || meta > META_MINMAX_QUERY)
5356     {
5357     previous = code;
5358     if (matched_char) okreturn = 1;
5359     }
5360 
5361   previous_matched_char = matched_char;
5362   matched_char = FALSE;
5363   note_group_empty = FALSE;
5364   skipunits = 0;         /* Default value for most subgroups */
5365 
5366   switch(meta)
5367     {
5368     /* ===================================================================*/
5369     /* The branch terminates at pattern end or | or ) */
5370 
5371     case META_END:
5372     case META_ALT:
5373     case META_KET:
5374     *firstcuptr = firstcu;
5375     *firstcuflagsptr = firstcuflags;
5376     *reqcuptr = reqcu;
5377     *reqcuflagsptr = reqcuflags;
5378     *codeptr = code;
5379     *pptrptr = pptr;
5380     return okreturn;
5381 
5382 
5383     /* ===================================================================*/
5384     /* Handle single-character metacharacters. In multiline mode, ^ disables
5385     the setting of any following char as a first character. */
5386 
5387     case META_CIRCUMFLEX:
5388     if ((options & PCRE2_MULTILINE) != 0)
5389       {
5390       if (firstcuflags == REQ_UNSET)
5391         zerofirstcuflags = firstcuflags = REQ_NONE;
5392       *code++ = OP_CIRCM;
5393       }
5394     else *code++ = OP_CIRC;
5395     break;
5396 
5397     case META_DOLLAR:
5398     *code++ = ((options & PCRE2_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
5399     break;
5400 
5401     /* There can never be a first char if '.' is first, whatever happens about
5402     repeats. The value of reqcu doesn't change either. */
5403 
5404     case META_DOT:
5405     matched_char = TRUE;
5406     if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
5407     zerofirstcu = firstcu;
5408     zerofirstcuflags = firstcuflags;
5409     zeroreqcu = reqcu;
5410     zeroreqcuflags = reqcuflags;
5411     *code++ = ((options & PCRE2_DOTALL) != 0)? OP_ALLANY: OP_ANY;
5412     break;
5413 
5414 
5415     /* ===================================================================*/
5416     /* Empty character classes are allowed if PCRE2_ALLOW_EMPTY_CLASS is set.
5417     Otherwise, an initial ']' is taken as a data character. When empty classes
5418     are allowed, [] must always fail, so generate OP_FAIL, whereas [^] must
5419     match any character, so generate OP_ALLANY. */
5420 
5421     case META_CLASS_EMPTY:
5422     case META_CLASS_EMPTY_NOT:
5423     matched_char = TRUE;
5424     *code++ = (meta == META_CLASS_EMPTY_NOT)? OP_ALLANY : OP_FAIL;
5425     if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
5426     zerofirstcu = firstcu;
5427     zerofirstcuflags = firstcuflags;
5428     break;
5429 
5430 
5431     /* ===================================================================*/
5432     /* Non-empty character class. If the included characters are all < 256, we
5433     build a 32-byte bitmap of the permitted characters, except in the special
5434     case where there is only one such character. For negated classes, we build
5435     the map as usual, then invert it at the end. However, we use a different
5436     opcode so that data characters > 255 can be handled correctly.
5437 
5438     If the class contains characters outside the 0-255 range, a different
5439     opcode is compiled. It may optionally have a bit map for characters < 256,
5440     but those above are explicitly listed afterwards. A flag code unit
5441     tells whether the bitmap is present, and whether this is a negated class or
5442     not. */
5443 
5444     case META_CLASS_NOT:
5445     case META_CLASS:
5446     matched_char = TRUE;
5447     negate_class = meta == META_CLASS_NOT;
5448 
5449     /* We can optimize the case of a single character in a class by generating
5450     OP_CHAR or OP_CHARI if it's positive, or OP_NOT or OP_NOTI if it's
5451     negative. In the negative case there can be no first char if this item is
5452     first, whatever repeat count may follow. In the case of reqcu, save the
5453     previous value for reinstating. */
5454 
5455     /* NOTE: at present this optimization is not effective if the only
5456     character in a class in 32-bit, non-UCP mode has its top bit set. */
5457 
5458     if (pptr[1] < META_END && pptr[2] == META_CLASS_END)
5459       {
5460 #ifdef SUPPORT_UNICODE
5461       uint32_t d;
5462 #endif
5463       uint32_t c = pptr[1];
5464 
5465       pptr += 2;                 /* Move on to class end */
5466       if (meta == META_CLASS)    /* A positive one-char class can be */
5467         {                        /* handled as a normal literal character. */
5468         meta = c;                /* Set up the character */
5469         goto NORMAL_CHAR_SET;
5470         }
5471 
5472       /* Handle a negative one-character class */
5473 
5474       zeroreqcu = reqcu;
5475       zeroreqcuflags = reqcuflags;
5476       if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
5477       zerofirstcu = firstcu;
5478       zerofirstcuflags = firstcuflags;
5479 
5480       /* For caseless UTF mode, check whether this character has more than
5481       one other case. If so, generate a special OP_NOTPROP item instead of
5482       OP_NOTI. */
5483 
5484 #ifdef SUPPORT_UNICODE
5485       if (utf && (options & PCRE2_CASELESS) != 0 &&
5486           (d = UCD_CASESET(c)) != 0)
5487         {
5488         *code++ = OP_NOTPROP;
5489         *code++ = PT_CLIST;
5490         *code++ = d;
5491         break;   /* We are finished with this class */
5492         }
5493 #endif
5494       /* Char has only one other case, or UCP not available */
5495 
5496       *code++ = ((options & PCRE2_CASELESS) != 0)? OP_NOTI: OP_NOT;
5497       code += PUTCHAR(c, code);
5498       break;   /* We are finished with this class */
5499       }        /* End of 1-char optimization */
5500 
5501     /* Handle character classes that contain more than just one literal
5502     character. */
5503 
5504     /* If a non-extended class contains a negative special such as \S, we need
5505     to flip the negation flag at the end, so that support for characters > 255
5506     works correctly (they are all included in the class). An extended class may
5507     need to insert specific matching or non-matching code for wide characters.
5508     */
5509 
5510     should_flip_negation = match_all_or_no_wide_chars = FALSE;
5511 
5512     /* Extended class (xclass) will be used when characters > 255
5513     might match. */
5514 
5515 #ifdef SUPPORT_WIDE_CHARS
5516     xclass = FALSE;
5517     class_uchardata = code + LINK_SIZE + 2;   /* For XCLASS items */
5518     class_uchardata_base = class_uchardata;   /* Save the start */
5519 #endif
5520 
5521     /* For optimization purposes, we track some properties of the class:
5522     class_has_8bitchar will be non-zero if the class contains at least one
5523     character with a code point less than 256; xclass_has_prop will be TRUE if
5524     Unicode property checks are present in the class. */
5525 
5526     class_has_8bitchar = 0;
5527 #ifdef SUPPORT_WIDE_CHARS
5528     xclass_has_prop = FALSE;
5529 #endif
5530 
5531     /* Initialize the 256-bit (32-byte) bit map to all zeros. We build the map
5532     in a temporary bit of memory, in case the class contains fewer than two
5533     8-bit characters because in that case the compiled code doesn't use the bit
5534     map. */
5535 
5536     memset(classbits, 0, 32 * sizeof(uint8_t));
5537 
5538     /* Process items until META_CLASS_END is reached. */
5539 
5540     while ((meta = *(++pptr)) != META_CLASS_END)
5541       {
5542       /* Handle POSIX classes such as [:alpha:] etc. */
5543 
5544       if (meta == META_POSIX || meta == META_POSIX_NEG)
5545         {
5546         BOOL local_negate = (meta == META_POSIX_NEG);
5547         int posix_class = *(++pptr);
5548         int taboffset, tabopt;
5549         uint8_t pbits[32];
5550 
5551         should_flip_negation = local_negate;  /* Note negative special */
5552 
5553         /* If matching is caseless, upper and lower are converted to alpha.
5554         This relies on the fact that the class table starts with alpha,
5555         lower, upper as the first 3 entries. */
5556 
5557         if ((options & PCRE2_CASELESS) != 0 && posix_class <= 2)
5558           posix_class = 0;
5559 
5560         /* When PCRE2_UCP is set, some of the POSIX classes are converted to
5561         different escape sequences that use Unicode properties \p or \P.
5562         Others that are not available via \p or \P have to generate
5563         XCL_PROP/XCL_NOTPROP directly, which is done here. */
5564 
5565 #ifdef SUPPORT_UNICODE
5566         if ((options & PCRE2_UCP) != 0) switch(posix_class)
5567           {
5568           case PC_GRAPH:
5569           case PC_PRINT:
5570           case PC_PUNCT:
5571           *class_uchardata++ = local_negate? XCL_NOTPROP : XCL_PROP;
5572           *class_uchardata++ = (PCRE2_UCHAR)
5573             ((posix_class == PC_GRAPH)? PT_PXGRAPH :
5574              (posix_class == PC_PRINT)? PT_PXPRINT : PT_PXPUNCT);
5575           *class_uchardata++ = 0;
5576           xclass_has_prop = TRUE;
5577           goto CONTINUE_CLASS;
5578 
5579           /* For the other POSIX classes (ascii, xdigit) we are going to
5580           fall through to the non-UCP case and build a bit map for
5581           characters with code points less than 256. However, if we are in
5582           a negated POSIX class, characters with code points greater than
5583           255 must either all match or all not match, depending on whether
5584           the whole class is not or is negated. For example, for
5585           [[:^ascii:]... they must all match, whereas for [^[:^xdigit:]...
5586           they must not.
5587 
5588           In the special case where there are no xclass items, this is
5589           automatically handled by the use of OP_CLASS or OP_NCLASS, but an
5590           explicit range is needed for OP_XCLASS. Setting a flag here
5591           causes the range to be generated later when it is known that
5592           OP_XCLASS is required. In the 8-bit library this is relevant only in
5593           utf mode, since no wide characters can exist otherwise. */
5594 
5595           default:
5596 #if PCRE2_CODE_UNIT_WIDTH == 8
5597           if (utf)
5598 #endif
5599           match_all_or_no_wide_chars |= local_negate;
5600           break;
5601           }
5602 #endif  /* SUPPORT_UNICODE */
5603 
5604         /* In the non-UCP case, or when UCP makes no difference, we build the
5605         bit map for the POSIX class in a chunk of local store because we may
5606         be adding and subtracting from it, and we don't want to subtract bits
5607         that may be in the main map already. At the end we or the result into
5608         the bit map that is being built. */
5609 
5610         posix_class *= 3;
5611 
5612         /* Copy in the first table (always present) */
5613 
5614         memcpy(pbits, cbits + posix_class_maps[posix_class],
5615           32 * sizeof(uint8_t));
5616 
5617         /* If there is a second table, add or remove it as required. */
5618 
5619         taboffset = posix_class_maps[posix_class + 1];
5620         tabopt = posix_class_maps[posix_class + 2];
5621 
5622         if (taboffset >= 0)
5623           {
5624           if (tabopt >= 0)
5625             for (i = 0; i < 32; i++) pbits[i] |= cbits[(int)i + taboffset];
5626           else
5627             for (i = 0; i < 32; i++) pbits[i] &= ~cbits[(int)i + taboffset];
5628           }
5629 
5630         /* Now see if we need to remove any special characters. An option
5631         value of 1 removes vertical space and 2 removes underscore. */
5632 
5633         if (tabopt < 0) tabopt = -tabopt;
5634         if (tabopt == 1) pbits[1] &= ~0x3c;
5635           else if (tabopt == 2) pbits[11] &= 0x7f;
5636 
5637         /* Add the POSIX table or its complement into the main table that is
5638         being built and we are done. */
5639 
5640         if (local_negate)
5641           for (i = 0; i < 32; i++) classbits[i] |= ~pbits[i];
5642         else
5643           for (i = 0; i < 32; i++) classbits[i] |= pbits[i];
5644 
5645         /* Every class contains at least one < 256 character. */
5646 
5647         class_has_8bitchar = 1;
5648         goto CONTINUE_CLASS;    /* End of POSIX handling */
5649         }
5650 
5651       /* Other than POSIX classes, the only items we should encounter are
5652       \d-type escapes and literal characters (possibly as ranges). */
5653 
5654       if (meta == META_BIGVALUE)
5655         {
5656         meta = *(++pptr);
5657         goto CLASS_LITERAL;
5658         }
5659 
5660       /* Any other non-literal must be an escape */
5661 
5662       if (meta >= META_END)
5663         {
5664         if (META_CODE(meta) != META_ESCAPE)
5665           {
5666 #ifdef DEBUG_SHOW_PARSED
5667           fprintf(stderr, "** Unrecognized parsed pattern item 0x%.8x "
5668                           "in character class\n", meta);
5669 #endif
5670           *errorcodeptr = ERR89;  /* Internal error - unrecognized. */
5671           return 0;
5672           }
5673         escape = META_DATA(meta);
5674 
5675         /* Every class contains at least one < 256 character. */
5676 
5677         class_has_8bitchar++;
5678 
5679         switch(escape)
5680           {
5681           case ESC_d:
5682           for (i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_digit];
5683           break;
5684 
5685           case ESC_D:
5686           should_flip_negation = TRUE;
5687           for (i = 0; i < 32; i++) classbits[i] |= ~cbits[i+cbit_digit];
5688           break;
5689 
5690           case ESC_w:
5691           for (i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_word];
5692           break;
5693 
5694           case ESC_W:
5695           should_flip_negation = TRUE;
5696           for (i = 0; i < 32; i++) classbits[i] |= ~cbits[i+cbit_word];
5697           break;
5698 
5699           /* Perl 5.004 onwards omitted VT from \s, but restored it at Perl
5700           5.18. Before PCRE 8.34, we had to preserve the VT bit if it was
5701           previously set by something earlier in the character class.
5702           Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so
5703           we could just adjust the appropriate bit. From PCRE 8.34 we no
5704           longer treat \s and \S specially. */
5705 
5706           case ESC_s:
5707           for (i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_space];
5708           break;
5709 
5710           case ESC_S:
5711           should_flip_negation = TRUE;
5712           for (i = 0; i < 32; i++) classbits[i] |= ~cbits[i+cbit_space];
5713           break;
5714 
5715           /* When adding the horizontal or vertical space lists to a class, or
5716           their complements, disable PCRE2_CASELESS, because it just wastes
5717           time, and in the "not-x" UTF cases can create unwanted duplicates in
5718           the XCLASS list (provoked by characters that have more than one other
5719           case and by both cases being in the same "not-x" sublist). */
5720 
5721           case ESC_h:
5722           (void)add_list_to_class(classbits, &class_uchardata,
5723             options & ~PCRE2_CASELESS, cb, PRIV(hspace_list), NOTACHAR);
5724           break;
5725 
5726           case ESC_H:
5727           (void)add_not_list_to_class(classbits, &class_uchardata,
5728             options & ~PCRE2_CASELESS, cb, PRIV(hspace_list));
5729           break;
5730 
5731           case ESC_v:
5732           (void)add_list_to_class(classbits, &class_uchardata,
5733             options & ~PCRE2_CASELESS, cb, PRIV(vspace_list), NOTACHAR);
5734           break;
5735 
5736           case ESC_V:
5737           (void)add_not_list_to_class(classbits, &class_uchardata,
5738             options & ~PCRE2_CASELESS, cb, PRIV(vspace_list));
5739           break;
5740 
5741           /* If Unicode is not supported, \P and \p are not allowed and are
5742           faulted at parse time, so will never appear here. */
5743 
5744 #ifdef SUPPORT_UNICODE
5745           case ESC_p:
5746           case ESC_P:
5747             {
5748             uint32_t ptype = *(++pptr) >> 16;
5749             uint32_t pdata = *pptr & 0xffff;
5750             *class_uchardata++ = (escape == ESC_p)? XCL_PROP : XCL_NOTPROP;
5751             *class_uchardata++ = ptype;
5752             *class_uchardata++ = pdata;
5753             xclass_has_prop = TRUE;
5754             class_has_8bitchar--;                /* Undo! */
5755             }
5756           break;
5757 #endif
5758           }
5759 
5760         goto CONTINUE_CLASS;
5761         }  /* End handling \d-type escapes */
5762 
5763       /* A literal character may be followed by a range meta. At parse time
5764       there are checks for out-of-order characters, for ranges where the two
5765       characters are equal, and for hyphens that cannot indicate a range. At
5766       this point, therefore, no checking is needed. */
5767 
5768       else
5769         {
5770         uint32_t c, d;
5771 
5772         CLASS_LITERAL:
5773         c = d = meta;
5774 
5775         /* Remember if \r or \n were explicitly used */
5776 
5777         if (c == CHAR_CR || c == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF;
5778 
5779         /* Process a character range */
5780 
5781         if (pptr[1] == META_RANGE_LITERAL || pptr[1] == META_RANGE_ESCAPED)
5782           {
5783 #ifdef EBCDIC
5784           BOOL range_is_literal = (pptr[1] == META_RANGE_LITERAL);
5785 #endif
5786           pptr += 2;
5787           d = *pptr;
5788           if (d == META_BIGVALUE) d = *(++pptr);
5789 
5790           /* Remember an explicit \r or \n, and add the range to the class. */
5791 
5792           if (d == CHAR_CR || d == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF;
5793 
5794           /* In an EBCDIC environment, Perl treats alphabetic ranges specially
5795           because there are holes in the encoding, and simply using the range
5796           A-Z (for example) would include the characters in the holes. This
5797           applies only to literal ranges; [\xC1-\xE9] is different to [A-Z]. */
5798 
5799 #ifdef EBCDIC
5800           if (range_is_literal &&
5801                (cb->ctypes[c] & ctype_letter) != 0 &&
5802                (cb->ctypes[d] & ctype_letter) != 0 &&
5803                (c <= CHAR_z) == (d <= CHAR_z))
5804             {
5805             uint32_t uc = (d <= CHAR_z)? 0 : 64;
5806             uint32_t C = c - uc;
5807             uint32_t D = d - uc;
5808 
5809             if (C <= CHAR_i)
5810               {
5811               class_has_8bitchar +=
5812                 add_to_class(classbits, &class_uchardata, options, cb, C + uc,
5813                   ((D < CHAR_i)? D : CHAR_i) + uc);
5814               C = CHAR_j;
5815               }
5816 
5817             if (C <= D && C <= CHAR_r)
5818               {
5819               class_has_8bitchar +=
5820                 add_to_class(classbits, &class_uchardata, options, cb, C + uc,
5821                   ((D < CHAR_r)? D : CHAR_r) + uc);
5822               C = CHAR_s;
5823               }
5824 
5825             if (C <= D)
5826               {
5827               class_has_8bitchar +=
5828                 add_to_class(classbits, &class_uchardata, options, cb, C + uc,
5829                   D + uc);
5830               }
5831             }
5832           else
5833 #endif
5834           /* Not an EBCDIC special range */
5835 
5836           class_has_8bitchar +=
5837             add_to_class(classbits, &class_uchardata, options, cb, c, d);
5838           goto CONTINUE_CLASS;   /* Go get the next char in the class */
5839           }  /* End of range handling */
5840 
5841 
5842         /* Handle a single character. */
5843 
5844         class_has_8bitchar +=
5845           add_to_class(classbits, &class_uchardata, options, cb, meta, meta);
5846         }
5847 
5848       /* Continue to the next item in the class. */
5849 
5850       CONTINUE_CLASS:
5851 
5852 #ifdef SUPPORT_WIDE_CHARS
5853       /* If any wide characters or Unicode properties have been encountered,
5854       set xclass = TRUE. Then, in the pre-compile phase, accumulate the length
5855       of the extra data and reset the pointer. This is so that very large
5856       classes that contain a zillion wide characters or Unicode property tests
5857       do not overwrite the workspace (which is on the stack). */
5858 
5859       if (class_uchardata > class_uchardata_base)
5860         {
5861         xclass = TRUE;
5862         if (lengthptr != NULL)
5863           {
5864           *lengthptr += class_uchardata - class_uchardata_base;
5865           class_uchardata = class_uchardata_base;
5866           }
5867         }
5868 #endif
5869 
5870       continue;  /* Needed to avoid error when not supporting wide chars */
5871       }   /* End of main class-processing loop */
5872 
5873     /* If this class is the first thing in the branch, there can be no first
5874     char setting, whatever the repeat count. Any reqcu setting must remain
5875     unchanged after any kind of repeat. */
5876 
5877     if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
5878     zerofirstcu = firstcu;
5879     zerofirstcuflags = firstcuflags;
5880     zeroreqcu = reqcu;
5881     zeroreqcuflags = reqcuflags;
5882 
5883     /* If there are characters with values > 255, or Unicode property settings
5884     (\p or \P), we have to compile an extended class, with its own opcode,
5885     unless there were no property settings and there was a negated special such
5886     as \S in the class, and PCRE2_UCP is not set, because in that case all
5887     characters > 255 are in or not in the class, so any that were explicitly
5888     given as well can be ignored.
5889 
5890     In the UCP case, if certain negated POSIX classes ([:^ascii:] or
5891     [:^xdigit:]) were present in a class, we either have to match or not match
5892     all wide characters (depending on whether the whole class is or is not
5893     negated). This requirement is indicated by match_all_or_no_wide_chars being
5894     true. We do this by including an explicit range, which works in both cases.
5895     This applies only in UTF and 16-bit and 32-bit non-UTF modes, since there
5896     cannot be any wide characters in 8-bit non-UTF mode.
5897 
5898     When there *are* properties in a positive UTF-8 or any 16-bit or 32-bit
5899     class where \S etc is present without PCRE2_UCP, causing an extended class
5900     to be compiled, we make sure that all characters > 255 are included by
5901     forcing match_all_or_no_wide_chars to be true.
5902 
5903     If, when generating an xclass, there are no characters < 256, we can omit
5904     the bitmap in the actual compiled code. */
5905 
5906 #ifdef SUPPORT_WIDE_CHARS  /* Defined for 16/32 bits, or 8-bit with Unicode */
5907     if (xclass && (
5908 #ifdef SUPPORT_UNICODE
5909         (options & PCRE2_UCP) != 0 ||
5910 #endif
5911         xclass_has_prop || !should_flip_negation))
5912       {
5913       if (match_all_or_no_wide_chars || (
5914 #if PCRE2_CODE_UNIT_WIDTH == 8
5915            utf &&
5916 #endif
5917            should_flip_negation && !negate_class && (options & PCRE2_UCP) == 0))
5918         {
5919         *class_uchardata++ = XCL_RANGE;
5920         if (utf)   /* Will always be utf in the 8-bit library */
5921           {
5922           class_uchardata += PRIV(ord2utf)(0x100, class_uchardata);
5923           class_uchardata += PRIV(ord2utf)(MAX_UTF_CODE_POINT, class_uchardata);
5924           }
5925         else       /* Can only happen for the 16-bit & 32-bit libraries */
5926           {
5927 #if PCRE2_CODE_UNIT_WIDTH == 16
5928           *class_uchardata++ = 0x100;
5929           *class_uchardata++ = 0xffffu;
5930 #elif PCRE2_CODE_UNIT_WIDTH == 32
5931           *class_uchardata++ = 0x100;
5932           *class_uchardata++ = 0xffffffffu;
5933 #endif
5934           }
5935         }
5936       *class_uchardata++ = XCL_END;    /* Marks the end of extra data */
5937       *code++ = OP_XCLASS;
5938       code += LINK_SIZE;
5939       *code = negate_class? XCL_NOT:0;
5940       if (xclass_has_prop) *code |= XCL_HASPROP;
5941 
5942       /* If the map is required, move up the extra data to make room for it;
5943       otherwise just move the code pointer to the end of the extra data. */
5944 
5945       if (class_has_8bitchar > 0)
5946         {
5947         *code++ |= XCL_MAP;
5948         (void)memmove(code + (32 / sizeof(PCRE2_UCHAR)), code,
5949           CU2BYTES(class_uchardata - code));
5950         if (negate_class && !xclass_has_prop)
5951           {
5952           /* Using 255 ^ instead of ~ avoids clang sanitize warning. */
5953           for (i = 0; i < 32; i++) classbits[i] = 255 ^ classbits[i];
5954           }
5955         memcpy(code, classbits, 32);
5956         code = class_uchardata + (32 / sizeof(PCRE2_UCHAR));
5957         }
5958       else code = class_uchardata;
5959 
5960       /* Now fill in the complete length of the item */
5961 
5962       PUT(previous, 1, (int)(code - previous));
5963       break;   /* End of class handling */
5964       }
5965 #endif  /* SUPPORT_WIDE_CHARS */
5966 
5967     /* If there are no characters > 255, or they are all to be included or
5968     excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the
5969     whole class was negated and whether there were negative specials such as \S
5970     (non-UCP) in the class. Then copy the 32-byte map into the code vector,
5971     negating it if necessary. */
5972 
5973     *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
5974     if (lengthptr == NULL)    /* Save time in the pre-compile phase */
5975       {
5976       if (negate_class)
5977         {
5978        /* Using 255 ^ instead of ~ avoids clang sanitize warning. */
5979        for (i = 0; i < 32; i++) classbits[i] = 255 ^ classbits[i];
5980        }
5981       memcpy(code, classbits, 32);
5982       }
5983     code += 32 / sizeof(PCRE2_UCHAR);
5984     break;  /* End of class processing */
5985 
5986 
5987     /* ===================================================================*/
5988     /* Deal with (*VERB)s. */
5989 
5990     /* Check for open captures before ACCEPT and close those that are within
5991     the same assertion level, also converting ACCEPT to ASSERT_ACCEPT in an
5992     assertion. In the first pass, just accumulate the length required;
5993     otherwise hitting (*ACCEPT) inside many nested parentheses can cause
5994     workspace overflow. Do not set firstcu after *ACCEPT. */
5995 
5996     case META_ACCEPT:
5997     cb->had_accept = TRUE;
5998     for (oc = cb->open_caps;
5999          oc != NULL && oc->assert_depth >= cb->assert_depth;
6000          oc = oc->next)
6001       {
6002       if (lengthptr != NULL)
6003         {
6004         *lengthptr += CU2BYTES(1) + IMM2_SIZE;
6005         }
6006       else
6007         {
6008         *code++ = OP_CLOSE;
6009         PUT2INC(code, 0, oc->number);
6010         }
6011       }
6012     *code++ = (cb->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
6013     if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6014     break;
6015 
6016     case META_PRUNE:
6017     case META_SKIP:
6018     cb->had_pruneorskip = TRUE;
6019     /* Fall through */
6020     case META_COMMIT:
6021     case META_FAIL:
6022     *code++ = verbops[(meta - META_MARK) >> 16];
6023     break;
6024 
6025     case META_THEN:
6026     cb->external_flags |= PCRE2_HASTHEN;
6027     *code++ = OP_THEN;
6028     break;
6029 
6030     /* Handle verbs with arguments. Arguments can be very long, especially in
6031     16- and 32-bit modes, and can overflow the workspace in the first pass.
6032     However, the argument length is constrained to be small enough to fit in
6033     one code unit. This check happens in parse_regex(). In the first pass,
6034     instead of putting the argument into memory, we just update the length
6035     counter and set up an empty argument. */
6036 
6037     case META_THEN_ARG:
6038     cb->external_flags |= PCRE2_HASTHEN;
6039     goto VERB_ARG;
6040 
6041     case META_PRUNE_ARG:
6042     case META_SKIP_ARG:
6043     cb->had_pruneorskip = TRUE;
6044     /* Fall through */
6045     case META_MARK:
6046     case META_COMMIT_ARG:
6047     VERB_ARG:
6048     *code++ = verbops[(meta - META_MARK) >> 16];
6049     /* The length is in characters. */
6050     verbarglen = *(++pptr);
6051     verbculen = 0;
6052     tempcode = code++;
6053     for (i = 0; i < (int)verbarglen; i++)
6054       {
6055       meta = *(++pptr);
6056 #ifdef SUPPORT_UNICODE
6057       if (utf) mclength = PRIV(ord2utf)(meta, mcbuffer); else
6058 #endif
6059         {
6060         mclength = 1;
6061         mcbuffer[0] = meta;
6062         }
6063       if (lengthptr != NULL) *lengthptr += mclength; else
6064         {
6065         memcpy(code, mcbuffer, CU2BYTES(mclength));
6066         code += mclength;
6067         verbculen += mclength;
6068         }
6069       }
6070 
6071     *tempcode = verbculen;   /* Fill in the code unit length */
6072     *code++ = 0;             /* Terminating zero */
6073     break;
6074 
6075 
6076     /* ===================================================================*/
6077     /* Handle options change. The new setting must be passed back for use in
6078     subsequent branches. Reset the greedy defaults and the case value for
6079     firstcu and reqcu. */
6080 
6081     case META_OPTIONS:
6082     *optionsptr = options = *(++pptr);
6083     greedy_default = ((options & PCRE2_UNGREEDY) != 0);
6084     greedy_non_default = greedy_default ^ 1;
6085     req_caseopt = ((options & PCRE2_CASELESS) != 0)? REQ_CASELESS : 0;
6086     break;
6087 
6088 
6089     /* ===================================================================*/
6090     /* Handle conditional subpatterns. The case of (?(Rdigits) is ambiguous
6091     because it could be a numerical check on recursion, or a name check on a
6092     group's being set. The pre-pass sets up META_COND_RNUMBER as a name so that
6093     we can handle it either way. We first try for a name; if not found, process
6094     the number. */
6095 
6096     case META_COND_RNUMBER:   /* (?(Rdigits) */
6097     case META_COND_NAME:      /* (?(name) or (?'name') or ?(<name>) */
6098     case META_COND_RNAME:     /* (?(R&name) - test for recursion */
6099     bravalue = OP_COND;
6100       {
6101       int count, index;
6102       PCRE2_SPTR name;
6103       named_group *ng = cb->named_groups;
6104       uint32_t length = *(++pptr);
6105 
6106       GETPLUSOFFSET(offset, pptr);
6107       name = cb->start_pattern + offset;
6108 
6109       /* In the first pass, the names generated in the pre-pass are available,
6110       but the main name table has not yet been created. Scan the list of names
6111       generated in the pre-pass in order to get a number and whether or not
6112       this name is duplicated. If it is not duplicated, we can handle it as a
6113       numerical group. */
6114 
6115       for (i = 0; i < cb->names_found; i++, ng++)
6116         {
6117         if (length == ng->length &&
6118             PRIV(strncmp)(name, ng->name, length) == 0)
6119           {
6120           if (!ng->isdup)
6121             {
6122             code[1+LINK_SIZE] = (meta == META_COND_RNAME)? OP_RREF : OP_CREF;
6123             PUT2(code, 2+LINK_SIZE, ng->number);
6124             if (ng->number > cb->top_backref) cb->top_backref = ng->number;
6125             skipunits = 1+IMM2_SIZE;
6126             goto GROUP_PROCESS_NOTE_EMPTY;
6127             }
6128           break;  /* Found a duplicated name */
6129           }
6130         }
6131 
6132       /* If the name was not found we have a bad reference, unless we are
6133       dealing with R<digits>, which is treated as a recursion test by number.
6134       */
6135 
6136       if (i >= cb->names_found)
6137         {
6138         groupnumber = 0;
6139         if (meta == META_COND_RNUMBER)
6140           {
6141           for (i = 1; i < (int)length; i++)
6142             {
6143             groupnumber = groupnumber * 10 + name[i] - CHAR_0;
6144             if (groupnumber > MAX_GROUP_NUMBER)
6145               {
6146               *errorcodeptr = ERR61;
6147               cb->erroroffset = offset + i;
6148               return 0;
6149               }
6150             }
6151           }
6152 
6153         if (meta != META_COND_RNUMBER || groupnumber > cb->bracount)
6154           {
6155           *errorcodeptr = ERR15;
6156           cb->erroroffset = offset;
6157           return 0;
6158           }
6159 
6160         /* (?(Rdigits) is treated as a recursion test by number. A value of
6161         zero (which is the result of both (?(R) and (?(R0)) means "any", and is
6162         translated into RREF_ANY (which is 0xffff). */
6163 
6164         if (groupnumber == 0) groupnumber = RREF_ANY;
6165         code[1+LINK_SIZE] = OP_RREF;
6166         PUT2(code, 2+LINK_SIZE, groupnumber);
6167         skipunits = 1+IMM2_SIZE;
6168         goto GROUP_PROCESS_NOTE_EMPTY;
6169         }
6170 
6171       /* A duplicated name was found. Note that if an R<digits> name is found
6172       (META_COND_RNUMBER), it is a reference test, not a recursion test. */
6173 
6174       code[1+LINK_SIZE] = (meta == META_COND_RNAME)? OP_RREF : OP_CREF;
6175 
6176       /* We have a duplicated name. In the compile pass we have to search the
6177       main table in order to get the index and count values. */
6178 
6179       count = 0;  /* Values for first pass (avoids compiler warning) */
6180       index = 0;
6181       if (lengthptr == NULL && !find_dupname_details(name, length, &index,
6182             &count, errorcodeptr, cb)) return 0;
6183 
6184       /* Add one to the opcode to change CREF/RREF into DNCREF/DNRREF and
6185       insert appropriate data values. */
6186 
6187       code[1+LINK_SIZE]++;
6188       skipunits = 1+2*IMM2_SIZE;
6189       PUT2(code, 2+LINK_SIZE, index);
6190       PUT2(code, 2+LINK_SIZE+IMM2_SIZE, count);
6191       }
6192     goto GROUP_PROCESS_NOTE_EMPTY;
6193 
6194     /* The DEFINE condition is always false. Its internal groups may never
6195     be called, so matched_char must remain false, hence the jump to
6196     GROUP_PROCESS rather than GROUP_PROCESS_NOTE_EMPTY. */
6197 
6198     case META_COND_DEFINE:
6199     bravalue = OP_COND;
6200     GETPLUSOFFSET(offset, pptr);
6201     code[1+LINK_SIZE] = OP_DEFINE;
6202     skipunits = 1;
6203     goto GROUP_PROCESS;
6204 
6205     /* Conditional test of a group's being set. */
6206 
6207     case META_COND_NUMBER:
6208     bravalue = OP_COND;
6209     GETPLUSOFFSET(offset, pptr);
6210     groupnumber = *(++pptr);
6211     if (groupnumber > cb->bracount)
6212       {
6213       *errorcodeptr = ERR15;
6214       cb->erroroffset = offset;
6215       return 0;
6216       }
6217     if (groupnumber > cb->top_backref) cb->top_backref = groupnumber;
6218     offset -= 2;   /* Point at initial ( for too many branches error */
6219     code[1+LINK_SIZE] = OP_CREF;
6220     skipunits = 1+IMM2_SIZE;
6221     PUT2(code, 2+LINK_SIZE, groupnumber);
6222     goto GROUP_PROCESS_NOTE_EMPTY;
6223 
6224     /* Test for the PCRE2 version. */
6225 
6226     case META_COND_VERSION:
6227     bravalue = OP_COND;
6228     if (pptr[1] > 0)
6229       code[1+LINK_SIZE] = ((PCRE2_MAJOR > pptr[2]) ||
6230         (PCRE2_MAJOR == pptr[2] && PCRE2_MINOR >= pptr[3]))?
6231           OP_TRUE : OP_FALSE;
6232     else
6233       code[1+LINK_SIZE] = (PCRE2_MAJOR == pptr[2] && PCRE2_MINOR == pptr[3])?
6234         OP_TRUE : OP_FALSE;
6235     skipunits = 1;
6236     pptr += 3;
6237     goto GROUP_PROCESS_NOTE_EMPTY;
6238 
6239     /* The condition is an assertion, possibly preceded by a callout. */
6240 
6241     case META_COND_ASSERT:
6242     bravalue = OP_COND;
6243     goto GROUP_PROCESS_NOTE_EMPTY;
6244 
6245 
6246     /* ===================================================================*/
6247     /* Handle all kinds of nested bracketed groups. The non-capturing,
6248     non-conditional cases are here; others come to GROUP_PROCESS via goto. */
6249 
6250     case META_LOOKAHEAD:
6251     bravalue = OP_ASSERT;
6252     cb->assert_depth += 1;
6253     goto GROUP_PROCESS;
6254 
6255     /* Optimize (?!) to (*FAIL) unless it is quantified - which is a weird
6256     thing to do, but Perl allows all assertions to be quantified, and when
6257     they contain capturing parentheses there may be a potential use for
6258     this feature. Not that that applies to a quantified (?!) but we allow
6259     it for uniformity. */
6260 
6261     case META_LOOKAHEADNOT:
6262     if (pptr[1] == META_KET &&
6263          (pptr[2] < META_ASTERISK || pptr[2] > META_MINMAX_QUERY))
6264       {
6265       *code++ = OP_FAIL;
6266       pptr++;
6267       }
6268     else
6269       {
6270       bravalue = OP_ASSERT_NOT;
6271       cb->assert_depth += 1;
6272       goto GROUP_PROCESS;
6273       }
6274     break;
6275 
6276     case META_LOOKBEHIND:
6277     bravalue = OP_ASSERTBACK;
6278     cb->assert_depth += 1;
6279     goto GROUP_PROCESS;
6280 
6281     case META_LOOKBEHINDNOT:
6282     bravalue = OP_ASSERTBACK_NOT;
6283     cb->assert_depth += 1;
6284     goto GROUP_PROCESS;
6285 
6286     case META_ATOMIC:
6287     bravalue = OP_ONCE;
6288     goto GROUP_PROCESS_NOTE_EMPTY;
6289 
6290     case META_SCRIPT_RUN:
6291     bravalue = OP_SCRIPT_RUN;
6292     goto GROUP_PROCESS_NOTE_EMPTY;
6293 
6294     case META_NOCAPTURE:
6295     bravalue = OP_BRA;
6296     /* Fall through */
6297 
6298     /* Process nested bracketed regex. The nesting depth is maintained for the
6299     benefit of the stackguard function. The test for too deep nesting is now
6300     done in parse_regex(). Assertion and DEFINE groups come to GROUP_PROCESS;
6301     others come to GROUP_PROCESS_NOTE_EMPTY, to indicate that we need to take
6302     note of whether or not they may match an empty string. */
6303 
6304     GROUP_PROCESS_NOTE_EMPTY:
6305     note_group_empty = TRUE;
6306 
6307     GROUP_PROCESS:
6308     cb->parens_depth += 1;
6309     *code = bravalue;
6310     pptr++;
6311     tempcode = code;
6312     tempreqvary = cb->req_varyopt;        /* Save value before group */
6313     length_prevgroup = 0;                 /* Initialize for pre-compile phase */
6314 
6315     if ((group_return =
6316          compile_regex(
6317          options,                         /* The option state */
6318          &tempcode,                       /* Where to put code (updated) */
6319          &pptr,                           /* Input pointer (updated) */
6320          errorcodeptr,                    /* Where to put an error message */
6321          skipunits,                       /* Skip over bracket number */
6322          &subfirstcu,                     /* For possible first char */
6323          &subfirstcuflags,
6324          &subreqcu,                       /* For possible last char */
6325          &subreqcuflags,
6326          bcptr,                           /* Current branch chain */
6327          cb,                              /* Compile data block */
6328          (lengthptr == NULL)? NULL :      /* Actual compile phase */
6329            &length_prevgroup              /* Pre-compile phase */
6330          )) == 0)
6331       return 0;  /* Error */
6332 
6333     cb->parens_depth -= 1;
6334 
6335     /* If that was a non-conditional significant group (not an assertion, not a
6336     DEFINE) that matches at least one character, then the current item matches
6337     a character. Conditionals are handled below. */
6338 
6339     if (note_group_empty && bravalue != OP_COND && group_return > 0)
6340       matched_char = TRUE;
6341 
6342     /* If we've just compiled an assertion, pop the assert depth. */
6343 
6344     if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NOT)
6345       cb->assert_depth -= 1;
6346 
6347     /* At the end of compiling, code is still pointing to the start of the
6348     group, while tempcode has been updated to point past the end of the group.
6349     The parsed pattern pointer (pptr) is on the closing META_KET.
6350 
6351     If this is a conditional bracket, check that there are no more than
6352     two branches in the group, or just one if it's a DEFINE group. We do this
6353     in the real compile phase, not in the pre-pass, where the whole group may
6354     not be available. */
6355 
6356     if (bravalue == OP_COND && lengthptr == NULL)
6357       {
6358       PCRE2_UCHAR *tc = code;
6359       int condcount = 0;
6360 
6361       do {
6362          condcount++;
6363          tc += GET(tc,1);
6364          }
6365       while (*tc != OP_KET);
6366 
6367       /* A DEFINE group is never obeyed inline (the "condition" is always
6368       false). It must have only one branch. Having checked this, change the
6369       opcode to OP_FALSE. */
6370 
6371       if (code[LINK_SIZE+1] == OP_DEFINE)
6372         {
6373         if (condcount > 1)
6374           {
6375           cb->erroroffset = offset;
6376           *errorcodeptr = ERR54;
6377           return 0;
6378           }
6379         code[LINK_SIZE+1] = OP_FALSE;
6380         bravalue = OP_DEFINE;   /* A flag to suppress char handling below */
6381         }
6382 
6383       /* A "normal" conditional group. If there is just one branch, we must not
6384       make use of its firstcu or reqcu, because this is equivalent to an
6385       empty second branch. Also, it may match an empty string. If there are two
6386       branches, this item must match a character if the group must. */
6387 
6388       else
6389         {
6390         if (condcount > 2)
6391           {
6392           cb->erroroffset = offset;
6393           *errorcodeptr = ERR27;
6394           return 0;
6395           }
6396         if (condcount == 1) subfirstcuflags = subreqcuflags = REQ_NONE;
6397           else if (group_return > 0) matched_char = TRUE;
6398         }
6399       }
6400 
6401     /* In the pre-compile phase, update the length by the length of the group,
6402     less the brackets at either end. Then reduce the compiled code to just a
6403     set of non-capturing brackets so that it doesn't use much memory if it is
6404     duplicated by a quantifier. */
6405 
6406     if (lengthptr != NULL)
6407       {
6408       if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
6409         {
6410         *errorcodeptr = ERR20;
6411         return 0;
6412         }
6413       *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
6414       code++;   /* This already contains bravalue */
6415       PUTINC(code, 0, 1 + LINK_SIZE);
6416       *code++ = OP_KET;
6417       PUTINC(code, 0, 1 + LINK_SIZE);
6418       break;    /* No need to waste time with special character handling */
6419       }
6420 
6421     /* Otherwise update the main code pointer to the end of the group. */
6422 
6423     code = tempcode;
6424 
6425     /* For a DEFINE group, required and first character settings are not
6426     relevant. */
6427 
6428     if (bravalue == OP_DEFINE) break;
6429 
6430     /* Handle updating of the required and first code units for other types of
6431     group. Update for normal brackets of all kinds, and conditions with two
6432     branches (see code above). If the bracket is followed by a quantifier with
6433     zero repeat, we have to back off. Hence the definition of zeroreqcu and
6434     zerofirstcu outside the main loop so that they can be accessed for the back
6435     off. */
6436 
6437     zeroreqcu = reqcu;
6438     zeroreqcuflags = reqcuflags;
6439     zerofirstcu = firstcu;
6440     zerofirstcuflags = firstcuflags;
6441     groupsetfirstcu = FALSE;
6442 
6443     if (bravalue >= OP_ONCE)  /* Not an assertion */
6444       {
6445       /* If we have not yet set a firstcu in this branch, take it from the
6446       subpattern, remembering that it was set here so that a repeat of more
6447       than one can replicate it as reqcu if necessary. If the subpattern has
6448       no firstcu, set "none" for the whole branch. In both cases, a zero
6449       repeat forces firstcu to "none". */
6450 
6451       if (firstcuflags == REQ_UNSET && subfirstcuflags != REQ_UNSET)
6452         {
6453         if (subfirstcuflags >= 0)
6454           {
6455           firstcu = subfirstcu;
6456           firstcuflags = subfirstcuflags;
6457           groupsetfirstcu = TRUE;
6458           }
6459         else firstcuflags = REQ_NONE;
6460         zerofirstcuflags = REQ_NONE;
6461         }
6462 
6463       /* If firstcu was previously set, convert the subpattern's firstcu
6464       into reqcu if there wasn't one, using the vary flag that was in
6465       existence beforehand. */
6466 
6467       else if (subfirstcuflags >= 0 && subreqcuflags < 0)
6468         {
6469         subreqcu = subfirstcu;
6470         subreqcuflags = subfirstcuflags | tempreqvary;
6471         }
6472 
6473       /* If the subpattern set a required code unit (or set a first code unit
6474       that isn't really the first code unit - see above), set it. */
6475 
6476       if (subreqcuflags >= 0)
6477         {
6478         reqcu = subreqcu;
6479         reqcuflags = subreqcuflags;
6480         }
6481       }
6482 
6483     /* For a forward assertion, we take the reqcu, if set, provided that the
6484     group has also set a firstcu. This can be helpful if the pattern that
6485     follows the assertion doesn't set a different char. For example, it's
6486     useful for /(?=abcde).+/. We can't set firstcu for an assertion, however,
6487     because it leads to an incorrect effect for patterns such as /(?=a)a.+/ when
6488     the "real" "a" would then become a reqcu instead of a firstcu. This is
6489     overcome by a scan at the end if there's no firstcu, looking for an
6490     asserted first char. A similar effect for patterns like /(?=.*X)X$/ means
6491     we must only take the reqcu when the group also set a firstcu. Otherwise,
6492     in that example, 'X' ends up set for both. */
6493 
6494     else if (bravalue == OP_ASSERT && subreqcuflags >= 0 &&
6495              subfirstcuflags >= 0)
6496       {
6497       reqcu = subreqcu;
6498       reqcuflags = subreqcuflags;
6499       }
6500 
6501     break;  /* End of nested group handling */
6502 
6503 
6504     /* ===================================================================*/
6505     /* Handle named backreferences and recursions. */
6506 
6507     case META_BACKREF_BYNAME:
6508     case META_RECURSE_BYNAME:
6509       {
6510       int count, index;
6511       PCRE2_SPTR name;
6512       BOOL is_dupname = FALSE;
6513       named_group *ng = cb->named_groups;
6514       uint32_t length = *(++pptr);
6515 
6516       GETPLUSOFFSET(offset, pptr);
6517       name = cb->start_pattern + offset;
6518 
6519       /* In the first pass, the names generated in the pre-pass are available,
6520       but the main name table has not yet been created. Scan the list of names
6521       generated in the pre-pass in order to get a number and whether or not
6522       this name is duplicated. */
6523 
6524       groupnumber = 0;
6525       for (i = 0; i < cb->names_found; i++, ng++)
6526         {
6527         if (length == ng->length &&
6528             PRIV(strncmp)(name, ng->name, length) == 0)
6529           {
6530           is_dupname = ng->isdup;
6531           groupnumber = ng->number;
6532 
6533           /* For a recursion, that's all that is needed. We can now go to
6534           the code that handles numerical recursion, applying it to the first
6535           group with the given name. */
6536 
6537           if (meta == META_RECURSE_BYNAME)
6538             {
6539             meta_arg = groupnumber;
6540             goto HANDLE_NUMERICAL_RECURSION;
6541             }
6542 
6543           /* For a back reference, update the back reference map and the
6544           maximum back reference. Then, for each group, we must check to
6545           see if it is recursive, that is, it is inside the group that it
6546           references. A flag is set so that the group can be made atomic.
6547           */
6548 
6549           cb->backref_map |= (groupnumber < 32)? (1u << groupnumber) : 1;
6550           if (groupnumber > cb->top_backref)
6551             cb->top_backref = groupnumber;
6552 
6553           for (oc = cb->open_caps; oc != NULL; oc = oc->next)
6554             {
6555             if (oc->number == groupnumber)
6556               {
6557               oc->flag = TRUE;
6558               break;
6559               }
6560             }
6561           }
6562         }
6563 
6564       /* If the name was not found we have a bad reference. */
6565 
6566       if (groupnumber == 0)
6567         {
6568         *errorcodeptr = ERR15;
6569         cb->erroroffset = offset;
6570         return 0;
6571         }
6572 
6573       /* If a back reference name is not duplicated, we can handle it as
6574       a numerical reference. */
6575 
6576       if (!is_dupname)
6577         {
6578         meta_arg = groupnumber;
6579         goto HANDLE_SINGLE_REFERENCE;
6580         }
6581 
6582       /* If a back reference name is duplicated, we generate a different
6583       opcode to a numerical back reference. In the second pass we must
6584       search for the index and count in the final name table. */
6585 
6586       count = 0;  /* Values for first pass (avoids compiler warning) */
6587       index = 0;
6588       if (lengthptr == NULL && !find_dupname_details(name, length, &index,
6589             &count, errorcodeptr, cb)) return 0;
6590 
6591       if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6592       *code++ = ((options & PCRE2_CASELESS) != 0)? OP_DNREFI : OP_DNREF;
6593       PUT2INC(code, 0, index);
6594       PUT2INC(code, 0, count);
6595       }
6596     break;
6597 
6598 
6599     /* ===================================================================*/
6600     /* Handle a numerical callout. */
6601 
6602     case META_CALLOUT_NUMBER:
6603     code[0] = OP_CALLOUT;
6604     PUT(code, 1, pptr[1]);               /* Offset to next pattern item */
6605     PUT(code, 1 + LINK_SIZE, pptr[2]);   /* Length of next pattern item */
6606     code[1 + 2*LINK_SIZE] = pptr[3];
6607     pptr += 3;
6608     code += PRIV(OP_lengths)[OP_CALLOUT];
6609     break;
6610 
6611 
6612     /* ===================================================================*/
6613     /* Handle a callout with a string argument. In the pre-pass we just compute
6614     the length without generating anything. The length in pptr[3] includes both
6615     delimiters; in the actual compile only the first one is copied, but a
6616     terminating zero is added. Any doubled delimiters within the string make
6617     this an overestimate, but it is not worth bothering about. */
6618 
6619     case META_CALLOUT_STRING:
6620     if (lengthptr != NULL)
6621       {
6622       *lengthptr += pptr[3] + (1 + 4*LINK_SIZE);
6623       pptr += 3;
6624       SKIPOFFSET(pptr);
6625       }
6626 
6627     /* In the real compile we can copy the string. The starting delimiter is
6628      included so that the client can discover it if they want. We also pass the
6629      start offset to help a script language give better error messages. */
6630 
6631     else
6632       {
6633       PCRE2_SPTR pp;
6634       uint32_t delimiter;
6635       uint32_t length = pptr[3];
6636       PCRE2_UCHAR *callout_string = code + (1 + 4*LINK_SIZE);
6637 
6638       code[0] = OP_CALLOUT_STR;
6639       PUT(code, 1, pptr[1]);               /* Offset to next pattern item */
6640       PUT(code, 1 + LINK_SIZE, pptr[2]);   /* Length of next pattern item */
6641 
6642       pptr += 3;
6643       GETPLUSOFFSET(offset, pptr);         /* Offset to string in pattern */
6644       pp = cb->start_pattern + offset;
6645       delimiter = *callout_string++ = *pp++;
6646       if (delimiter == CHAR_LEFT_CURLY_BRACKET)
6647         delimiter = CHAR_RIGHT_CURLY_BRACKET;
6648       PUT(code, 1 + 3*LINK_SIZE, (int)(offset + 1));  /* One after delimiter */
6649 
6650       /* The syntax of the pattern was checked in the parsing scan. The length
6651       includes both delimiters, but we have passed the opening one just above,
6652       so we reduce length before testing it. The test is for > 1 because we do
6653       not want to copy the final delimiter. This also ensures that pp[1] is
6654       accessible. */
6655 
6656       while (--length > 1)
6657         {
6658         if (*pp == delimiter && pp[1] == delimiter)
6659           {
6660           *callout_string++ = delimiter;
6661           pp += 2;
6662           length--;
6663           }
6664         else *callout_string++ = *pp++;
6665         }
6666       *callout_string++ = CHAR_NUL;
6667 
6668       /* Set the length of the entire item, then advance to its end. */
6669 
6670       PUT(code, 1 + 2*LINK_SIZE, (int)(callout_string - code));
6671       code = callout_string;
6672       }
6673     break;
6674 
6675 
6676     /* ===================================================================*/
6677     /* Handle repetition. The different types are all sorted out in the parsing
6678     pass. */
6679 
6680     case META_MINMAX_PLUS:
6681     case META_MINMAX_QUERY:
6682     case META_MINMAX:
6683     repeat_min = *(++pptr);
6684     repeat_max = *(++pptr);
6685     goto REPEAT;
6686 
6687     case META_ASTERISK:
6688     case META_ASTERISK_PLUS:
6689     case META_ASTERISK_QUERY:
6690     repeat_min = 0;
6691     repeat_max = REPEAT_UNLIMITED;
6692     goto REPEAT;
6693 
6694     case META_PLUS:
6695     case META_PLUS_PLUS:
6696     case META_PLUS_QUERY:
6697     repeat_min = 1;
6698     repeat_max = REPEAT_UNLIMITED;
6699     goto REPEAT;
6700 
6701     case META_QUERY:
6702     case META_QUERY_PLUS:
6703     case META_QUERY_QUERY:
6704     repeat_min = 0;
6705     repeat_max = 1;
6706 
6707     REPEAT:
6708     if (previous_matched_char && repeat_min > 0) matched_char = TRUE;
6709 
6710     /* Remember whether this is a variable length repeat, and default to
6711     single-char opcodes. */
6712 
6713     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
6714     op_type = 0;
6715 
6716     /* If the repeat is {1} we can ignore it. */
6717 
6718     if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;
6719 
6720     /* Adjust first and required code units for a zero repeat. */
6721 
6722     if (repeat_min == 0)
6723       {
6724       firstcu = zerofirstcu;
6725       firstcuflags = zerofirstcuflags;
6726       reqcu = zeroreqcu;
6727       reqcuflags = zeroreqcuflags;
6728       }
6729 
6730     /* Note the greediness and possessiveness. */
6731 
6732     switch (meta)
6733       {
6734       case META_MINMAX_PLUS:
6735       case META_ASTERISK_PLUS:
6736       case META_PLUS_PLUS:
6737       case META_QUERY_PLUS:
6738       repeat_type = 0;                  /* Force greedy */
6739       possessive_quantifier = TRUE;
6740       break;
6741 
6742       case META_MINMAX_QUERY:
6743       case META_ASTERISK_QUERY:
6744       case META_PLUS_QUERY:
6745       case META_QUERY_QUERY:
6746       repeat_type = greedy_non_default;
6747       possessive_quantifier = FALSE;
6748       break;
6749 
6750       default:
6751       repeat_type = greedy_default;
6752       possessive_quantifier = FALSE;
6753       break;
6754       }
6755 
6756     /* Save start of previous item, in case we have to move it up in order to
6757     insert something before it, and remember what it was. */
6758 
6759     tempcode = previous;
6760     op_previous = *previous;
6761 
6762     /* Now handle repetition for the different types of item. */
6763 
6764     switch (op_previous)
6765       {
6766       /* If previous was a character or negated character match, abolish the
6767       item and generate a repeat item instead. If a char item has a minimum of
6768       more than one, ensure that it is set in reqcu - it might not be if a
6769       sequence such as x{3} is the first thing in a branch because the x will
6770       have gone into firstcu instead.  */
6771 
6772       case OP_CHAR:
6773       case OP_CHARI:
6774       case OP_NOT:
6775       case OP_NOTI:
6776       op_type = chartypeoffset[op_previous - OP_CHAR];
6777 
6778       /* Deal with UTF characters that take up more than one code unit. */
6779 
6780 #ifdef MAYBE_UTF_MULTI
6781       if (utf && NOT_FIRSTCU(code[-1]))
6782         {
6783         PCRE2_UCHAR *lastchar = code - 1;
6784         BACKCHAR(lastchar);
6785         mclength = (uint32_t)(code - lastchar);   /* Length of UTF character */
6786         memcpy(mcbuffer, lastchar, CU2BYTES(mclength));  /* Save the char */
6787         }
6788       else
6789 #endif  /* MAYBE_UTF_MULTI */
6790 
6791       /* Handle the case of a single code unit - either with no UTF support, or
6792       with UTF disabled, or for a single-code-unit UTF character. */
6793         {
6794         mcbuffer[0] = code[-1];
6795         mclength = 1;
6796         if (op_previous <= OP_CHARI && repeat_min > 1)
6797           {
6798           reqcu = mcbuffer[0];
6799           reqcuflags = req_caseopt | cb->req_varyopt;
6800           }
6801         }
6802       goto OUTPUT_SINGLE_REPEAT;  /* Code shared with single character types */
6803 
6804       /* If previous was a character class or a back reference, we put the
6805       repeat stuff after it, but just skip the item if the repeat was {0,0}. */
6806 
6807 #ifdef SUPPORT_WIDE_CHARS
6808       case OP_XCLASS:
6809 #endif
6810       case OP_CLASS:
6811       case OP_NCLASS:
6812       case OP_REF:
6813       case OP_REFI:
6814       case OP_DNREF:
6815       case OP_DNREFI:
6816 
6817       if (repeat_max == 0)
6818         {
6819         code = previous;
6820         goto END_REPEAT;
6821         }
6822 
6823       if (repeat_min == 0 && repeat_max == REPEAT_UNLIMITED)
6824         *code++ = OP_CRSTAR + repeat_type;
6825       else if (repeat_min == 1 && repeat_max == REPEAT_UNLIMITED)
6826         *code++ = OP_CRPLUS + repeat_type;
6827       else if (repeat_min == 0 && repeat_max == 1)
6828         *code++ = OP_CRQUERY + repeat_type;
6829       else
6830         {
6831         *code++ = OP_CRRANGE + repeat_type;
6832         PUT2INC(code, 0, repeat_min);
6833         if (repeat_max == REPEAT_UNLIMITED) repeat_max = 0;  /* 2-byte encoding for max */
6834         PUT2INC(code, 0, repeat_max);
6835         }
6836       break;
6837 
6838       /* If previous is OP_FAIL, it was generated by an empty class []
6839       (PCRE2_ALLOW_EMPTY_CLASS is set). The other ways in which OP_FAIL can be
6840       generated, that is by (*FAIL) or (?!), disallow a quantifier at parse
6841       time. We can just ignore this repeat. */
6842 
6843       case OP_FAIL:
6844       goto END_REPEAT;
6845 
6846       /* Prior to 10.30, repeated recursions were wrapped in OP_ONCE brackets
6847       because pcre2_match() could not handle backtracking into recursively
6848       called groups. Now that this backtracking is available, we no longer need
6849       to do this. However, we still need to replicate recursions as we do for
6850       groups so as to have independent backtracking points. We can replicate
6851       for the minimum number of repeats directly. For optional repeats we now
6852       wrap the recursion in OP_BRA brackets and make use of the bracket
6853       repetition. */
6854 
6855       case OP_RECURSE:
6856 
6857       /* Generate unwrapped repeats for a non-zero minimum, except when the
6858       minimum is 1 and the maximum unlimited, because that can be handled with
6859       OP_BRA terminated by OP_KETRMAX/MIN. When the maximum is equal to the
6860       minimum, we just need to generate the appropriate additional copies.
6861       Otherwise we need to generate one more, to simulate the situation when
6862       the minimum is zero. */
6863 
6864       if (repeat_min > 0 && (repeat_min != 1 || repeat_max != REPEAT_UNLIMITED))
6865         {
6866         int replicate = repeat_min;
6867         if (repeat_min == repeat_max) replicate--;
6868 
6869         /* In the pre-compile phase, we don't actually do the replication. We
6870         just adjust the length as if we had. Do some paranoid checks for
6871         potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit
6872         integer type when available, otherwise double. */
6873 
6874         if (lengthptr != NULL)
6875           {
6876           PCRE2_SIZE delta = replicate*(1 + LINK_SIZE);
6877           if ((INT64_OR_DOUBLE)replicate*
6878                 (INT64_OR_DOUBLE)(1 + LINK_SIZE) >
6879                   (INT64_OR_DOUBLE)INT_MAX ||
6880               OFLOW_MAX - *lengthptr < delta)
6881             {
6882             *errorcodeptr = ERR20;
6883             return 0;
6884             }
6885           *lengthptr += delta;
6886           }
6887 
6888         else for (i = 0; i < replicate; i++)
6889           {
6890           memcpy(code, previous, CU2BYTES(1 + LINK_SIZE));
6891           previous = code;
6892           code += 1 + LINK_SIZE;
6893           }
6894 
6895         /* If the number of repeats is fixed, we are done. Otherwise, adjust
6896         the counts and fall through. */
6897 
6898         if (repeat_min == repeat_max) break;
6899         if (repeat_max != REPEAT_UNLIMITED) repeat_max -= repeat_min;
6900         repeat_min = 0;
6901         }
6902 
6903       /* Wrap the recursion call in OP_BRA brackets. */
6904 
6905       (void)memmove(previous + 1 + LINK_SIZE, previous, CU2BYTES(1 + LINK_SIZE));
6906       op_previous = *previous = OP_BRA;
6907       PUT(previous, 1, 2 + 2*LINK_SIZE);
6908       previous[2 + 2*LINK_SIZE] = OP_KET;
6909       PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE);
6910       code += 2 + 2 * LINK_SIZE;
6911       length_prevgroup = 3 + 3*LINK_SIZE;
6912       group_return = -1;  /* Set "may match empty string" */
6913 
6914       /* Now treat as a repeated OP_BRA. */
6915       /* Fall through */
6916 
6917       /* If previous was a bracket group, we may have to replicate it in
6918       certain cases. Note that at this point we can encounter only the "basic"
6919       bracket opcodes such as BRA and CBRA, as this is the place where they get
6920       converted into the more special varieties such as BRAPOS and SBRA.
6921       Originally, PCRE did not allow repetition of assertions, but now it does,
6922       for Perl compatibility. */
6923 
6924       case OP_ASSERT:
6925       case OP_ASSERT_NOT:
6926       case OP_ASSERTBACK:
6927       case OP_ASSERTBACK_NOT:
6928       case OP_ONCE:
6929       case OP_SCRIPT_RUN:
6930       case OP_BRA:
6931       case OP_CBRA:
6932       case OP_COND:
6933         {
6934         int len = (int)(code - previous);
6935         PCRE2_UCHAR *bralink = NULL;
6936         PCRE2_UCHAR *brazeroptr = NULL;
6937 
6938         /* Repeating a DEFINE group (or any group where the condition is always
6939         FALSE and there is only one branch) is pointless, but Perl allows the
6940         syntax, so we just ignore the repeat. */
6941 
6942         if (op_previous == OP_COND && previous[LINK_SIZE+1] == OP_FALSE &&
6943             previous[GET(previous, 1)] != OP_ALT)
6944           goto END_REPEAT;
6945 
6946         /* There is no sense in actually repeating assertions. The only
6947         potential use of repetition is in cases when the assertion is optional.
6948         Therefore, if the minimum is greater than zero, just ignore the repeat.
6949         If the maximum is not zero or one, set it to 1. */
6950 
6951         if (op_previous < OP_ONCE)    /* Assertion */
6952           {
6953           if (repeat_min > 0) goto END_REPEAT;
6954           if (repeat_max > 1) repeat_max = 1;
6955           }
6956 
6957         /* The case of a zero minimum is special because of the need to stick
6958         OP_BRAZERO in front of it, and because the group appears once in the
6959         data, whereas in other cases it appears the minimum number of times. For
6960         this reason, it is simplest to treat this case separately, as otherwise
6961         the code gets far too messy. There are several special subcases when the
6962         minimum is zero. */
6963 
6964         if (repeat_min == 0)
6965           {
6966           /* If the maximum is also zero, we used to just omit the group from
6967           the output altogether, like this:
6968 
6969           ** if (repeat_max == 0)
6970           **   {
6971           **   code = previous;
6972           **   goto END_REPEAT;
6973           **   }
6974 
6975           However, that fails when a group or a subgroup within it is
6976           referenced as a subroutine from elsewhere in the pattern, so now we
6977           stick in OP_SKIPZERO in front of it so that it is skipped on
6978           execution. As we don't have a list of which groups are referenced, we
6979           cannot do this selectively.
6980 
6981           If the maximum is 1 or unlimited, we just have to stick in the
6982           BRAZERO and do no more at this point. */
6983 
6984           if (repeat_max <= 1 || repeat_max == REPEAT_UNLIMITED)
6985             {
6986             (void)memmove(previous + 1, previous, CU2BYTES(len));
6987             code++;
6988             if (repeat_max == 0)
6989               {
6990               *previous++ = OP_SKIPZERO;
6991               goto END_REPEAT;
6992               }
6993             brazeroptr = previous;    /* Save for possessive optimizing */
6994             *previous++ = OP_BRAZERO + repeat_type;
6995             }
6996 
6997           /* If the maximum is greater than 1 and limited, we have to replicate
6998           in a nested fashion, sticking OP_BRAZERO before each set of brackets.
6999           The first one has to be handled carefully because it's the original
7000           copy, which has to be moved up. The remainder can be handled by code
7001           that is common with the non-zero minimum case below. We have to
7002           adjust the value of repeat_max, since one less copy is required. */
7003 
7004           else
7005             {
7006             int linkoffset;
7007             (void)memmove(previous + 2 + LINK_SIZE, previous, CU2BYTES(len));
7008             code += 2 + LINK_SIZE;
7009             *previous++ = OP_BRAZERO + repeat_type;
7010             *previous++ = OP_BRA;
7011 
7012             /* We chain together the bracket link offset fields that have to be
7013             filled in later when the ends of the brackets are reached. */
7014 
7015             linkoffset = (bralink == NULL)? 0 : (int)(previous - bralink);
7016             bralink = previous;
7017             PUTINC(previous, 0, linkoffset);
7018             }
7019 
7020           if (repeat_max != REPEAT_UNLIMITED) repeat_max--;
7021           }
7022 
7023         /* If the minimum is greater than zero, replicate the group as many
7024         times as necessary, and adjust the maximum to the number of subsequent
7025         copies that we need. */
7026 
7027         else
7028           {
7029           if (repeat_min > 1)
7030             {
7031             /* In the pre-compile phase, we don't actually do the replication.
7032             We just adjust the length as if we had. Do some paranoid checks for
7033             potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit
7034             integer type when available, otherwise double. */
7035 
7036             if (lengthptr != NULL)
7037               {
7038               PCRE2_SIZE delta = (repeat_min - 1)*length_prevgroup;
7039               if ((INT64_OR_DOUBLE)(repeat_min - 1)*
7040                     (INT64_OR_DOUBLE)length_prevgroup >
7041                       (INT64_OR_DOUBLE)INT_MAX ||
7042                   OFLOW_MAX - *lengthptr < delta)
7043                 {
7044                 *errorcodeptr = ERR20;
7045                 return 0;
7046                 }
7047               *lengthptr += delta;
7048               }
7049 
7050             /* This is compiling for real. If there is a set first code unit
7051             for the group, and we have not yet set a "required code unit", set
7052             it. */
7053 
7054             else
7055               {
7056               if (groupsetfirstcu && reqcuflags < 0)
7057                 {
7058                 reqcu = firstcu;
7059                 reqcuflags = firstcuflags;
7060                 }
7061               for (i = 1; (uint32_t)i < repeat_min; i++)
7062                 {
7063                 memcpy(code, previous, CU2BYTES(len));
7064                 code += len;
7065                 }
7066               }
7067             }
7068 
7069           if (repeat_max != REPEAT_UNLIMITED) repeat_max -= repeat_min;
7070           }
7071 
7072         /* This code is common to both the zero and non-zero minimum cases. If
7073         the maximum is limited, it replicates the group in a nested fashion,
7074         remembering the bracket starts on a stack. In the case of a zero
7075         minimum, the first one was set up above. In all cases the repeat_max
7076         now specifies the number of additional copies needed. Again, we must
7077         remember to replicate entries on the forward reference list. */
7078 
7079         if (repeat_max != REPEAT_UNLIMITED)
7080           {
7081           /* In the pre-compile phase, we don't actually do the replication. We
7082           just adjust the length as if we had. For each repetition we must add
7083           1 to the length for BRAZERO and for all but the last repetition we
7084           must add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
7085           paranoid checks to avoid integer overflow. The INT64_OR_DOUBLE type
7086           is a 64-bit integer type when available, otherwise double. */
7087 
7088           if (lengthptr != NULL && repeat_max > 0)
7089             {
7090             PCRE2_SIZE delta = repeat_max*(length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
7091                         2 - 2*LINK_SIZE;   /* Last one doesn't nest */
7092             if ((INT64_OR_DOUBLE)repeat_max *
7093                   (INT64_OR_DOUBLE)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
7094                     > (INT64_OR_DOUBLE)INT_MAX ||
7095                 OFLOW_MAX - *lengthptr < delta)
7096               {
7097               *errorcodeptr = ERR20;
7098               return 0;
7099               }
7100             *lengthptr += delta;
7101             }
7102 
7103           /* This is compiling for real */
7104 
7105           else for (i = repeat_max - 1; i >= 0; i--)
7106             {
7107             *code++ = OP_BRAZERO + repeat_type;
7108 
7109             /* All but the final copy start a new nesting, maintaining the
7110             chain of brackets outstanding. */
7111 
7112             if (i != 0)
7113               {
7114               int linkoffset;
7115               *code++ = OP_BRA;
7116               linkoffset = (bralink == NULL)? 0 : (int)(code - bralink);
7117               bralink = code;
7118               PUTINC(code, 0, linkoffset);
7119               }
7120 
7121             memcpy(code, previous, CU2BYTES(len));
7122             code += len;
7123             }
7124 
7125           /* Now chain through the pending brackets, and fill in their length
7126           fields (which are holding the chain links pro tem). */
7127 
7128           while (bralink != NULL)
7129             {
7130             int oldlinkoffset;
7131             int linkoffset = (int)(code - bralink + 1);
7132             PCRE2_UCHAR *bra = code - linkoffset;
7133             oldlinkoffset = GET(bra, 1);
7134             bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
7135             *code++ = OP_KET;
7136             PUTINC(code, 0, linkoffset);
7137             PUT(bra, 1, linkoffset);
7138             }
7139           }
7140 
7141         /* If the maximum is unlimited, set a repeater in the final copy. For
7142         SCRIPT_RUN and ONCE brackets, that's all we need to do. However,
7143         possessively repeated ONCE brackets can be converted into non-capturing
7144         brackets, as the behaviour of (?:xx)++ is the same as (?>xx)++ and this
7145         saves having to deal with possessive ONCEs specially.
7146 
7147         Otherwise, when we are doing the actual compile phase, check to see
7148         whether this group is one that could match an empty string. If so,
7149         convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
7150         that runtime checking can be done. [This check is also applied to ONCE
7151         and SCRIPT_RUN groups at runtime, but in a different way.]
7152 
7153         Then, if the quantifier was possessive and the bracket is not a
7154         conditional, we convert the BRA code to the POS form, and the KET code to
7155         KETRPOS. (It turns out to be convenient at runtime to detect this kind of
7156         subpattern at both the start and at the end.) The use of special opcodes
7157         makes it possible to reduce greatly the stack usage in pcre2_match(). If
7158         the group is preceded by OP_BRAZERO, convert this to OP_BRAPOSZERO.
7159 
7160         Then, if the minimum number of matches is 1 or 0, cancel the possessive
7161         flag so that the default action below, of wrapping everything inside
7162         atomic brackets, does not happen. When the minimum is greater than 1,
7163         there will be earlier copies of the group, and so we still have to wrap
7164         the whole thing. */
7165 
7166         else
7167           {
7168           PCRE2_UCHAR *ketcode = code - 1 - LINK_SIZE;
7169           PCRE2_UCHAR *bracode = ketcode - GET(ketcode, 1);
7170 
7171           /* Convert possessive ONCE brackets to non-capturing */
7172 
7173           if (*bracode == OP_ONCE && possessive_quantifier) *bracode = OP_BRA;
7174 
7175           /* For non-possessive ONCE and for SCRIPT_RUN brackets, all we need
7176           to do is to set the KET. */
7177 
7178           if (*bracode == OP_ONCE || *bracode == OP_SCRIPT_RUN)
7179             *ketcode = OP_KETRMAX + repeat_type;
7180 
7181           /* Handle non-SCRIPT_RUN and non-ONCE brackets and possessive ONCEs
7182           (which have been converted to non-capturing above). */
7183 
7184           else
7185             {
7186             /* In the compile phase, adjust the opcode if the group can match
7187             an empty string. For a conditional group with only one branch, the
7188             value of group_return will not show "could be empty", so we must
7189             check that separately. */
7190 
7191             if (lengthptr == NULL)
7192               {
7193               if (group_return < 0) *bracode += OP_SBRA - OP_BRA;
7194               if (*bracode == OP_COND && bracode[GET(bracode,1)] != OP_ALT)
7195                 *bracode = OP_SCOND;
7196               }
7197 
7198             /* Handle possessive quantifiers. */
7199 
7200             if (possessive_quantifier)
7201               {
7202               /* For COND brackets, we wrap the whole thing in a possessively
7203               repeated non-capturing bracket, because we have not invented POS
7204               versions of the COND opcodes. */
7205 
7206               if (*bracode == OP_COND || *bracode == OP_SCOND)
7207                 {
7208                 int nlen = (int)(code - bracode);
7209                 (void)memmove(bracode + 1 + LINK_SIZE, bracode, CU2BYTES(nlen));
7210                 code += 1 + LINK_SIZE;
7211                 nlen += 1 + LINK_SIZE;
7212                 *bracode = (*bracode == OP_COND)? OP_BRAPOS : OP_SBRAPOS;
7213                 *code++ = OP_KETRPOS;
7214                 PUTINC(code, 0, nlen);
7215                 PUT(bracode, 1, nlen);
7216                 }
7217 
7218               /* For non-COND brackets, we modify the BRA code and use KETRPOS. */
7219 
7220               else
7221                 {
7222                 *bracode += 1;              /* Switch to xxxPOS opcodes */
7223                 *ketcode = OP_KETRPOS;
7224                 }
7225 
7226               /* If the minimum is zero, mark it as possessive, then unset the
7227               possessive flag when the minimum is 0 or 1. */
7228 
7229               if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;
7230               if (repeat_min < 2) possessive_quantifier = FALSE;
7231               }
7232 
7233             /* Non-possessive quantifier */
7234 
7235             else *ketcode = OP_KETRMAX + repeat_type;
7236             }
7237           }
7238         }
7239       break;
7240 
7241       /* If previous was a character type match (\d or similar), abolish it and
7242       create a suitable repeat item. The code is shared with single-character
7243       repeats by setting op_type to add a suitable offset into repeat_type.
7244       Note that the Unicode property types will be present only when
7245       SUPPORT_UNICODE is defined, but we don't wrap the little bits of code
7246       here because it just makes it horribly messy. */
7247 
7248       default:
7249       if (op_previous >= OP_EODN)   /* Not a character type - internal error */
7250         {
7251         *errorcodeptr = ERR10;
7252         return 0;
7253         }
7254       else
7255         {
7256         int prop_type, prop_value;
7257         PCRE2_UCHAR *oldcode;
7258 
7259         op_type = OP_TYPESTAR - OP_STAR;      /* Use type opcodes */
7260         mclength = 0;                         /* Not a character */
7261 
7262         if (op_previous == OP_PROP || op_previous == OP_NOTPROP)
7263           {
7264           prop_type = previous[1];
7265           prop_value = previous[2];
7266           }
7267         else
7268           {
7269           /* Come here from just above with a character in mcbuffer/mclength. */
7270           OUTPUT_SINGLE_REPEAT:
7271           prop_type = prop_value = -1;
7272           }
7273 
7274         /* At this point, if prop_type == prop_value == -1 we either have a
7275         character in mcbuffer when mclength is greater than zero, or we have
7276         mclength zero, in which case there is a non-property character type in
7277         op_previous. If prop_type/value are not negative, we have a property
7278         character type in op_previous. */
7279 
7280         oldcode = code;                   /* Save where we were */
7281         code = previous;                  /* Usually overwrite previous item */
7282 
7283         /* If the maximum is zero then the minimum must also be zero; Perl allows
7284         this case, so we do too - by simply omitting the item altogether. */
7285 
7286         if (repeat_max == 0) goto END_REPEAT;
7287 
7288         /* Combine the op_type with the repeat_type */
7289 
7290         repeat_type += op_type;
7291 
7292         /* A minimum of zero is handled either as the special case * or ?, or as
7293         an UPTO, with the maximum given. */
7294 
7295         if (repeat_min == 0)
7296           {
7297           if (repeat_max == REPEAT_UNLIMITED) *code++ = OP_STAR + repeat_type;
7298             else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
7299           else
7300             {
7301             *code++ = OP_UPTO + repeat_type;
7302             PUT2INC(code, 0, repeat_max);
7303             }
7304           }
7305 
7306         /* A repeat minimum of 1 is optimized into some special cases. If the
7307         maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
7308         left in place and, if the maximum is greater than 1, we use OP_UPTO with
7309         one less than the maximum. */
7310 
7311         else if (repeat_min == 1)
7312           {
7313           if (repeat_max == REPEAT_UNLIMITED)
7314             *code++ = OP_PLUS + repeat_type;
7315           else
7316             {
7317             code = oldcode;  /* Leave previous item in place */
7318             if (repeat_max == 1) goto END_REPEAT;
7319             *code++ = OP_UPTO + repeat_type;
7320             PUT2INC(code, 0, repeat_max - 1);
7321             }
7322           }
7323 
7324         /* The case {n,n} is just an EXACT, while the general case {n,m} is
7325         handled as an EXACT followed by an UPTO or STAR or QUERY. */
7326 
7327         else
7328           {
7329           *code++ = OP_EXACT + op_type;  /* NB EXACT doesn't have repeat_type */
7330           PUT2INC(code, 0, repeat_min);
7331 
7332           /* Unless repeat_max equals repeat_min, fill in the data for EXACT,
7333           and then generate the second opcode. For a repeated Unicode property
7334           match, there are two extra values that define the required property,
7335           and mclength is set zero to indicate this. */
7336 
7337           if (repeat_max != repeat_min)
7338             {
7339             if (mclength > 0)
7340               {
7341               memcpy(code, mcbuffer, CU2BYTES(mclength));
7342               code += mclength;
7343               }
7344             else
7345               {
7346               *code++ = op_previous;
7347               if (prop_type >= 0)
7348                 {
7349                 *code++ = prop_type;
7350                 *code++ = prop_value;
7351                 }
7352               }
7353 
7354             /* Now set up the following opcode */
7355 
7356             if (repeat_max == REPEAT_UNLIMITED)
7357               *code++ = OP_STAR + repeat_type;
7358             else
7359               {
7360               repeat_max -= repeat_min;
7361               if (repeat_max == 1)
7362                 {
7363                 *code++ = OP_QUERY + repeat_type;
7364                 }
7365               else
7366                 {
7367                 *code++ = OP_UPTO + repeat_type;
7368                 PUT2INC(code, 0, repeat_max);
7369                 }
7370               }
7371             }
7372           }
7373 
7374         /* Fill in the character or character type for the final opcode. */
7375 
7376         if (mclength > 0)
7377           {
7378           memcpy(code, mcbuffer, CU2BYTES(mclength));
7379           code += mclength;
7380           }
7381         else
7382           {
7383           *code++ = op_previous;
7384           if (prop_type >= 0)
7385             {
7386             *code++ = prop_type;
7387             *code++ = prop_value;
7388             }
7389           }
7390         }
7391       break;
7392       }  /* End of switch on different op_previous values */
7393 
7394 
7395     /* If the character following a repeat is '+', possessive_quantifier is
7396     TRUE. For some opcodes, there are special alternative opcodes for this
7397     case. For anything else, we wrap the entire repeated item inside OP_ONCE
7398     brackets. Logically, the '+' notation is just syntactic sugar, taken from
7399     Sun's Java package, but the special opcodes can optimize it.
7400 
7401     Some (but not all) possessively repeated subpatterns have already been
7402     completely handled in the code just above. For them, possessive_quantifier
7403     is always FALSE at this stage. Note that the repeated item starts at
7404     tempcode, not at previous, which might be the first part of a string whose
7405     (former) last char we repeated. */
7406 
7407     if (possessive_quantifier)
7408       {
7409       int len;
7410 
7411       /* Possessifying an EXACT quantifier has no effect, so we can ignore it.
7412       However, QUERY, STAR, or UPTO may follow (for quantifiers such as {5,6},
7413       {5,}, or {5,10}). We skip over an EXACT item; if the length of what
7414       remains is greater than zero, there's a further opcode that can be
7415       handled. If not, do nothing, leaving the EXACT alone. */
7416 
7417       switch(*tempcode)
7418         {
7419         case OP_TYPEEXACT:
7420         tempcode += PRIV(OP_lengths)[*tempcode] +
7421           ((tempcode[1 + IMM2_SIZE] == OP_PROP
7422           || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
7423         break;
7424 
7425         /* CHAR opcodes are used for exacts whose count is 1. */
7426 
7427         case OP_CHAR:
7428         case OP_CHARI:
7429         case OP_NOT:
7430         case OP_NOTI:
7431         case OP_EXACT:
7432         case OP_EXACTI:
7433         case OP_NOTEXACT:
7434         case OP_NOTEXACTI:
7435         tempcode += PRIV(OP_lengths)[*tempcode];
7436 #ifdef SUPPORT_UNICODE
7437         if (utf && HAS_EXTRALEN(tempcode[-1]))
7438           tempcode += GET_EXTRALEN(tempcode[-1]);
7439 #endif
7440         break;
7441 
7442         /* For the class opcodes, the repeat operator appears at the end;
7443         adjust tempcode to point to it. */
7444 
7445         case OP_CLASS:
7446         case OP_NCLASS:
7447         tempcode += 1 + 32/sizeof(PCRE2_UCHAR);
7448         break;
7449 
7450 #ifdef SUPPORT_WIDE_CHARS
7451         case OP_XCLASS:
7452         tempcode += GET(tempcode, 1);
7453         break;
7454 #endif
7455         }
7456 
7457       /* If tempcode is equal to code (which points to the end of the repeated
7458       item), it means we have skipped an EXACT item but there is no following
7459       QUERY, STAR, or UPTO; the value of len will be 0, and we do nothing. In
7460       all other cases, tempcode will be pointing to the repeat opcode, and will
7461       be less than code, so the value of len will be greater than 0. */
7462 
7463       len = (int)(code - tempcode);
7464       if (len > 0)
7465         {
7466         unsigned int repcode = *tempcode;
7467 
7468         /* There is a table for possessifying opcodes, all of which are less
7469         than OP_CALLOUT. A zero entry means there is no possessified version.
7470         */
7471 
7472         if (repcode < OP_CALLOUT && opcode_possessify[repcode] > 0)
7473           *tempcode = opcode_possessify[repcode];
7474 
7475         /* For opcode without a special possessified version, wrap the item in
7476         ONCE brackets. */
7477 
7478         else
7479           {
7480           (void)memmove(tempcode + 1 + LINK_SIZE, tempcode, CU2BYTES(len));
7481           code += 1 + LINK_SIZE;
7482           len += 1 + LINK_SIZE;
7483           tempcode[0] = OP_ONCE;
7484           *code++ = OP_KET;
7485           PUTINC(code, 0, len);
7486           PUT(tempcode, 1, len);
7487           }
7488         }
7489       }
7490 
7491     /* We set the "follows varying string" flag for subsequently encountered
7492     reqcus if it isn't already set and we have just passed a varying length
7493     item. */
7494 
7495     END_REPEAT:
7496     cb->req_varyopt |= reqvary;
7497     break;
7498 
7499 
7500     /* ===================================================================*/
7501     /* Handle a 32-bit data character with a value greater than META_END. */
7502 
7503     case META_BIGVALUE:
7504     pptr++;
7505     goto NORMAL_CHAR;
7506 
7507 
7508     /* ===============================================================*/
7509     /* Handle a back reference by number, which is the meta argument. The
7510     pattern offsets for back references to group numbers less than 10 are held
7511     in a special vector, to avoid using more than two parsed pattern elements
7512     in 64-bit environments. We only need the offset to the first occurrence,
7513     because if that doesn't fail, subsequent ones will also be OK. */
7514 
7515     case META_BACKREF:
7516     if (meta_arg < 10) offset = cb->small_ref_offset[meta_arg];
7517       else GETPLUSOFFSET(offset, pptr);
7518 
7519     if (meta_arg > cb->bracount)
7520       {
7521       cb->erroroffset = offset;
7522       *errorcodeptr = ERR15;  /* Non-existent subpattern */
7523       return 0;
7524       }
7525 
7526     /* Come here from named backref handling when the reference is to a
7527     single group (that is, not to a duplicated name). The back reference
7528     data will have already been updated. We must disable firstcu if not
7529     set, to cope with cases like (?=(\w+))\1: which would otherwise set ':'
7530     later. */
7531 
7532     HANDLE_SINGLE_REFERENCE:
7533     if (firstcuflags == REQ_UNSET) zerofirstcuflags = firstcuflags = REQ_NONE;
7534     *code++ = ((options & PCRE2_CASELESS) != 0)? OP_REFI : OP_REF;
7535     PUT2INC(code, 0, meta_arg);
7536 
7537     /* Update the map of back references, and keep the highest one. We
7538     could do this in parse_regex() for numerical back references, but not
7539     for named back references, because we don't know the numbers to which
7540     named back references refer. So we do it all in this function. */
7541 
7542     cb->backref_map |= (meta_arg < 32)? (1u << meta_arg) : 1;
7543     if (meta_arg > cb->top_backref) cb->top_backref = meta_arg;
7544 
7545     /* Check to see if this back reference is recursive, that it, it
7546     is inside the group that it references. A flag is set so that the
7547     group can be made atomic. */
7548 
7549     for (oc = cb->open_caps; oc != NULL; oc = oc->next)
7550       {
7551       if (oc->number == meta_arg)
7552         {
7553         oc->flag = TRUE;
7554         break;
7555         }
7556       }
7557     break;
7558 
7559 
7560     /* ===============================================================*/
7561     /* Handle recursion by inserting the number of the called group (which is
7562     the meta argument) after OP_RECURSE. At the end of compiling the pattern is
7563     scanned and these numbers are replaced by offsets within the pattern. It is
7564     done like this to avoid problems with forward references and adjusting
7565     offsets when groups are duplicated and moved (as discovered in previous
7566     implementations). Note that a recursion does not have a set first
7567     character. */
7568 
7569     case META_RECURSE:
7570     GETPLUSOFFSET(offset, pptr);
7571     if (meta_arg > cb->bracount)
7572       {
7573       cb->erroroffset = offset;
7574       *errorcodeptr = ERR15;  /* Non-existent subpattern */
7575       return 0;
7576       }
7577     HANDLE_NUMERICAL_RECURSION:
7578     *code = OP_RECURSE;
7579     PUT(code, 1, meta_arg);
7580     code += 1 + LINK_SIZE;
7581     groupsetfirstcu = FALSE;
7582     cb->had_recurse = TRUE;
7583     if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
7584     zerofirstcu = firstcu;
7585     zerofirstcuflags = firstcuflags;
7586     break;
7587 
7588 
7589     /* ===============================================================*/
7590     /* Handle capturing parentheses; the number is the meta argument. */
7591 
7592     case META_CAPTURE:
7593     bravalue = OP_CBRA;
7594     skipunits = IMM2_SIZE;
7595     PUT2(code, 1+LINK_SIZE, meta_arg);
7596     cb->lastcapture = meta_arg;
7597     goto GROUP_PROCESS_NOTE_EMPTY;
7598 
7599 
7600     /* ===============================================================*/
7601     /* Handle escape sequence items. For ones like \d, the ESC_values are
7602     arranged to be the same as the corresponding OP_values in the default case
7603     when PCRE2_UCP is not set (which is the only case in which they will appear
7604     here).
7605 
7606     Note: \Q and \E are never seen here, as they were dealt with in
7607     parse_pattern(). Neither are numerical back references or recursions, which
7608     were turned into META_BACKREF or META_RECURSE items, respectively. \k and
7609     \g, when followed by names, are turned into META_BACKREF_BYNAME or
7610     META_RECURSE_BYNAME. */
7611 
7612     case META_ESCAPE:
7613 
7614     /* We can test for escape sequences that consume a character because their
7615     values lie between ESC_b and ESC_Z; this may have to change if any new ones
7616     are ever created. For these sequences, we disable the setting of a first
7617     character if it hasn't already been set. */
7618 
7619     if (meta_arg > ESC_b && meta_arg < ESC_Z)
7620       {
7621       matched_char = TRUE;
7622       if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
7623       }
7624 
7625     /* Set values to reset to if this is followed by a zero repeat. */
7626 
7627     zerofirstcu = firstcu;
7628     zerofirstcuflags = firstcuflags;
7629     zeroreqcu = reqcu;
7630     zeroreqcuflags = reqcuflags;
7631 
7632     /* If Unicode is not supported, \P and \p are not allowed and are
7633     faulted at parse time, so will never appear here. */
7634 
7635 #ifdef SUPPORT_UNICODE
7636     if (meta_arg == ESC_P || meta_arg == ESC_p)
7637       {
7638       uint32_t ptype = *(++pptr) >> 16;
7639       uint32_t pdata = *pptr & 0xffff;
7640 
7641       /* The special case of \p{Any} is compiled to OP_ALLANY so as to benefit
7642       from the auto-anchoring code. */
7643 
7644       if (meta_arg == ESC_p && ptype == PT_ANY)
7645         {
7646         *code++ = OP_ALLANY;
7647         }
7648       else
7649         {
7650         *code++ = (meta_arg == ESC_p)? OP_PROP : OP_NOTPROP;
7651         *code++ = ptype;
7652         *code++ = pdata;
7653         }
7654       break;  /* End META_ESCAPE */
7655       }
7656 #endif
7657 
7658     /* For the rest (including \X when Unicode is supported - if not it's
7659     faulted at parse time), the OP value is the escape value when PCRE2_UCP is
7660     not set; if it is set, these escapes do not show up here because they are
7661     converted into Unicode property tests in parse_regex(). Note that \b and \B
7662     do a one-character lookbehind, and \A also behaves as if it does. */
7663 
7664     if (meta_arg == ESC_C) cb->external_flags |= PCRE2_HASBKC; /* Record */
7665     if ((meta_arg == ESC_b || meta_arg == ESC_B || meta_arg == ESC_A) &&
7666          cb->max_lookbehind == 0)
7667       cb->max_lookbehind = 1;
7668 
7669     /* In non-UTF mode, and for both 32-bit modes, we turn \C into OP_ALLANY
7670     instead of OP_ANYBYTE so that it works in DFA mode and in lookbehinds. */
7671 
7672 #if PCRE2_CODE_UNIT_WIDTH == 32
7673     *code++ = (meta_arg == ESC_C)? OP_ALLANY : meta_arg;
7674 #else
7675     *code++ = (!utf && meta_arg == ESC_C)? OP_ALLANY : meta_arg;
7676 #endif
7677     break;  /* End META_ESCAPE */
7678 
7679 
7680     /* ===================================================================*/
7681     /* Handle an unrecognized meta value. A parsed pattern value less than
7682     META_END is a literal. Otherwise we have a problem. */
7683 
7684     default:
7685     if (meta >= META_END)
7686       {
7687 #ifdef DEBUG_SHOW_PARSED
7688       fprintf(stderr, "** Unrecognized parsed pattern item 0x%.8x\n", *pptr);
7689 #endif
7690       *errorcodeptr = ERR89;  /* Internal error - unrecognized. */
7691       return 0;
7692       }
7693 
7694     /* Handle a literal character. We come here by goto in the case of a
7695     32-bit, non-UTF character whose value is greater than META_END. */
7696 
7697     NORMAL_CHAR:
7698     meta = *pptr;     /* Get the full 32 bits */
7699     NORMAL_CHAR_SET:  /* Character is already in meta */
7700     matched_char = TRUE;
7701 
7702     /* For caseless UTF mode, check whether this character has more than one
7703     other case. If so, generate a special OP_PROP item instead of OP_CHARI. */
7704 
7705 #ifdef SUPPORT_UNICODE
7706     if (utf && (options & PCRE2_CASELESS) != 0)
7707       {
7708       uint32_t caseset = UCD_CASESET(meta);
7709       if (caseset != 0)
7710         {
7711         *code++ = OP_PROP;
7712         *code++ = PT_CLIST;
7713         *code++ = caseset;
7714         if (firstcuflags == REQ_UNSET)
7715           firstcuflags = zerofirstcuflags = REQ_NONE;
7716         break;  /* End handling this meta item */
7717         }
7718       }
7719 #endif
7720 
7721     /* Caseful matches, or not one of the multicase characters. Get the
7722     character's code units into mcbuffer, with the length in mclength. When not
7723     in UTF mode, the length is always 1. */
7724 
7725 #ifdef SUPPORT_UNICODE
7726     if (utf) mclength = PRIV(ord2utf)(meta, mcbuffer); else
7727 #endif
7728       {
7729       mclength = 1;
7730       mcbuffer[0] = meta;
7731       }
7732 
7733     /* Generate the appropriate code */
7734 
7735     *code++ = ((options & PCRE2_CASELESS) != 0)? OP_CHARI : OP_CHAR;
7736     memcpy(code, mcbuffer, CU2BYTES(mclength));
7737     code += mclength;
7738 
7739     /* Remember if \r or \n were seen */
7740 
7741     if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL)
7742       cb->external_flags |= PCRE2_HASCRORLF;
7743 
7744     /* Set the first and required code units appropriately. If no previous
7745     first code unit, set it from this character, but revert to none on a zero
7746     repeat. Otherwise, leave the firstcu value alone, and don't change it on
7747     a zero repeat. */
7748 
7749     if (firstcuflags == REQ_UNSET)
7750       {
7751       zerofirstcuflags = REQ_NONE;
7752       zeroreqcu = reqcu;
7753       zeroreqcuflags = reqcuflags;
7754 
7755       /* If the character is more than one code unit long, we can set firstcu
7756       only if it is not to be matched caselessly. */
7757 
7758       if (mclength == 1 || req_caseopt == 0)
7759         {
7760         firstcu = mcbuffer[0];
7761         firstcuflags = req_caseopt;
7762         if (mclength != 1)
7763           {
7764           reqcu = code[-1];
7765           reqcuflags = cb->req_varyopt;
7766           }
7767         }
7768       else firstcuflags = reqcuflags = REQ_NONE;
7769       }
7770 
7771     /* firstcu was previously set; we can set reqcu only if the length is
7772     1 or the matching is caseful. */
7773 
7774     else
7775       {
7776       zerofirstcu = firstcu;
7777       zerofirstcuflags = firstcuflags;
7778       zeroreqcu = reqcu;
7779       zeroreqcuflags = reqcuflags;
7780       if (mclength == 1 || req_caseopt == 0)
7781         {
7782         reqcu = code[-1];
7783         reqcuflags = req_caseopt | cb->req_varyopt;
7784         }
7785       }
7786     break;    /* End default meta handling */
7787     }         /* End of big switch */
7788   }           /* End of big loop */
7789 
7790 /* Control never reaches here. */
7791 }
7792 
7793 
7794 
7795 /*************************************************
7796 *   Compile regex: a sequence of alternatives    *
7797 *************************************************/
7798 
7799 /* On entry, pptr is pointing past the bracket meta, but on return it points to
7800 the closing bracket or META_END. The code variable is pointing at the code unit
7801 into which the BRA operator has been stored. This function is used during the
7802 pre-compile phase when we are trying to find out the amount of memory needed,
7803 as well as during the real compile phase. The value of lengthptr distinguishes
7804 the two phases.
7805 
7806 Arguments:
7807   options           option bits, including any changes for this subpattern
7808   codeptr           -> the address of the current code pointer
7809   pptrptr           -> the address of the current parsed pattern pointer
7810   errorcodeptr      -> pointer to error code variable
7811   skipunits         skip this many code units at start (for brackets and OP_COND)
7812   firstcuptr        place to put the first required code unit
7813   firstcuflagsptr   place to put the first code unit flags, or a negative number
7814   reqcuptr          place to put the last required code unit
7815   reqcuflagsptr     place to put the last required code unit flags, or a negative number
7816   bcptr             pointer to the chain of currently open branches
7817   cb                points to the data block with tables pointers etc.
7818   lengthptr         NULL during the real compile phase
7819                     points to length accumulator during pre-compile phase
7820 
7821 Returns:            0 There has been an error
7822                    +1 Success, this group must match at least one character
7823                    -1 Success, this group may match an empty string
7824 */
7825 
7826 static int
compile_regex(uint32_t options,PCRE2_UCHAR ** codeptr,uint32_t ** pptrptr,int * errorcodeptr,uint32_t skipunits,uint32_t * firstcuptr,int32_t * firstcuflagsptr,uint32_t * reqcuptr,int32_t * reqcuflagsptr,branch_chain * bcptr,compile_block * cb,PCRE2_SIZE * lengthptr)7827 compile_regex(uint32_t options, PCRE2_UCHAR **codeptr, uint32_t **pptrptr,
7828   int *errorcodeptr, uint32_t skipunits, uint32_t *firstcuptr,
7829   int32_t *firstcuflagsptr, uint32_t *reqcuptr,int32_t *reqcuflagsptr,
7830   branch_chain *bcptr, compile_block *cb, PCRE2_SIZE *lengthptr)
7831 {
7832 PCRE2_UCHAR *code = *codeptr;
7833 PCRE2_UCHAR *last_branch = code;
7834 PCRE2_UCHAR *start_bracket = code;
7835 BOOL lookbehind;
7836 open_capitem capitem;
7837 int capnumber = 0;
7838 int okreturn = 1;
7839 uint32_t *pptr = *pptrptr;
7840 uint32_t firstcu, reqcu;
7841 uint32_t lookbehindlength;
7842 int32_t firstcuflags, reqcuflags;
7843 uint32_t branchfirstcu, branchreqcu;
7844 int32_t branchfirstcuflags, branchreqcuflags;
7845 PCRE2_SIZE length;
7846 branch_chain bc;
7847 
7848 /* If set, call the external function that checks for stack availability. */
7849 
7850 if (cb->cx->stack_guard != NULL &&
7851     cb->cx->stack_guard(cb->parens_depth, cb->cx->stack_guard_data))
7852   {
7853   *errorcodeptr= ERR33;
7854   return 0;
7855   }
7856 
7857 /* Miscellaneous initialization */
7858 
7859 bc.outer = bcptr;
7860 bc.current_branch = code;
7861 
7862 firstcu = reqcu = 0;
7863 firstcuflags = reqcuflags = REQ_UNSET;
7864 
7865 /* Accumulate the length for use in the pre-compile phase. Start with the
7866 length of the BRA and KET and any extra code units that are required at the
7867 beginning. We accumulate in a local variable to save frequent testing of
7868 lengthptr for NULL. We cannot do this by looking at the value of 'code' at the
7869 start and end of each alternative, because compiled items are discarded during
7870 the pre-compile phase so that the workspace is not exceeded. */
7871 
7872 length = 2 + 2*LINK_SIZE + skipunits;
7873 
7874 /* Remember if this is a lookbehind assertion, and if it is, save its length
7875 and skip over the pattern offset. */
7876 
7877 lookbehind = *code == OP_ASSERTBACK || *code == OP_ASSERTBACK_NOT;
7878 if (lookbehind)
7879   {
7880   lookbehindlength = META_DATA(pptr[-1]);
7881   pptr += SIZEOFFSET;
7882   }
7883 else lookbehindlength = 0;
7884 
7885 /* If this is a capturing subpattern, add to the chain of open capturing items
7886 so that we can detect them if (*ACCEPT) is encountered. Note that only OP_CBRA
7887 need be tested here; changing this opcode to one of its variants, e.g.
7888 OP_SCBRAPOS, happens later, after the group has been compiled. */
7889 
7890 if (*code == OP_CBRA)
7891   {
7892   capnumber = GET2(code, 1 + LINK_SIZE);
7893   capitem.number = capnumber;
7894   capitem.next = cb->open_caps;
7895   capitem.flag = FALSE;
7896   capitem.assert_depth = cb->assert_depth;
7897   cb->open_caps = &capitem;
7898   }
7899 
7900 /* Offset is set zero to mark that this bracket is still open */
7901 
7902 PUT(code, 1, 0);
7903 code += 1 + LINK_SIZE + skipunits;
7904 
7905 /* Loop for each alternative branch */
7906 
7907 for (;;)
7908   {
7909   int branch_return;
7910 
7911   /* Insert OP_REVERSE if this is as lookbehind assertion. */
7912 
7913   if (lookbehind && lookbehindlength > 0)
7914     {
7915     *code++ = OP_REVERSE;
7916     PUTINC(code, 0, lookbehindlength);
7917     length += 1 + LINK_SIZE;
7918     }
7919 
7920   /* Now compile the branch; in the pre-compile phase its length gets added
7921   into the length. */
7922 
7923   if ((branch_return =
7924         compile_branch(&options, &code, &pptr, errorcodeptr, &branchfirstcu,
7925           &branchfirstcuflags, &branchreqcu, &branchreqcuflags, &bc,
7926           cb, (lengthptr == NULL)? NULL : &length)) == 0)
7927     return 0;
7928 
7929   /* If a branch can match an empty string, so can the whole group. */
7930 
7931   if (branch_return < 0) okreturn = -1;
7932 
7933   /* In the real compile phase, there is some post-processing to be done. */
7934 
7935   if (lengthptr == NULL)
7936     {
7937     /* If this is the first branch, the firstcu and reqcu values for the
7938     branch become the values for the regex. */
7939 
7940     if (*last_branch != OP_ALT)
7941       {
7942       firstcu = branchfirstcu;
7943       firstcuflags = branchfirstcuflags;
7944       reqcu = branchreqcu;
7945       reqcuflags = branchreqcuflags;
7946       }
7947 
7948     /* If this is not the first branch, the first char and reqcu have to
7949     match the values from all the previous branches, except that if the
7950     previous value for reqcu didn't have REQ_VARY set, it can still match,
7951     and we set REQ_VARY for the regex. */
7952 
7953     else
7954       {
7955       /* If we previously had a firstcu, but it doesn't match the new branch,
7956       we have to abandon the firstcu for the regex, but if there was
7957       previously no reqcu, it takes on the value of the old firstcu. */
7958 
7959       if (firstcuflags != branchfirstcuflags || firstcu != branchfirstcu)
7960         {
7961         if (firstcuflags >= 0)
7962           {
7963           if (reqcuflags < 0)
7964             {
7965             reqcu = firstcu;
7966             reqcuflags = firstcuflags;
7967             }
7968           }
7969         firstcuflags = REQ_NONE;
7970         }
7971 
7972       /* If we (now or from before) have no firstcu, a firstcu from the
7973       branch becomes a reqcu if there isn't a branch reqcu. */
7974 
7975       if (firstcuflags < 0 && branchfirstcuflags >= 0 &&
7976           branchreqcuflags < 0)
7977         {
7978         branchreqcu = branchfirstcu;
7979         branchreqcuflags = branchfirstcuflags;
7980         }
7981 
7982       /* Now ensure that the reqcus match */
7983 
7984       if (((reqcuflags & ~REQ_VARY) != (branchreqcuflags & ~REQ_VARY)) ||
7985           reqcu != branchreqcu)
7986         reqcuflags = REQ_NONE;
7987       else
7988         {
7989         reqcu = branchreqcu;
7990         reqcuflags |= branchreqcuflags; /* To "or" REQ_VARY */
7991         }
7992       }
7993     }
7994 
7995   /* Handle reaching the end of the expression, either ')' or end of pattern.
7996   In the real compile phase, go back through the alternative branches and
7997   reverse the chain of offsets, with the field in the BRA item now becoming an
7998   offset to the first alternative. If there are no alternatives, it points to
7999   the end of the group. The length in the terminating ket is always the length
8000   of the whole bracketed item. Return leaving the pointer at the terminating
8001   char. */
8002 
8003   if (META_CODE(*pptr) != META_ALT)
8004     {
8005     if (lengthptr == NULL)
8006       {
8007       PCRE2_SIZE branch_length = code - last_branch;
8008       do
8009         {
8010         PCRE2_SIZE prev_length = GET(last_branch, 1);
8011         PUT(last_branch, 1, branch_length);
8012         branch_length = prev_length;
8013         last_branch -= branch_length;
8014         }
8015       while (branch_length > 0);
8016       }
8017 
8018     /* Fill in the ket */
8019 
8020     *code = OP_KET;
8021     PUT(code, 1, (int)(code - start_bracket));
8022     code += 1 + LINK_SIZE;
8023 
8024     /* If it was a capturing subpattern, check to see if it contained any
8025     recursive back references. If so, we must wrap it in atomic brackets. In
8026     any event, remove the block from the chain. */
8027 
8028     if (capnumber > 0)
8029       {
8030       if (cb->open_caps->flag)
8031         {
8032         (void)memmove(start_bracket + 1 + LINK_SIZE, start_bracket,
8033           CU2BYTES(code - start_bracket));
8034         *start_bracket = OP_ONCE;
8035         code += 1 + LINK_SIZE;
8036         PUT(start_bracket, 1, (int)(code - start_bracket));
8037         *code = OP_KET;
8038         PUT(code, 1, (int)(code - start_bracket));
8039         code += 1 + LINK_SIZE;
8040         length += 2 + 2*LINK_SIZE;
8041         }
8042       cb->open_caps = cb->open_caps->next;
8043       }
8044 
8045     /* Set values to pass back */
8046 
8047     *codeptr = code;
8048     *pptrptr = pptr;
8049     *firstcuptr = firstcu;
8050     *firstcuflagsptr = firstcuflags;
8051     *reqcuptr = reqcu;
8052     *reqcuflagsptr = reqcuflags;
8053     if (lengthptr != NULL)
8054       {
8055       if (OFLOW_MAX - *lengthptr < length)
8056         {
8057         *errorcodeptr = ERR20;
8058         return 0;
8059         }
8060       *lengthptr += length;
8061       }
8062     return okreturn;
8063     }
8064 
8065   /* Another branch follows. In the pre-compile phase, we can move the code
8066   pointer back to where it was for the start of the first branch. (That is,
8067   pretend that each branch is the only one.)
8068 
8069   In the real compile phase, insert an ALT node. Its length field points back
8070   to the previous branch while the bracket remains open. At the end the chain
8071   is reversed. It's done like this so that the start of the bracket has a
8072   zero offset until it is closed, making it possible to detect recursion. */
8073 
8074   if (lengthptr != NULL)
8075     {
8076     code = *codeptr + 1 + LINK_SIZE + skipunits;
8077     length += 1 + LINK_SIZE;
8078     }
8079   else
8080     {
8081     *code = OP_ALT;
8082     PUT(code, 1, (int)(code - last_branch));
8083     bc.current_branch = last_branch = code;
8084     code += 1 + LINK_SIZE;
8085     }
8086 
8087   /* Set the lookbehind length (if not in a lookbehind the value will be zero)
8088   and then advance past the vertical bar. */
8089 
8090   lookbehindlength = META_DATA(*pptr);
8091   pptr++;
8092   }
8093 /* Control never reaches here */
8094 }
8095 
8096 
8097 
8098 /*************************************************
8099 *          Check for anchored pattern            *
8100 *************************************************/
8101 
8102 /* Try to find out if this is an anchored regular expression. Consider each
8103 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
8104 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
8105 it's anchored. However, if this is a multiline pattern, then only OP_SOD will
8106 be found, because ^ generates OP_CIRCM in that mode.
8107 
8108 We can also consider a regex to be anchored if OP_SOM starts all its branches.
8109 This is the code for \G, which means "match at start of match position, taking
8110 into account the match offset".
8111 
8112 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
8113 because that will try the rest of the pattern at all possible matching points,
8114 so there is no point trying again.... er ....
8115 
8116 .... except when the .* appears inside capturing parentheses, and there is a
8117 subsequent back reference to those parentheses. We haven't enough information
8118 to catch that case precisely.
8119 
8120 At first, the best we could do was to detect when .* was in capturing brackets
8121 and the highest back reference was greater than or equal to that level.
8122 However, by keeping a bitmap of the first 31 back references, we can catch some
8123 of the more common cases more precisely.
8124 
8125 ... A second exception is when the .* appears inside an atomic group, because
8126 this prevents the number of characters it matches from being adjusted.
8127 
8128 Arguments:
8129   code           points to start of the compiled pattern
8130   bracket_map    a bitmap of which brackets we are inside while testing; this
8131                    handles up to substring 31; after that we just have to take
8132                    the less precise approach
8133   cb             points to the compile data block
8134   atomcount      atomic group level
8135   inassert       TRUE if in an assertion
8136 
8137 Returns:     TRUE or FALSE
8138 */
8139 
8140 static BOOL
is_anchored(PCRE2_SPTR code,unsigned int bracket_map,compile_block * cb,int atomcount,BOOL inassert)8141 is_anchored(PCRE2_SPTR code, unsigned int bracket_map, compile_block *cb,
8142   int atomcount, BOOL inassert)
8143 {
8144 do {
8145    PCRE2_SPTR scode = first_significant_code(
8146      code + PRIV(OP_lengths)[*code], FALSE);
8147    int op = *scode;
8148 
8149    /* Non-capturing brackets */
8150 
8151    if (op == OP_BRA  || op == OP_BRAPOS ||
8152        op == OP_SBRA || op == OP_SBRAPOS)
8153      {
8154      if (!is_anchored(scode, bracket_map, cb, atomcount, inassert))
8155        return FALSE;
8156      }
8157 
8158    /* Capturing brackets */
8159 
8160    else if (op == OP_CBRA  || op == OP_CBRAPOS ||
8161             op == OP_SCBRA || op == OP_SCBRAPOS)
8162      {
8163      int n = GET2(scode, 1+LINK_SIZE);
8164      int new_map = bracket_map | ((n < 32)? (1u << n) : 1);
8165      if (!is_anchored(scode, new_map, cb, atomcount, inassert)) return FALSE;
8166      }
8167 
8168    /* Positive forward assertion */
8169 
8170    else if (op == OP_ASSERT)
8171      {
8172      if (!is_anchored(scode, bracket_map, cb, atomcount, TRUE)) return FALSE;
8173      }
8174 
8175    /* Condition. If there is no second branch, it can't be anchored. */
8176 
8177    else if (op == OP_COND || op == OP_SCOND)
8178      {
8179      if (scode[GET(scode,1)] != OP_ALT) return FALSE;
8180      if (!is_anchored(scode, bracket_map, cb, atomcount, inassert))
8181        return FALSE;
8182      }
8183 
8184    /* Atomic groups */
8185 
8186    else if (op == OP_ONCE)
8187      {
8188      if (!is_anchored(scode, bracket_map, cb, atomcount + 1, inassert))
8189        return FALSE;
8190      }
8191 
8192    /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
8193    it isn't in brackets that are or may be referenced or inside an atomic
8194    group or an assertion. Also the pattern must not contain *PRUNE or *SKIP,
8195    because these break the feature. Consider, for example, /(?s).*?(*PRUNE)b/
8196    with the subject "aab", which matches "b", i.e. not at the start of a line.
8197    There is also an option that disables auto-anchoring. */
8198 
8199    else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
8200              op == OP_TYPEPOSSTAR))
8201      {
8202      if (scode[1] != OP_ALLANY || (bracket_map & cb->backref_map) != 0 ||
8203          atomcount > 0 || cb->had_pruneorskip || inassert ||
8204          (cb->external_options & PCRE2_NO_DOTSTAR_ANCHOR) != 0)
8205        return FALSE;
8206      }
8207 
8208    /* Check for explicit anchoring */
8209 
8210    else if (op != OP_SOD && op != OP_SOM && op != OP_CIRC) return FALSE;
8211 
8212    code += GET(code, 1);
8213    }
8214 while (*code == OP_ALT);   /* Loop for each alternative */
8215 return TRUE;
8216 }
8217 
8218 
8219 
8220 /*************************************************
8221 *         Check for starting with ^ or .*        *
8222 *************************************************/
8223 
8224 /* This is called to find out if every branch starts with ^ or .* so that
8225 "first char" processing can be done to speed things up in multiline
8226 matching and for non-DOTALL patterns that start with .* (which must start at
8227 the beginning or after \n). As in the case of is_anchored() (see above), we
8228 have to take account of back references to capturing brackets that contain .*
8229 because in that case we can't make the assumption. Also, the appearance of .*
8230 inside atomic brackets or in an assertion, or in a pattern that contains *PRUNE
8231 or *SKIP does not count, because once again the assumption no longer holds.
8232 
8233 Arguments:
8234   code           points to start of the compiled pattern or a group
8235   bracket_map    a bitmap of which brackets we are inside while testing; this
8236                    handles up to substring 31; after that we just have to take
8237                    the less precise approach
8238   cb             points to the compile data
8239   atomcount      atomic group level
8240   inassert       TRUE if in an assertion
8241 
8242 Returns:         TRUE or FALSE
8243 */
8244 
8245 static BOOL
is_startline(PCRE2_SPTR code,unsigned int bracket_map,compile_block * cb,int atomcount,BOOL inassert)8246 is_startline(PCRE2_SPTR code, unsigned int bracket_map, compile_block *cb,
8247   int atomcount, BOOL inassert)
8248 {
8249 do {
8250    PCRE2_SPTR scode = first_significant_code(
8251      code + PRIV(OP_lengths)[*code], FALSE);
8252    int op = *scode;
8253 
8254    /* If we are at the start of a conditional assertion group, *both* the
8255    conditional assertion *and* what follows the condition must satisfy the test
8256    for start of line. Other kinds of condition fail. Note that there may be an
8257    auto-callout at the start of a condition. */
8258 
8259    if (op == OP_COND)
8260      {
8261      scode += 1 + LINK_SIZE;
8262 
8263      if (*scode == OP_CALLOUT) scode += PRIV(OP_lengths)[OP_CALLOUT];
8264        else if (*scode == OP_CALLOUT_STR) scode += GET(scode, 1 + 2*LINK_SIZE);
8265 
8266      switch (*scode)
8267        {
8268        case OP_CREF:
8269        case OP_DNCREF:
8270        case OP_RREF:
8271        case OP_DNRREF:
8272        case OP_FAIL:
8273        case OP_FALSE:
8274        case OP_TRUE:
8275        return FALSE;
8276 
8277        default:     /* Assertion */
8278        if (!is_startline(scode, bracket_map, cb, atomcount, TRUE)) return FALSE;
8279        do scode += GET(scode, 1); while (*scode == OP_ALT);
8280        scode += 1 + LINK_SIZE;
8281        break;
8282        }
8283      scode = first_significant_code(scode, FALSE);
8284      op = *scode;
8285      }
8286 
8287    /* Non-capturing brackets */
8288 
8289    if (op == OP_BRA  || op == OP_BRAPOS ||
8290        op == OP_SBRA || op == OP_SBRAPOS)
8291      {
8292      if (!is_startline(scode, bracket_map, cb, atomcount, inassert))
8293        return FALSE;
8294      }
8295 
8296    /* Capturing brackets */
8297 
8298    else if (op == OP_CBRA  || op == OP_CBRAPOS ||
8299             op == OP_SCBRA || op == OP_SCBRAPOS)
8300      {
8301      int n = GET2(scode, 1+LINK_SIZE);
8302      int new_map = bracket_map | ((n < 32)? (1u << n) : 1);
8303      if (!is_startline(scode, new_map, cb, atomcount, inassert)) return FALSE;
8304      }
8305 
8306    /* Positive forward assertions */
8307 
8308    else if (op == OP_ASSERT)
8309      {
8310      if (!is_startline(scode, bracket_map, cb, atomcount, TRUE))
8311        return FALSE;
8312      }
8313 
8314    /* Atomic brackets */
8315 
8316    else if (op == OP_ONCE)
8317      {
8318      if (!is_startline(scode, bracket_map, cb, atomcount + 1, inassert))
8319        return FALSE;
8320      }
8321 
8322    /* .* means "start at start or after \n" if it isn't in atomic brackets or
8323    brackets that may be referenced or an assertion, and as long as the pattern
8324    does not contain *PRUNE or *SKIP, because these break the feature. Consider,
8325    for example, /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab",
8326    i.e. not at the start of a line. There is also an option that disables this
8327    optimization. */
8328 
8329    else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
8330      {
8331      if (scode[1] != OP_ANY || (bracket_map & cb->backref_map) != 0 ||
8332          atomcount > 0 || cb->had_pruneorskip || inassert ||
8333          (cb->external_options & PCRE2_NO_DOTSTAR_ANCHOR) != 0)
8334        return FALSE;
8335      }
8336 
8337    /* Check for explicit circumflex; anything else gives a FALSE result. Note
8338    in particular that this includes atomic brackets OP_ONCE because the number
8339    of characters matched by .* cannot be adjusted inside them. */
8340 
8341    else if (op != OP_CIRC && op != OP_CIRCM) return FALSE;
8342 
8343    /* Move on to the next alternative */
8344 
8345    code += GET(code, 1);
8346    }
8347 while (*code == OP_ALT);  /* Loop for each alternative */
8348 return TRUE;
8349 }
8350 
8351 
8352 
8353 /*************************************************
8354 *   Scan compiled regex for recursion reference  *
8355 *************************************************/
8356 
8357 /* This function scans through a compiled pattern until it finds an instance of
8358 OP_RECURSE.
8359 
8360 Arguments:
8361   code        points to start of expression
8362   utf         TRUE in UTF mode
8363 
8364 Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found
8365 */
8366 
8367 static PCRE2_SPTR
find_recurse(PCRE2_SPTR code,BOOL utf)8368 find_recurse(PCRE2_SPTR code, BOOL utf)
8369 {
8370 for (;;)
8371   {
8372   PCRE2_UCHAR c = *code;
8373   if (c == OP_END) return NULL;
8374   if (c == OP_RECURSE) return code;
8375 
8376   /* XCLASS is used for classes that cannot be represented just by a bit map.
8377   This includes negated single high-valued characters. CALLOUT_STR is used for
8378   callouts with string arguments. In both cases the length in the table is
8379   zero; the actual length is stored in the compiled code. */
8380 
8381   if (c == OP_XCLASS) code += GET(code, 1);
8382     else if (c == OP_CALLOUT_STR) code += GET(code, 1 + 2*LINK_SIZE);
8383 
8384   /* Otherwise, we can get the item's length from the table, except that for
8385   repeated character types, we have to test for \p and \P, which have an extra
8386   two code units of parameters, and for MARK/PRUNE/SKIP/THEN with an argument,
8387   we must add in its length. */
8388 
8389   else
8390     {
8391     switch(c)
8392       {
8393       case OP_TYPESTAR:
8394       case OP_TYPEMINSTAR:
8395       case OP_TYPEPLUS:
8396       case OP_TYPEMINPLUS:
8397       case OP_TYPEQUERY:
8398       case OP_TYPEMINQUERY:
8399       case OP_TYPEPOSSTAR:
8400       case OP_TYPEPOSPLUS:
8401       case OP_TYPEPOSQUERY:
8402       if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
8403       break;
8404 
8405       case OP_TYPEPOSUPTO:
8406       case OP_TYPEUPTO:
8407       case OP_TYPEMINUPTO:
8408       case OP_TYPEEXACT:
8409       if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
8410         code += 2;
8411       break;
8412 
8413       case OP_MARK:
8414       case OP_COMMIT_ARG:
8415       case OP_PRUNE_ARG:
8416       case OP_SKIP_ARG:
8417       case OP_THEN_ARG:
8418       code += code[1];
8419       break;
8420       }
8421 
8422     /* Add in the fixed length from the table */
8423 
8424     code += PRIV(OP_lengths)[c];
8425 
8426     /* In UTF-8 and UTF-16 modes, opcodes that are followed by a character may
8427     be followed by a multi-unit character. The length in the table is a
8428     minimum, so we have to arrange to skip the extra units. */
8429 
8430 #ifdef MAYBE_UTF_MULTI
8431     if (utf) switch(c)
8432       {
8433       case OP_CHAR:
8434       case OP_CHARI:
8435       case OP_NOT:
8436       case OP_NOTI:
8437       case OP_EXACT:
8438       case OP_EXACTI:
8439       case OP_NOTEXACT:
8440       case OP_NOTEXACTI:
8441       case OP_UPTO:
8442       case OP_UPTOI:
8443       case OP_NOTUPTO:
8444       case OP_NOTUPTOI:
8445       case OP_MINUPTO:
8446       case OP_MINUPTOI:
8447       case OP_NOTMINUPTO:
8448       case OP_NOTMINUPTOI:
8449       case OP_POSUPTO:
8450       case OP_POSUPTOI:
8451       case OP_NOTPOSUPTO:
8452       case OP_NOTPOSUPTOI:
8453       case OP_STAR:
8454       case OP_STARI:
8455       case OP_NOTSTAR:
8456       case OP_NOTSTARI:
8457       case OP_MINSTAR:
8458       case OP_MINSTARI:
8459       case OP_NOTMINSTAR:
8460       case OP_NOTMINSTARI:
8461       case OP_POSSTAR:
8462       case OP_POSSTARI:
8463       case OP_NOTPOSSTAR:
8464       case OP_NOTPOSSTARI:
8465       case OP_PLUS:
8466       case OP_PLUSI:
8467       case OP_NOTPLUS:
8468       case OP_NOTPLUSI:
8469       case OP_MINPLUS:
8470       case OP_MINPLUSI:
8471       case OP_NOTMINPLUS:
8472       case OP_NOTMINPLUSI:
8473       case OP_POSPLUS:
8474       case OP_POSPLUSI:
8475       case OP_NOTPOSPLUS:
8476       case OP_NOTPOSPLUSI:
8477       case OP_QUERY:
8478       case OP_QUERYI:
8479       case OP_NOTQUERY:
8480       case OP_NOTQUERYI:
8481       case OP_MINQUERY:
8482       case OP_MINQUERYI:
8483       case OP_NOTMINQUERY:
8484       case OP_NOTMINQUERYI:
8485       case OP_POSQUERY:
8486       case OP_POSQUERYI:
8487       case OP_NOTPOSQUERY:
8488       case OP_NOTPOSQUERYI:
8489       if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
8490       break;
8491       }
8492 #else
8493     (void)(utf);  /* Keep compiler happy by referencing function argument */
8494 #endif  /* MAYBE_UTF_MULTI */
8495     }
8496   }
8497 }
8498 
8499 
8500 
8501 /*************************************************
8502 *    Check for asserted fixed first code unit    *
8503 *************************************************/
8504 
8505 /* During compilation, the "first code unit" settings from forward assertions
8506 are discarded, because they can cause conflicts with actual literals that
8507 follow. However, if we end up without a first code unit setting for an
8508 unanchored pattern, it is worth scanning the regex to see if there is an
8509 initial asserted first code unit. If all branches start with the same asserted
8510 code unit, or with a non-conditional bracket all of whose alternatives start
8511 with the same asserted code unit (recurse ad lib), then we return that code
8512 unit, with the flags set to zero or REQ_CASELESS; otherwise return zero with
8513 REQ_NONE in the flags.
8514 
8515 Arguments:
8516   code       points to start of compiled pattern
8517   flags      points to the first code unit flags
8518   inassert   non-zero if in an assertion
8519 
8520 Returns:     the fixed first code unit, or 0 with REQ_NONE in flags
8521 */
8522 
8523 static uint32_t
find_firstassertedcu(PCRE2_SPTR code,int32_t * flags,uint32_t inassert)8524 find_firstassertedcu(PCRE2_SPTR code, int32_t *flags, uint32_t inassert)
8525 {
8526 uint32_t c = 0;
8527 int cflags = REQ_NONE;
8528 
8529 *flags = REQ_NONE;
8530 do {
8531    uint32_t d;
8532    int dflags;
8533    int xl = (*code == OP_CBRA || *code == OP_SCBRA ||
8534              *code == OP_CBRAPOS || *code == OP_SCBRAPOS)? IMM2_SIZE:0;
8535    PCRE2_SPTR scode = first_significant_code(code + 1+LINK_SIZE + xl, TRUE);
8536    PCRE2_UCHAR op = *scode;
8537 
8538    switch(op)
8539      {
8540      default:
8541      return 0;
8542 
8543      case OP_BRA:
8544      case OP_BRAPOS:
8545      case OP_CBRA:
8546      case OP_SCBRA:
8547      case OP_CBRAPOS:
8548      case OP_SCBRAPOS:
8549      case OP_ASSERT:
8550      case OP_ONCE:
8551      case OP_SCRIPT_RUN:
8552      d = find_firstassertedcu(scode, &dflags, inassert + ((op==OP_ASSERT)?1:0));
8553      if (dflags < 0)
8554        return 0;
8555      if (cflags < 0) { c = d; cflags = dflags; }
8556        else if (c != d || cflags != dflags) return 0;
8557      break;
8558 
8559      case OP_EXACT:
8560      scode += IMM2_SIZE;
8561      /* Fall through */
8562 
8563      case OP_CHAR:
8564      case OP_PLUS:
8565      case OP_MINPLUS:
8566      case OP_POSPLUS:
8567      if (inassert == 0) return 0;
8568      if (cflags < 0) { c = scode[1]; cflags = 0; }
8569        else if (c != scode[1]) return 0;
8570      break;
8571 
8572      case OP_EXACTI:
8573      scode += IMM2_SIZE;
8574      /* Fall through */
8575 
8576      case OP_CHARI:
8577      case OP_PLUSI:
8578      case OP_MINPLUSI:
8579      case OP_POSPLUSI:
8580      if (inassert == 0) return 0;
8581      if (cflags < 0) { c = scode[1]; cflags = REQ_CASELESS; }
8582        else if (c != scode[1]) return 0;
8583      break;
8584      }
8585 
8586    code += GET(code, 1);
8587    }
8588 while (*code == OP_ALT);
8589 
8590 *flags = cflags;
8591 return c;
8592 }
8593 
8594 
8595 
8596 /*************************************************
8597 *     Add an entry to the name/number table      *
8598 *************************************************/
8599 
8600 /* This function is called between compiling passes to add an entry to the
8601 name/number table, maintaining alphabetical order. Checking for permitted
8602 and forbidden duplicates has already been done.
8603 
8604 Arguments:
8605   cb           the compile data block
8606   name         the name to add
8607   length       the length of the name
8608   groupno      the group number
8609   tablecount   the count of names in the table so far
8610 
8611 Returns:       nothing
8612 */
8613 
8614 static void
add_name_to_table(compile_block * cb,PCRE2_SPTR name,int length,unsigned int groupno,uint32_t tablecount)8615 add_name_to_table(compile_block *cb, PCRE2_SPTR name, int length,
8616   unsigned int groupno, uint32_t tablecount)
8617 {
8618 uint32_t i;
8619 PCRE2_UCHAR *slot = cb->name_table;
8620 
8621 for (i = 0; i < tablecount; i++)
8622   {
8623   int crc = memcmp(name, slot+IMM2_SIZE, CU2BYTES(length));
8624   if (crc == 0 && slot[IMM2_SIZE+length] != 0)
8625     crc = -1; /* Current name is a substring */
8626 
8627   /* Make space in the table and break the loop for an earlier name. For a
8628   duplicate or later name, carry on. We do this for duplicates so that in the
8629   simple case (when ?(| is not used) they are in order of their numbers. In all
8630   cases they are in the order in which they appear in the pattern. */
8631 
8632   if (crc < 0)
8633     {
8634     (void)memmove(slot + cb->name_entry_size, slot,
8635       CU2BYTES((tablecount - i) * cb->name_entry_size));
8636     break;
8637     }
8638 
8639   /* Continue the loop for a later or duplicate name */
8640 
8641   slot += cb->name_entry_size;
8642   }
8643 
8644 PUT2(slot, 0, groupno);
8645 memcpy(slot + IMM2_SIZE, name, CU2BYTES(length));
8646 
8647 /* Add a terminating zero and fill the rest of the slot with zeroes so that
8648 the memory is all initialized. Otherwise valgrind moans about uninitialized
8649 memory when saving serialized compiled patterns. */
8650 
8651 memset(slot + IMM2_SIZE + length, 0,
8652   CU2BYTES(cb->name_entry_size - length - IMM2_SIZE));
8653 }
8654 
8655 
8656 
8657 /*************************************************
8658 *             Skip in parsed pattern             *
8659 *************************************************/
8660 
8661 /* This function is called to skip parts of the parsed pattern when finding the
8662 length of a lookbehind branch. It is called after (*ACCEPT) and (*FAIL) to find
8663 the end of the branch, it is called to skip over an internal lookaround, and it
8664 is also called to skip to the end of a class, during which it will never
8665 encounter nested groups (but there's no need to have special code for that).
8666 
8667 When called to find the end of a branch or group, pptr must point to the first
8668 meta code inside the branch, not the branch-starting code. In other cases it
8669 can point to the item that causes the function to be called.
8670 
8671 Arguments:
8672   pptr       current pointer to skip from
8673   skiptype   PSKIP_CLASS when skipping to end of class
8674              PSKIP_ALT when META_ALT ends the skip
8675              PSKIP_KET when only META_KET ends the skip
8676 
8677 Returns:     new value of pptr
8678              NULL if META_END is reached - should never occur
8679                or for an unknown meta value - likewise
8680 */
8681 
8682 static uint32_t *
parsed_skip(uint32_t * pptr,uint32_t skiptype)8683 parsed_skip(uint32_t *pptr, uint32_t skiptype)
8684 {
8685 uint32_t nestlevel = 0;
8686 
8687 for (;; pptr++)
8688   {
8689   uint32_t meta = META_CODE(*pptr);
8690 
8691   switch(meta)
8692     {
8693     default:  /* Just skip over most items */
8694     if (meta < META_END) continue;  /* Literal */
8695     break;
8696 
8697     /* This should never occur. */
8698 
8699     case META_END:
8700     return NULL;
8701 
8702     /* The data for these items is variable in length. */
8703 
8704     case META_BACKREF:  /* Offset is present only if group >= 10 */
8705     if (META_DATA(*pptr) >= 10) pptr += SIZEOFFSET;
8706     break;
8707 
8708     case META_ESCAPE:   /* A few escapes are followed by data items. */
8709     switch (META_DATA(*pptr))
8710       {
8711       case ESC_P:
8712       case ESC_p:
8713       pptr += 1;
8714       break;
8715 
8716       case ESC_g:
8717       case ESC_k:
8718       pptr += 1 + SIZEOFFSET;
8719       break;
8720       }
8721     break;
8722 
8723     case META_MARK:     /* Add the length of the name. */
8724     case META_COMMIT_ARG:
8725     case META_PRUNE_ARG:
8726     case META_SKIP_ARG:
8727     case META_THEN_ARG:
8728     pptr += pptr[1];
8729     break;
8730 
8731     /* These are the "active" items in this loop. */
8732 
8733     case META_CLASS_END:
8734     if (skiptype == PSKIP_CLASS) return pptr;
8735     break;
8736 
8737     case META_ATOMIC:
8738     case META_CAPTURE:
8739     case META_COND_ASSERT:
8740     case META_COND_DEFINE:
8741     case META_COND_NAME:
8742     case META_COND_NUMBER:
8743     case META_COND_RNAME:
8744     case META_COND_RNUMBER:
8745     case META_COND_VERSION:
8746     case META_LOOKAHEAD:
8747     case META_LOOKAHEADNOT:
8748     case META_LOOKBEHIND:
8749     case META_LOOKBEHINDNOT:
8750     case META_NOCAPTURE:
8751     case META_SCRIPT_RUN:
8752     nestlevel++;
8753     break;
8754 
8755     case META_ALT:
8756     if (nestlevel == 0 && skiptype == PSKIP_ALT) return pptr;
8757     break;
8758 
8759     case META_KET:
8760     if (nestlevel == 0) return pptr;
8761     nestlevel--;
8762     break;
8763     }
8764 
8765   /* The extra data item length for each meta is in a table. */
8766 
8767   meta = (meta >> 16) & 0x7fff;
8768   if (meta >= sizeof(meta_extra_lengths)) return NULL;
8769   pptr += meta_extra_lengths[meta];
8770   }
8771 /* Control never reaches here */
8772 return pptr;
8773 }
8774 
8775 
8776 
8777 /*************************************************
8778 *       Find length of a parsed group            *
8779 *************************************************/
8780 
8781 /* This is called for nested groups within a branch of a lookbehind whose
8782 length is being computed. If all the branches in the nested group have the same
8783 length, that is OK. On entry, the pointer must be at the first element after
8784 the group initializing code. On exit it points to OP_KET. Caching is used to
8785 improve processing speed when the same capturing group occurs many times.
8786 
8787 Arguments:
8788   pptrptr     pointer to pointer in the parsed pattern
8789   isinline    FALSE if a reference or recursion; TRUE for inline group
8790   errcodeptr  pointer to the errorcode
8791   lcptr       pointer to the loop counter
8792   group       number of captured group or -1 for a non-capturing group
8793   recurses    chain of recurse_check to catch mutual recursion
8794   cb          pointer to the compile data
8795 
8796 Returns:      the group length or a negative number
8797 */
8798 
8799 static int
get_grouplength(uint32_t ** pptrptr,BOOL isinline,int * errcodeptr,int * lcptr,int group,parsed_recurse_check * recurses,compile_block * cb)8800 get_grouplength(uint32_t **pptrptr, BOOL isinline, int *errcodeptr, int *lcptr,
8801   int group, parsed_recurse_check *recurses, compile_block *cb)
8802 {
8803 int branchlength;
8804 int grouplength = -1;
8805 
8806 /* The cache can be used only if there is no possibility of there being two
8807 groups with the same number. We do not need to set the end pointer for a group
8808 that is being processed as a back reference or recursion, but we must do so for
8809 an inline group. */
8810 
8811 if (group > 0 && (cb->external_flags & PCRE2_DUPCAPUSED) == 0)
8812   {
8813   uint32_t groupinfo = cb->groupinfo[group];
8814   if ((groupinfo & GI_NOT_FIXED_LENGTH) != 0) return -1;
8815   if ((groupinfo & GI_SET_FIXED_LENGTH) != 0)
8816     {
8817     if (isinline) *pptrptr = parsed_skip(*pptrptr, PSKIP_KET);
8818     return groupinfo & GI_FIXED_LENGTH_MASK;
8819     }
8820   }
8821 
8822 /* Scan the group. In this case we find the end pointer of necessity. */
8823 
8824 for(;;)
8825   {
8826   branchlength = get_branchlength(pptrptr, errcodeptr, lcptr, recurses, cb);
8827   if (branchlength < 0) goto ISNOTFIXED;
8828   if (grouplength == -1) grouplength = branchlength;
8829     else if (grouplength != branchlength) goto ISNOTFIXED;
8830   if (**pptrptr == META_KET) break;
8831   *pptrptr += 1;   /* Skip META_ALT */
8832   }
8833 
8834 if (group > 0)
8835   cb->groupinfo[group] |= (uint32_t)(GI_SET_FIXED_LENGTH | grouplength);
8836 return grouplength;
8837 
8838 ISNOTFIXED:
8839 if (group > 0) cb->groupinfo[group] |= GI_NOT_FIXED_LENGTH;
8840 return -1;
8841 }
8842 
8843 
8844 
8845 /*************************************************
8846 *        Find length of a parsed branch          *
8847 *************************************************/
8848 
8849 /* Return a fixed length for a branch in a lookbehind, giving an error if the
8850 length is not fixed. If any lookbehinds are encountered on the way, they get
8851 their length set. On entry, *pptrptr points to the first element inside the
8852 branch. On exit it is set to point to the ALT or KET.
8853 
8854 Arguments:
8855   pptrptr     pointer to pointer in the parsed pattern
8856   errcodeptr  pointer to error code
8857   lcptr       pointer to loop counter
8858   recurses    chain of recurse_check to catch mutual recursion
8859   cb          pointer to compile block
8860 
8861 Returns:      the length, or a negative value on error
8862 */
8863 
8864 static int
get_branchlength(uint32_t ** pptrptr,int * errcodeptr,int * lcptr,parsed_recurse_check * recurses,compile_block * cb)8865 get_branchlength(uint32_t **pptrptr, int *errcodeptr, int *lcptr,
8866   parsed_recurse_check *recurses, compile_block *cb)
8867 {
8868 int branchlength = 0;
8869 int grouplength;
8870 uint32_t lastitemlength = 0;
8871 uint32_t *pptr = *pptrptr;
8872 PCRE2_SIZE offset;
8873 parsed_recurse_check this_recurse;
8874 
8875 /* A large and/or complex regex can take too long to process. This can happen
8876 more often when (?| groups are present in the pattern because their length
8877 cannot be cached. */
8878 
8879 if ((*lcptr)++ > 2000)
8880   {
8881   *errcodeptr = ERR35;  /* Lookbehind is too complicated */
8882   return -1;
8883   }
8884 
8885 /* Scan the branch, accumulating the length. */
8886 
8887 for (;; pptr++)
8888   {
8889   parsed_recurse_check *r;
8890   uint32_t *gptr, *gptrend;
8891   uint32_t escape;
8892   uint32_t group = 0;
8893   uint32_t itemlength = 0;
8894 
8895   if (*pptr < META_END)
8896     {
8897     itemlength = 1;
8898     }
8899 
8900   else switch (META_CODE(*pptr))
8901     {
8902     case META_KET:
8903     case META_ALT:
8904     goto EXIT;
8905 
8906     /* (*ACCEPT) and (*FAIL) terminate the branch, but we must skip to the
8907     actual termination. */
8908 
8909     case META_ACCEPT:
8910     case META_FAIL:
8911     pptr = parsed_skip(pptr, PSKIP_ALT);
8912     if (pptr == NULL) goto PARSED_SKIP_FAILED;
8913     goto EXIT;
8914 
8915     case META_MARK:
8916     case META_COMMIT_ARG:
8917     case META_PRUNE_ARG:
8918     case META_SKIP_ARG:
8919     case META_THEN_ARG:
8920     pptr += pptr[1] + 1;
8921     break;
8922 
8923     case META_CIRCUMFLEX:
8924     case META_COMMIT:
8925     case META_DOLLAR:
8926     case META_PRUNE:
8927     case META_SKIP:
8928     case META_THEN:
8929     break;
8930 
8931     case META_OPTIONS:
8932     pptr += 1;
8933     break;
8934 
8935     case META_BIGVALUE:
8936     itemlength = 1;
8937     pptr += 1;
8938     break;
8939 
8940     case META_CLASS:
8941     case META_CLASS_NOT:
8942     itemlength = 1;
8943     pptr = parsed_skip(pptr, PSKIP_CLASS);
8944     if (pptr == NULL) goto PARSED_SKIP_FAILED;
8945     break;
8946 
8947     case META_CLASS_EMPTY_NOT:
8948     case META_DOT:
8949     itemlength = 1;
8950     break;
8951 
8952     case META_CALLOUT_NUMBER:
8953     pptr += 3;
8954     break;
8955 
8956     case META_CALLOUT_STRING:
8957     pptr += 3 + SIZEOFFSET;
8958     break;
8959 
8960     /* Only some escapes consume a character. Of those, \R and \X are never
8961     allowed because they might match more than character. \C is allowed only in
8962     32-bit and non-UTF 8/16-bit modes. */
8963 
8964     case META_ESCAPE:
8965     escape = META_DATA(*pptr);
8966     if (escape == ESC_R || escape == ESC_X) return -1;
8967     if (escape > ESC_b && escape < ESC_Z)
8968       {
8969 #if PCRE2_CODE_UNIT_WIDTH != 32
8970       if ((cb->external_options & PCRE2_UTF) != 0 && escape == ESC_C)
8971         {
8972         *errcodeptr = ERR36;
8973         return -1;
8974         }
8975 #endif
8976       itemlength = 1;
8977       if (escape == ESC_p || escape == ESC_P) pptr++;  /* Skip prop data */
8978       }
8979     break;
8980 
8981     /* Lookaheads can be ignored, but we must start the skip inside the group
8982     so that it isn't treated as a group within the branch. */
8983 
8984     case META_LOOKAHEAD:
8985     case META_LOOKAHEADNOT:
8986     pptr = parsed_skip(pptr + 1, PSKIP_KET);
8987     if (pptr == NULL) goto PARSED_SKIP_FAILED;
8988 
8989     /* Also ignore any qualifiers that follow a lookahead assertion. */
8990 
8991     switch (pptr[1])
8992       {
8993       case META_ASTERISK:
8994       case META_ASTERISK_PLUS:
8995       case META_ASTERISK_QUERY:
8996       case META_PLUS:
8997       case META_PLUS_PLUS:
8998       case META_PLUS_QUERY:
8999       case META_QUERY:
9000       case META_QUERY_PLUS:
9001       case META_QUERY_QUERY:
9002       pptr++;
9003       break;
9004 
9005       case META_MINMAX:
9006       case META_MINMAX_PLUS:
9007       case META_MINMAX_QUERY:
9008       pptr += 3;
9009       break;
9010 
9011       default:
9012       break;
9013       }
9014     break;
9015 
9016     /* Lookbehinds can be ignored, but must themselves be checked. */
9017 
9018     case META_LOOKBEHIND:
9019     case META_LOOKBEHINDNOT:
9020     if (!set_lookbehind_lengths(&pptr, errcodeptr, lcptr, recurses, cb))
9021       return -1;
9022     break;
9023 
9024     /* Back references and recursions are handled by very similar code. At this
9025     stage, the names generated in the parsing pass are available, but the main
9026     name table has not yet been created. So for the named varieties, scan the
9027     list of names in order to get the number of the first one in the pattern,
9028     and whether or not this name is duplicated. */
9029 
9030     case META_BACKREF_BYNAME:
9031     if ((cb->external_options & PCRE2_MATCH_UNSET_BACKREF) != 0)
9032       goto ISNOTFIXED;
9033     /* Fall through */
9034 
9035     case META_RECURSE_BYNAME:
9036       {
9037       int i;
9038       PCRE2_SPTR name;
9039       BOOL is_dupname = FALSE;
9040       named_group *ng = cb->named_groups;
9041       uint32_t meta_code = META_CODE(*pptr);
9042       uint32_t length = *(++pptr);
9043 
9044       GETPLUSOFFSET(offset, pptr);
9045       name = cb->start_pattern + offset;
9046       for (i = 0; i < cb->names_found; i++, ng++)
9047         {
9048         if (length == ng->length && PRIV(strncmp)(name, ng->name, length) == 0)
9049           {
9050           group = ng->number;
9051           is_dupname = ng->isdup;
9052           break;
9053           }
9054         }
9055 
9056       if (group == 0)
9057         {
9058         *errcodeptr = ERR15;  /* Non-existent subpattern */
9059         cb->erroroffset = offset;
9060         return -1;
9061         }
9062 
9063       /* A numerical back reference can be fixed length if duplicate capturing
9064       groups are not being used. A non-duplicate named back reference can also
9065       be handled. */
9066 
9067       if (meta_code == META_RECURSE_BYNAME ||
9068           (!is_dupname && (cb->external_flags & PCRE2_DUPCAPUSED) == 0))
9069         goto RECURSE_OR_BACKREF_LENGTH;  /* Handle as a numbered version. */
9070       }
9071     goto ISNOTFIXED;                     /* Duplicate name or number */
9072 
9073     /* The offset values for back references < 10 are in a separate vector
9074     because otherwise they would use more than two parsed pattern elements on
9075     64-bit systems. */
9076 
9077     case META_BACKREF:
9078     if ((cb->external_options & PCRE2_MATCH_UNSET_BACKREF) != 0 ||
9079         (cb->external_flags & PCRE2_DUPCAPUSED) != 0)
9080       goto ISNOTFIXED;
9081     group = META_DATA(*pptr);
9082     if (group < 10)
9083       {
9084       offset = cb->small_ref_offset[group];
9085       goto RECURSE_OR_BACKREF_LENGTH;
9086       }
9087 
9088     /* Fall through */
9089     /* For groups >= 10 - picking up group twice does no harm. */
9090 
9091     /* A true recursion implies not fixed length, but a subroutine call may
9092     be OK. Back reference "recursions" are also failed. */
9093 
9094     case META_RECURSE:
9095     group = META_DATA(*pptr);
9096     GETPLUSOFFSET(offset, pptr);
9097 
9098     RECURSE_OR_BACKREF_LENGTH:
9099     if (group > cb->bracount)
9100       {
9101       cb->erroroffset = offset;
9102       *errcodeptr = ERR15;  /* Non-existent subpattern */
9103       return -1;
9104       }
9105     if (group == 0) goto ISNOTFIXED;  /* Local recursion */
9106     for (gptr = cb->parsed_pattern; *gptr != META_END; gptr++)
9107       {
9108       if (META_CODE(*gptr) == META_BIGVALUE) gptr++;
9109         else if (*gptr == (META_CAPTURE | group)) break;
9110       }
9111 
9112     /* We must start the search for the end of the group at the first meta code
9113     inside the group. Otherwise it will be treated as an enclosed group. */
9114 
9115     gptrend = parsed_skip(gptr + 1, PSKIP_KET);
9116     if (gptrend == NULL) goto PARSED_SKIP_FAILED;
9117     if (pptr > gptr && pptr < gptrend) goto ISNOTFIXED;  /* Local recursion */
9118     for (r = recurses; r != NULL; r = r->prev) if (r->groupptr == gptr) break;
9119     if (r != NULL) goto ISNOTFIXED;   /* Mutual recursion */
9120     this_recurse.prev = recurses;
9121     this_recurse.groupptr = gptr;
9122 
9123     /* We do not need to know the position of the end of the group, that is,
9124     gptr is not used after the call to get_grouplength(). Setting the second
9125     argument FALSE stops it scanning for the end when the length can be found
9126     in the cache. */
9127 
9128     gptr++;
9129     grouplength = get_grouplength(&gptr, FALSE, errcodeptr, lcptr, group,
9130       &this_recurse, cb);
9131     if (grouplength < 0)
9132       {
9133       if (*errcodeptr == 0) goto ISNOTFIXED;
9134       return -1;  /* Error already set */
9135       }
9136     itemlength = grouplength;
9137     break;
9138 
9139     /* Check nested groups - advance past the initial data for each type and
9140     then seek a fixed length with get_grouplength(). */
9141 
9142     case META_COND_NAME:
9143     case META_COND_NUMBER:
9144     case META_COND_RNAME:
9145     case META_COND_RNUMBER:
9146     case META_COND_DEFINE:
9147     pptr += 2 + SIZEOFFSET;
9148     goto CHECK_GROUP;
9149 
9150     case META_COND_ASSERT:
9151     pptr += 1;
9152     goto CHECK_GROUP;
9153 
9154     case META_COND_VERSION:
9155     pptr += 4;
9156     goto CHECK_GROUP;
9157 
9158     case META_CAPTURE:
9159     group = META_DATA(*pptr);
9160     /* Fall through */
9161 
9162     case META_ATOMIC:
9163     case META_NOCAPTURE:
9164     case META_SCRIPT_RUN:
9165     pptr++;
9166     CHECK_GROUP:
9167     grouplength = get_grouplength(&pptr, TRUE, errcodeptr, lcptr, group,
9168       recurses, cb);
9169     if (grouplength < 0) return -1;
9170     itemlength = grouplength;
9171     break;
9172 
9173     /* Exact repetition is OK; variable repetition is not. A repetition of zero
9174     must subtract the length that has already been added. */
9175 
9176     case META_MINMAX:
9177     case META_MINMAX_PLUS:
9178     case META_MINMAX_QUERY:
9179     if (pptr[1] == pptr[2])
9180       {
9181       if (pptr[1] == 0) branchlength -= lastitemlength;
9182         else itemlength = (pptr[1] - 1) * lastitemlength;
9183       pptr += 2;
9184       break;
9185       }
9186     /* Fall through */
9187 
9188     /* Any other item means this branch does not have a fixed length. */
9189 
9190     default:
9191     ISNOTFIXED:
9192     *errcodeptr = ERR25;   /* Not fixed length */
9193     return -1;
9194     }
9195 
9196   /* Add the item length to the branchlength, and save it for use if the next
9197   thing is a quantifier. */
9198 
9199   branchlength += itemlength;
9200   lastitemlength = itemlength;
9201 
9202   /* Ensure that the length does not overflow the limit. */
9203 
9204   if (branchlength > LOOKBEHIND_MAX)
9205     {
9206     *errcodeptr = ERR87;
9207     return -1;
9208     }
9209   }
9210 
9211 EXIT:
9212 *pptrptr = pptr;
9213 if (branchlength > cb->max_lookbehind) cb->max_lookbehind = branchlength;
9214 return branchlength;
9215 
9216 PARSED_SKIP_FAILED:
9217 *errcodeptr = ERR90;
9218 return -1;
9219 }
9220 
9221 
9222 
9223 /*************************************************
9224 *        Set lengths in a lookbehind             *
9225 *************************************************/
9226 
9227 /* This function is called for each lookbehind, to set the lengths in its
9228 branches. An error occurs if any branch does not have a fixed length that is
9229 less than the maximum (65535). On exit, the pointer must be left on the final
9230 ket.
9231 
9232 Arguments:
9233   pptrptr     pointer to pointer in the parsed pattern
9234   errcodeptr  pointer to error code
9235   lcptr       pointer to loop counter
9236   recurses    chain of recurse_check to catch mutual recursion
9237   cb          pointer to compile block
9238 
9239 Returns:      TRUE if all is well
9240               FALSE otherwise, with error code and offset set
9241 */
9242 
9243 static BOOL
set_lookbehind_lengths(uint32_t ** pptrptr,int * errcodeptr,int * lcptr,parsed_recurse_check * recurses,compile_block * cb)9244 set_lookbehind_lengths(uint32_t **pptrptr, int *errcodeptr, int *lcptr,
9245   parsed_recurse_check *recurses, compile_block *cb)
9246 {
9247 PCRE2_SIZE offset;
9248 int branchlength;
9249 uint32_t *bptr = *pptrptr;
9250 
9251 READPLUSOFFSET(offset, bptr);  /* Offset for error messages */
9252 *pptrptr += SIZEOFFSET;
9253 
9254 do
9255   {
9256   *pptrptr += 1;
9257   branchlength = get_branchlength(pptrptr, errcodeptr, lcptr, recurses, cb);
9258   if (branchlength < 0)
9259     {
9260     /* The errorcode and offset may already be set from a nested lookbehind. */
9261     if (*errcodeptr == 0) *errcodeptr = ERR25;
9262     if (cb->erroroffset == PCRE2_UNSET) cb->erroroffset = offset;
9263     return FALSE;
9264     }
9265   *bptr |= branchlength;  /* branchlength never more than 65535 */
9266   bptr = *pptrptr;
9267   }
9268 while (*bptr == META_ALT);
9269 
9270 return TRUE;
9271 }
9272 
9273 
9274 
9275 /*************************************************
9276 *         Check parsed pattern lookbehinds       *
9277 *************************************************/
9278 
9279 /* This function is called at the end of parsing a pattern if any lookbehinds
9280 were encountered. It scans the parsed pattern for them, calling
9281 set_lookbehind_lengths() for each one. At the start, the errorcode is zero and
9282 the error offset is marked unset. This enables the functions above not to
9283 override settings from deeper nestings.
9284 
9285 Arguments cb      points to the compile block
9286 Returns:          0 on success, or an errorcode (cb->erroroffset will be set)
9287 */
9288 
9289 static int
check_lookbehinds(compile_block * cb)9290 check_lookbehinds(compile_block *cb)
9291 {
9292 uint32_t *pptr;
9293 int errorcode = 0;
9294 int loopcount = 0;
9295 
9296 cb->erroroffset = PCRE2_UNSET;
9297 
9298 for (pptr = cb->parsed_pattern; *pptr != META_END; pptr++)
9299   {
9300   if (*pptr < META_END) continue;  /* Literal */
9301 
9302   switch (META_CODE(*pptr))
9303     {
9304     default:
9305     return ERR70;  /* Unrecognized meta code */
9306 
9307     case META_ESCAPE:
9308     if (*pptr - META_ESCAPE == ESC_P || *pptr - META_ESCAPE == ESC_p)
9309       pptr += 1;
9310     break;
9311 
9312     case META_ACCEPT:
9313     case META_ALT:
9314     case META_ASTERISK:
9315     case META_ASTERISK_PLUS:
9316     case META_ASTERISK_QUERY:
9317     case META_ATOMIC:
9318     case META_BACKREF:
9319     case META_CAPTURE:
9320     case META_CIRCUMFLEX:
9321     case META_CLASS:
9322     case META_CLASS_EMPTY:
9323     case META_CLASS_EMPTY_NOT:
9324     case META_CLASS_END:
9325     case META_CLASS_NOT:
9326     case META_COMMIT:
9327     case META_COND_ASSERT:
9328     case META_DOLLAR:
9329     case META_DOT:
9330     case META_FAIL:
9331     case META_KET:
9332     case META_LOOKAHEAD:
9333     case META_LOOKAHEADNOT:
9334     case META_NOCAPTURE:
9335     case META_PLUS:
9336     case META_PLUS_PLUS:
9337     case META_PLUS_QUERY:
9338     case META_PRUNE:
9339     case META_QUERY:
9340     case META_QUERY_PLUS:
9341     case META_QUERY_QUERY:
9342     case META_RANGE_ESCAPED:
9343     case META_RANGE_LITERAL:
9344     case META_SCRIPT_RUN:
9345     case META_SKIP:
9346     case META_THEN:
9347     break;
9348 
9349     case META_RECURSE:
9350     pptr += SIZEOFFSET;
9351     break;
9352 
9353     case META_BACKREF_BYNAME:
9354     case META_COND_DEFINE:
9355     case META_COND_NAME:
9356     case META_COND_NUMBER:
9357     case META_COND_RNAME:
9358     case META_COND_RNUMBER:
9359     case META_RECURSE_BYNAME:
9360     pptr += 1 + SIZEOFFSET;
9361     break;
9362 
9363     case META_CALLOUT_STRING:
9364     pptr += 3 + SIZEOFFSET;
9365     break;
9366 
9367     case META_BIGVALUE:
9368     case META_OPTIONS:
9369     case META_POSIX:
9370     case META_POSIX_NEG:
9371     pptr += 1;
9372     break;
9373 
9374     case META_MINMAX:
9375     case META_MINMAX_QUERY:
9376     case META_MINMAX_PLUS:
9377     pptr += 2;
9378     break;
9379 
9380     case META_CALLOUT_NUMBER:
9381     case META_COND_VERSION:
9382     pptr += 3;
9383     break;
9384 
9385     case META_MARK:
9386     case META_COMMIT_ARG:
9387     case META_PRUNE_ARG:
9388     case META_SKIP_ARG:
9389     case META_THEN_ARG:
9390     pptr += 1 + pptr[1];
9391     break;
9392 
9393     case META_LOOKBEHIND:
9394     case META_LOOKBEHINDNOT:
9395     if (!set_lookbehind_lengths(&pptr, &errorcode, &loopcount, NULL, cb))
9396       return errorcode;
9397     break;
9398     }
9399   }
9400 
9401 return 0;
9402 }
9403 
9404 
9405 
9406 /*************************************************
9407 *     External function to compile a pattern     *
9408 *************************************************/
9409 
9410 /* This function reads a regular expression in the form of a string and returns
9411 a pointer to a block of store holding a compiled version of the expression.
9412 
9413 Arguments:
9414   pattern       the regular expression
9415   patlen        the length of the pattern, or PCRE2_ZERO_TERMINATED
9416   options       option bits
9417   errorptr      pointer to errorcode
9418   erroroffset   pointer to error offset
9419   ccontext      points to a compile context or is NULL
9420 
9421 Returns:        pointer to compiled data block, or NULL on error,
9422                 with errorcode and erroroffset set
9423 */
9424 
9425 PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
pcre2_compile(PCRE2_SPTR pattern,PCRE2_SIZE patlen,uint32_t options,int * errorptr,PCRE2_SIZE * erroroffset,pcre2_compile_context * ccontext)9426 pcre2_compile(PCRE2_SPTR pattern, PCRE2_SIZE patlen, uint32_t options,
9427    int *errorptr, PCRE2_SIZE *erroroffset, pcre2_compile_context *ccontext)
9428 {
9429 BOOL utf;                             /* Set TRUE for UTF mode */
9430 BOOL has_lookbehind = FALSE;          /* Set TRUE if a lookbehind is found */
9431 BOOL zero_terminated;                 /* Set TRUE for zero-terminated pattern */
9432 pcre2_real_code *re = NULL;           /* What we will return */
9433 compile_block cb;                     /* "Static" compile-time data */
9434 const uint8_t *tables;                /* Char tables base pointer */
9435 
9436 PCRE2_UCHAR *code;                    /* Current pointer in compiled code */
9437 PCRE2_SPTR codestart;                 /* Start of compiled code */
9438 PCRE2_SPTR ptr;                       /* Current pointer in pattern */
9439 uint32_t *pptr;                       /* Current pointer in parsed pattern */
9440 
9441 PCRE2_SIZE length = 1;                /* Allow for final END opcode */
9442 PCRE2_SIZE usedlength;                /* Actual length used */
9443 PCRE2_SIZE re_blocksize;              /* Size of memory block */
9444 PCRE2_SIZE big32count = 0;            /* 32-bit literals >= 0x80000000 */
9445 PCRE2_SIZE parsed_size_needed;        /* Needed for parsed pattern */
9446 
9447 int32_t firstcuflags, reqcuflags;     /* Type of first/req code unit */
9448 uint32_t firstcu, reqcu;              /* Value of first/req code unit */
9449 uint32_t setflags = 0;                /* NL and BSR set flags */
9450 
9451 uint32_t skipatstart;                 /* When checking (*UTF) etc */
9452 uint32_t limit_heap  = UINT32_MAX;
9453 uint32_t limit_match = UINT32_MAX;    /* Unset match limits */
9454 uint32_t limit_depth = UINT32_MAX;
9455 
9456 int newline = 0;                      /* Unset; can be set by the pattern */
9457 int bsr = 0;                          /* Unset; can be set by the pattern */
9458 int errorcode = 0;                    /* Initialize to avoid compiler warn */
9459 int regexrc;                          /* Return from compile */
9460 
9461 uint32_t i;                           /* Local loop counter */
9462 
9463 /* Comments at the head of this file explain about these variables. */
9464 
9465 uint32_t stack_groupinfo[GROUPINFO_DEFAULT_SIZE];
9466 uint32_t stack_parsed_pattern[PARSED_PATTERN_DEFAULT_SIZE];
9467 named_group named_groups[NAMED_GROUP_LIST_SIZE];
9468 
9469 /* The workspace is used in different ways in the different compiling phases.
9470 It needs to be 16-bit aligned for the preliminary parsing scan. */
9471 
9472 uint32_t c16workspace[C16_WORK_SIZE];
9473 PCRE2_UCHAR *cworkspace = (PCRE2_UCHAR *)c16workspace;
9474 
9475 
9476 /* -------------- Check arguments and set up the pattern ----------------- */
9477 
9478 /* There must be error code and offset pointers. */
9479 
9480 if (errorptr == NULL || erroroffset == NULL) return NULL;
9481 *errorptr = ERR0;
9482 *erroroffset = 0;
9483 
9484 /* There must be a pattern! */
9485 
9486 if (pattern == NULL)
9487   {
9488   *errorptr = ERR16;
9489   return NULL;
9490   }
9491 
9492 /* A NULL compile context means "use a default context" */
9493 
9494 if (ccontext == NULL)
9495   ccontext = (pcre2_compile_context *)(&PRIV(default_compile_context));
9496 
9497 /* Check that all undefined public option bits are zero. */
9498 
9499 if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0 ||
9500     (ccontext->extra_options & ~PUBLIC_COMPILE_EXTRA_OPTIONS) != 0)
9501   {
9502   *errorptr = ERR17;
9503   return NULL;
9504   }
9505 
9506 if ((options & PCRE2_LITERAL) != 0 &&
9507     ((options & ~PUBLIC_LITERAL_COMPILE_OPTIONS) != 0 ||
9508      (ccontext->extra_options & ~PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS) != 0))
9509   {
9510   *errorptr = ERR92;
9511   return NULL;
9512   }
9513 
9514 /* A zero-terminated pattern is indicated by the special length value
9515 PCRE2_ZERO_TERMINATED. Check for an overlong pattern. */
9516 
9517 if ((zero_terminated = (patlen == PCRE2_ZERO_TERMINATED)))
9518   patlen = PRIV(strlen)(pattern);
9519 
9520 if (patlen > ccontext->max_pattern_length)
9521   {
9522   *errorptr = ERR88;
9523   return NULL;
9524   }
9525 
9526 /* From here on, all returns from this function should end up going via the
9527 EXIT label. */
9528 
9529 
9530 /* ------------ Initialize the "static" compile data -------------- */
9531 
9532 tables = (ccontext->tables != NULL)? ccontext->tables : PRIV(default_tables);
9533 
9534 cb.lcc = tables + lcc_offset;          /* Individual */
9535 cb.fcc = tables + fcc_offset;          /*   character */
9536 cb.cbits = tables + cbits_offset;      /*      tables */
9537 cb.ctypes = tables + ctypes_offset;
9538 
9539 cb.assert_depth = 0;
9540 cb.bracount = 0;
9541 cb.cx = ccontext;
9542 cb.dupnames = FALSE;
9543 cb.end_pattern = pattern + patlen;
9544 cb.erroroffset = 0;
9545 cb.external_flags = 0;
9546 cb.external_options = options;
9547 cb.groupinfo = stack_groupinfo;
9548 cb.had_recurse = FALSE;
9549 cb.lastcapture = 0;
9550 cb.max_lookbehind = 0;
9551 cb.name_entry_size = 0;
9552 cb.name_table = NULL;
9553 cb.named_groups = named_groups;
9554 cb.named_group_list_size = NAMED_GROUP_LIST_SIZE;
9555 cb.names_found = 0;
9556 cb.open_caps = NULL;
9557 cb.parens_depth = 0;
9558 cb.parsed_pattern = stack_parsed_pattern;
9559 cb.req_varyopt = 0;
9560 cb.start_code = cworkspace;
9561 cb.start_pattern = pattern;
9562 cb.start_workspace = cworkspace;
9563 cb.workspace_size = COMPILE_WORK_SIZE;
9564 
9565 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
9566 references to help in deciding whether (.*) can be treated as anchored or not.
9567 */
9568 
9569 cb.top_backref = 0;
9570 cb.backref_map = 0;
9571 
9572 /* Escape sequences \1 to \9 are always back references, but as they are only
9573 two characters long, only two elements can be used in the parsed_pattern
9574 vector. The first contains the reference, and we'd like to use the second to
9575 record the offset in the pattern, so that forward references to non-existent
9576 groups can be diagnosed later with an offset. However, on 64-bit systems,
9577 PCRE2_SIZE won't fit. Instead, we have a vector of offsets for the first
9578 occurrence of \1 to \9, indexed by the second parsed_pattern value. All other
9579 references have enough space for the offset to be put into the parsed pattern.
9580 */
9581 
9582 for (i = 0; i < 10; i++) cb.small_ref_offset[i] = PCRE2_UNSET;
9583 
9584 
9585 /* --------------- Start looking at the pattern --------------- */
9586 
9587 /* Unless PCRE2_LITERAL is set, check for global one-time option settings at
9588 the start of the pattern, and remember the offset to the actual regex. With
9589 valgrind support, make the terminator of a zero-terminated pattern
9590 inaccessible. This catches bugs that would otherwise only show up for
9591 non-zero-terminated patterns. */
9592 
9593 #ifdef SUPPORT_VALGRIND
9594 if (zero_terminated) VALGRIND_MAKE_MEM_NOACCESS(pattern + patlen, CU2BYTES(1));
9595 #endif
9596 
9597 ptr = pattern;
9598 skipatstart = 0;
9599 
9600 if ((options & PCRE2_LITERAL) == 0)
9601   {
9602   while (patlen - skipatstart >= 2 &&
9603          ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
9604          ptr[skipatstart+1] == CHAR_ASTERISK)
9605     {
9606     for (i = 0; i < sizeof(pso_list)/sizeof(pso); i++)
9607       {
9608       uint32_t c, pp;
9609       pso *p = pso_list + i;
9610 
9611       if (patlen - skipatstart - 2 >= p->length &&
9612           PRIV(strncmp_c8)(ptr + skipatstart + 2, (char *)(p->name),
9613             p->length) == 0)
9614         {
9615         skipatstart += p->length + 2;
9616         switch(p->type)
9617           {
9618           case PSO_OPT:
9619           cb.external_options |= p->value;
9620           break;
9621 
9622           case PSO_FLG:
9623           setflags |= p->value;
9624           break;
9625 
9626           case PSO_NL:
9627           newline = p->value;
9628           setflags |= PCRE2_NL_SET;
9629           break;
9630 
9631           case PSO_BSR:
9632           bsr = p->value;
9633           setflags |= PCRE2_BSR_SET;
9634           break;
9635 
9636           case PSO_LIMM:
9637           case PSO_LIMD:
9638           case PSO_LIMH:
9639           c = 0;
9640           pp = skipatstart;
9641           if (!IS_DIGIT(ptr[pp]))
9642             {
9643             errorcode = ERR60;
9644             ptr += pp;
9645             goto HAD_EARLY_ERROR;
9646             }
9647           while (IS_DIGIT(ptr[pp]))
9648             {
9649             if (c > UINT32_MAX / 10 - 1) break;   /* Integer overflow */
9650             c = c*10 + (ptr[pp++] - CHAR_0);
9651             }
9652           if (ptr[pp++] != CHAR_RIGHT_PARENTHESIS)
9653             {
9654             errorcode = ERR60;
9655             ptr += pp;
9656             goto HAD_EARLY_ERROR;
9657             }
9658           if (p->type == PSO_LIMH) limit_heap = c;
9659             else if (p->type == PSO_LIMM) limit_match = c;
9660             else limit_depth = c;
9661           skipatstart += pp - skipatstart;
9662           break;
9663           }
9664         break;   /* Out of the table scan loop */
9665         }
9666       }
9667     if (i >= sizeof(pso_list)/sizeof(pso)) break;   /* Out of pso loop */
9668     }
9669   }
9670 
9671 /* End of pattern-start options; advance to start of real regex. */
9672 
9673 ptr += skipatstart;
9674 
9675 /* Can't support UTF or UCP unless PCRE2 has been compiled with UTF support. */
9676 
9677 #ifndef SUPPORT_UNICODE
9678 if ((cb.external_options & (PCRE2_UTF|PCRE2_UCP)) != 0)
9679   {
9680   errorcode = ERR32;
9681   goto HAD_EARLY_ERROR;
9682   }
9683 #endif
9684 
9685 /* Check UTF. We have the original options in 'options', with that value as
9686 modified by (*UTF) etc in cb->external_options. The extra option
9687 PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is not permitted in UTF-16 mode because the
9688 surrogate code points cannot be represented in UTF-16. */
9689 
9690 utf = (cb.external_options & PCRE2_UTF) != 0;
9691 if (utf)
9692   {
9693   if ((options & PCRE2_NEVER_UTF) != 0)
9694     {
9695     errorcode = ERR74;
9696     goto HAD_EARLY_ERROR;
9697     }
9698   if ((options & PCRE2_NO_UTF_CHECK) == 0 &&
9699        (errorcode = PRIV(valid_utf)(pattern, patlen, erroroffset)) != 0)
9700     goto HAD_ERROR;  /* Offset was set by valid_utf() */
9701 
9702 #if PCRE2_CODE_UNIT_WIDTH == 16
9703   if ((ccontext->extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) != 0)
9704     {
9705     errorcode = ERR91;
9706     goto HAD_EARLY_ERROR;
9707     }
9708 #endif
9709   }
9710 
9711 /* Check UCP lockout. */
9712 
9713 if ((cb.external_options & (PCRE2_UCP|PCRE2_NEVER_UCP)) ==
9714     (PCRE2_UCP|PCRE2_NEVER_UCP))
9715   {
9716   errorcode = ERR75;
9717   goto HAD_EARLY_ERROR;
9718   }
9719 
9720 /* Process the BSR setting. */
9721 
9722 if (bsr == 0) bsr = ccontext->bsr_convention;
9723 
9724 /* Process the newline setting. */
9725 
9726 if (newline == 0) newline = ccontext->newline_convention;
9727 cb.nltype = NLTYPE_FIXED;
9728 switch(newline)
9729   {
9730   case PCRE2_NEWLINE_CR:
9731   cb.nllen = 1;
9732   cb.nl[0] = CHAR_CR;
9733   break;
9734 
9735   case PCRE2_NEWLINE_LF:
9736   cb.nllen = 1;
9737   cb.nl[0] = CHAR_NL;
9738   break;
9739 
9740   case PCRE2_NEWLINE_NUL:
9741   cb.nllen = 1;
9742   cb.nl[0] = CHAR_NUL;
9743   break;
9744 
9745   case PCRE2_NEWLINE_CRLF:
9746   cb.nllen = 2;
9747   cb.nl[0] = CHAR_CR;
9748   cb.nl[1] = CHAR_NL;
9749   break;
9750 
9751   case PCRE2_NEWLINE_ANY:
9752   cb.nltype = NLTYPE_ANY;
9753   break;
9754 
9755   case PCRE2_NEWLINE_ANYCRLF:
9756   cb.nltype = NLTYPE_ANYCRLF;
9757   break;
9758 
9759   default:
9760   errorcode = ERR56;
9761   goto HAD_EARLY_ERROR;
9762   }
9763 
9764 /* Pre-scan the pattern to do two things: (1) Discover the named groups and
9765 their numerical equivalents, so that this information is always available for
9766 the remaining processing. (2) At the same time, parse the pattern and put a
9767 processed version into the parsed_pattern vector. This has escapes interpreted
9768 and comments removed (amongst other things).
9769 
9770 In all but one case, when PCRE2_AUTO_CALLOUT is not set, the number of unsigned
9771 32-bit ints in the parsed pattern is bounded by the length of the pattern plus
9772 one (for the terminator) plus four if PCRE2_EXTRA_WORD or PCRE2_EXTRA_LINE is
9773 set. The exceptional case is when running in 32-bit, non-UTF mode, when literal
9774 characters greater than META_END (0x80000000) have to be coded as two units. In
9775 this case, therefore, we scan the pattern to check for such values. */
9776 
9777 #if PCRE2_CODE_UNIT_WIDTH == 32
9778 if (!utf)
9779   {
9780   PCRE2_SPTR p;
9781   for (p = ptr; p < cb.end_pattern; p++) if (*p >= META_END) big32count++;
9782   }
9783 #endif
9784 
9785 /* Ensure that the parsed pattern buffer is big enough. When PCRE2_AUTO_CALLOUT
9786 is set we have to assume a numerical callout (4 elements) for each character
9787 plus one at the end. This is overkill, but memory is plentiful these days. For
9788 many smaller patterns the vector on the stack (which was set up above) can be
9789 used. */
9790 
9791 parsed_size_needed = patlen - skipatstart + big32count;
9792 
9793 if ((ccontext->extra_options &
9794      (PCRE2_EXTRA_MATCH_WORD|PCRE2_EXTRA_MATCH_LINE)) != 0)
9795   parsed_size_needed += 4;
9796 
9797 if ((options & PCRE2_AUTO_CALLOUT) != 0)
9798   parsed_size_needed = (parsed_size_needed + 1) * 5;
9799 
9800 if (parsed_size_needed >= PARSED_PATTERN_DEFAULT_SIZE)
9801   {
9802   uint32_t *heap_parsed_pattern = ccontext->memctl.malloc(
9803     (parsed_size_needed + 1) * sizeof(uint32_t), ccontext->memctl.memory_data);
9804   if (heap_parsed_pattern == NULL)
9805     {
9806     *errorptr = ERR21;
9807     goto EXIT;
9808     }
9809   cb.parsed_pattern = heap_parsed_pattern;
9810   }
9811 cb.parsed_pattern_end = cb.parsed_pattern + parsed_size_needed + 1;
9812 
9813 /* Do the parsing scan. */
9814 
9815 errorcode = parse_regex(ptr, cb.external_options, &has_lookbehind, &cb);
9816 if (errorcode != 0) goto HAD_CB_ERROR;
9817 
9818 /* Workspace is needed to remember information about numbered groups: whether a
9819 group can match an empty string and what its fixed length is. This is done to
9820 avoid the possibility of recursive references causing very long compile times
9821 when checking these features. Unnumbered groups do not have this exposure since
9822 they cannot be referenced. We use an indexed vector for this purpose. If there
9823 are sufficiently few groups, the default vector on the stack, as set up above,
9824 can be used. Otherwise we have to get/free a special vector. The vector must be
9825 initialized to zero. */
9826 
9827 if (cb.bracount >= GROUPINFO_DEFAULT_SIZE)
9828   {
9829   cb.groupinfo = ccontext->memctl.malloc(
9830     (cb.bracount + 1)*sizeof(uint32_t), ccontext->memctl.memory_data);
9831   if (cb.groupinfo == NULL)
9832     {
9833     errorcode = ERR21;
9834     cb.erroroffset = 0;
9835     goto HAD_CB_ERROR;
9836     }
9837   }
9838 memset(cb.groupinfo, 0, (cb.bracount + 1) * sizeof(uint32_t));
9839 
9840 /* If there were any lookbehinds, scan the parsed pattern to figure out their
9841 lengths. */
9842 
9843 if (has_lookbehind)
9844   {
9845   errorcode = check_lookbehinds(&cb);
9846   if (errorcode != 0) goto HAD_CB_ERROR;
9847   }
9848 
9849 /* For debugging, there is a function that shows the parsed data vector. */
9850 
9851 #ifdef DEBUG_SHOW_PARSED
9852 fprintf(stderr, "+++ Pre-scan complete:\n");
9853 show_parsed(&cb);
9854 #endif
9855 
9856 /* For debugging capturing information this code can be enabled. */
9857 
9858 #ifdef DEBUG_SHOW_CAPTURES
9859   {
9860   named_group *ng = cb.named_groups;
9861   fprintf(stderr, "+++Captures: %d\n", cb.bracount);
9862   for (i = 0; i < cb.names_found; i++, ng++)
9863     {
9864     fprintf(stderr, "+++%3d %.*s\n", ng->number, ng->length, ng->name);
9865     }
9866   }
9867 #endif
9868 
9869 /* Pretend to compile the pattern while actually just accumulating the amount
9870 of memory required in the 'length' variable. This behaviour is triggered by
9871 passing a non-NULL final argument to compile_regex(). We pass a block of
9872 workspace (cworkspace) for it to compile parts of the pattern into; the
9873 compiled code is discarded when it is no longer needed, so hopefully this
9874 workspace will never overflow, though there is a test for its doing so.
9875 
9876 On error, errorcode will be set non-zero, so we don't need to look at the
9877 result of the function. The initial options have been put into the cb block,
9878 but we still have to pass a separate options variable (the first argument)
9879 because the options may change as the pattern is processed. */
9880 
9881 cb.erroroffset = patlen;   /* For any subsequent errors that do not set it */
9882 pptr = cb.parsed_pattern;
9883 code = cworkspace;
9884 *code = OP_BRA;
9885 
9886 (void)compile_regex(cb.external_options, &code, &pptr, &errorcode, 0, &firstcu,
9887    &firstcuflags, &reqcu, &reqcuflags, NULL, &cb, &length);
9888 
9889 if (errorcode != 0) goto HAD_CB_ERROR;  /* Offset is in cb.erroroffset */
9890 
9891 /* This should be caught in compile_regex(), but just in case... */
9892 
9893 if (length > MAX_PATTERN_SIZE)
9894   {
9895   errorcode = ERR20;
9896   goto HAD_CB_ERROR;
9897   }
9898 
9899 /* Compute the size of, and then get and initialize, the data block for storing
9900 the compiled pattern and names table. Integer overflow should no longer be
9901 possible because nowadays we limit the maximum value of cb.names_found and
9902 cb.name_entry_size. */
9903 
9904 re_blocksize = sizeof(pcre2_real_code) +
9905   CU2BYTES(length +
9906   (PCRE2_SIZE)cb.names_found * (PCRE2_SIZE)cb.name_entry_size);
9907 re = (pcre2_real_code *)
9908   ccontext->memctl.malloc(re_blocksize, ccontext->memctl.memory_data);
9909 if (re == NULL)
9910   {
9911   errorcode = ERR21;
9912   goto HAD_CB_ERROR;
9913   }
9914 
9915 /* The compiler may put padding at the end of the pcre2_real_code structure in
9916 order to round it up to a multiple of 4 or 8 bytes. This means that when a
9917 compiled pattern is copied (for example, when serialized) undefined bytes are
9918 read, and this annoys debuggers such as valgrind. To avoid this, we explicitly
9919 write to the last 8 bytes of the structure before setting the fields. */
9920 
9921 memset((char *)re + sizeof(pcre2_real_code) - 8, 0, 8);
9922 re->memctl = ccontext->memctl;
9923 re->tables = tables;
9924 re->executable_jit = NULL;
9925 memset(re->start_bitmap, 0, 32 * sizeof(uint8_t));
9926 re->blocksize = re_blocksize;
9927 re->magic_number = MAGIC_NUMBER;
9928 re->compile_options = options;
9929 re->overall_options = cb.external_options;
9930 re->extra_options = ccontext->extra_options;
9931 re->flags = PCRE2_CODE_UNIT_WIDTH/8 | cb.external_flags | setflags;
9932 re->limit_heap = limit_heap;
9933 re->limit_match = limit_match;
9934 re->limit_depth = limit_depth;
9935 re->first_codeunit = 0;
9936 re->last_codeunit = 0;
9937 re->bsr_convention = bsr;
9938 re->newline_convention = newline;
9939 re->max_lookbehind = 0;
9940 re->minlength = 0;
9941 re->top_bracket = 0;
9942 re->top_backref = 0;
9943 re->name_entry_size = cb.name_entry_size;
9944 re->name_count = cb.names_found;
9945 
9946 /* The basic block is immediately followed by the name table, and the compiled
9947 code follows after that. */
9948 
9949 codestart = (PCRE2_SPTR)((uint8_t *)re + sizeof(pcre2_real_code)) +
9950   re->name_entry_size * re->name_count;
9951 
9952 /* Update the compile data block for the actual compile. The starting points of
9953 the name/number translation table and of the code are passed around in the
9954 compile data block. The start/end pattern and initial options are already set
9955 from the pre-compile phase, as is the name_entry_size field. */
9956 
9957 cb.parens_depth = 0;
9958 cb.assert_depth = 0;
9959 cb.lastcapture = 0;
9960 cb.name_table = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code));
9961 cb.start_code = codestart;
9962 cb.req_varyopt = 0;
9963 cb.had_accept = FALSE;
9964 cb.had_pruneorskip = FALSE;
9965 cb.open_caps = NULL;
9966 
9967 /* If any named groups were found, create the name/number table from the list
9968 created in the pre-pass. */
9969 
9970 if (cb.names_found > 0)
9971   {
9972   named_group *ng = cb.named_groups;
9973   for (i = 0; i < cb.names_found; i++, ng++)
9974     add_name_to_table(&cb, ng->name, ng->length, ng->number, i);
9975   }
9976 
9977 /* Set up a starting, non-extracting bracket, then compile the expression. On
9978 error, errorcode will be set non-zero, so we don't need to look at the result
9979 of the function here. */
9980 
9981 pptr = cb.parsed_pattern;
9982 code = (PCRE2_UCHAR *)codestart;
9983 *code = OP_BRA;
9984 regexrc = compile_regex(re->overall_options, &code, &pptr, &errorcode, 0,
9985   &firstcu, &firstcuflags, &reqcu, &reqcuflags, NULL, &cb, NULL);
9986 if (regexrc < 0) re->flags |= PCRE2_MATCH_EMPTY;
9987 re->top_bracket = cb.bracount;
9988 re->top_backref = cb.top_backref;
9989 re->max_lookbehind = cb.max_lookbehind;
9990 
9991 if (cb.had_accept)
9992   {
9993   reqcu = 0;              /* Must disable after (*ACCEPT) */
9994   reqcuflags = REQ_NONE;
9995   }
9996 
9997 /* Fill in the final opcode and check for disastrous overflow. If no overflow,
9998 but the estimated length exceeds the really used length, adjust the value of
9999 re->blocksize, and if valgrind support is configured, mark the extra allocated
10000 memory as unaddressable, so that any out-of-bound reads can be detected. */
10001 
10002 *code++ = OP_END;
10003 usedlength = code - codestart;
10004 if (usedlength > length) errorcode = ERR23; else
10005   {
10006   re->blocksize -= CU2BYTES(length - usedlength);
10007 #ifdef SUPPORT_VALGRIND
10008   VALGRIND_MAKE_MEM_NOACCESS(code, CU2BYTES(length - usedlength));
10009 #endif
10010   }
10011 
10012 /* Scan the pattern for recursion/subroutine calls and convert the group
10013 numbers into offsets. Maintain a small cache so that repeated groups containing
10014 recursions are efficiently handled. */
10015 
10016 #define RSCAN_CACHE_SIZE 8
10017 
10018 if (errorcode == 0 && cb.had_recurse)
10019   {
10020   PCRE2_UCHAR *rcode;
10021   PCRE2_SPTR rgroup;
10022   unsigned int ccount = 0;
10023   int start = RSCAN_CACHE_SIZE;
10024   recurse_cache rc[RSCAN_CACHE_SIZE];
10025 
10026   for (rcode = (PCRE2_UCHAR *)find_recurse(codestart, utf);
10027        rcode != NULL;
10028        rcode = (PCRE2_UCHAR *)find_recurse(rcode + 1 + LINK_SIZE, utf))
10029     {
10030     int p, groupnumber;
10031 
10032     groupnumber = (int)GET(rcode, 1);
10033     if (groupnumber == 0) rgroup = codestart; else
10034       {
10035       PCRE2_SPTR search_from = codestart;
10036       rgroup = NULL;
10037       for (i = 0, p = start; i < ccount; i++, p = (p + 1) & 7)
10038         {
10039         if (groupnumber == rc[p].groupnumber)
10040           {
10041           rgroup = rc[p].group;
10042           break;
10043           }
10044 
10045         /* Group n+1 must always start to the right of group n, so we can save
10046         search time below when the new group number is greater than any of the
10047         previously found groups. */
10048 
10049         if (groupnumber > rc[p].groupnumber) search_from = rc[p].group;
10050         }
10051 
10052       if (rgroup == NULL)
10053         {
10054         rgroup = PRIV(find_bracket)(search_from, utf, groupnumber);
10055         if (rgroup == NULL)
10056           {
10057           errorcode = ERR53;
10058           break;
10059           }
10060         if (--start < 0) start = RSCAN_CACHE_SIZE - 1;
10061         rc[start].groupnumber = groupnumber;
10062         rc[start].group = rgroup;
10063         if (ccount < RSCAN_CACHE_SIZE) ccount++;
10064         }
10065       }
10066 
10067     PUT(rcode, 1, rgroup - codestart);
10068     }
10069   }
10070 
10071 /* In rare debugging situations we sometimes need to look at the compiled code
10072 at this stage. */
10073 
10074 #ifdef DEBUG_CALL_PRINTINT
10075 pcre2_printint(re, stderr, TRUE);
10076 fprintf(stderr, "Length=%lu Used=%lu\n", length, usedlength);
10077 #endif
10078 
10079 /* Unless disabled, check whether any single character iterators can be
10080 auto-possessified. The function overwrites the appropriate opcode values, so
10081 the type of the pointer must be cast. NOTE: the intermediate variable "temp" is
10082 used in this code because at least one compiler gives a warning about loss of
10083 "const" attribute if the cast (PCRE2_UCHAR *)codestart is used directly in the
10084 function call. */
10085 
10086 if (errorcode == 0 && (re->overall_options & PCRE2_NO_AUTO_POSSESS) == 0)
10087   {
10088   PCRE2_UCHAR *temp = (PCRE2_UCHAR *)codestart;
10089   if (PRIV(auto_possessify)(temp, utf, &cb) != 0) errorcode = ERR80;
10090   }
10091 
10092 /* Failed to compile, or error while post-processing. */
10093 
10094 if (errorcode != 0) goto HAD_CB_ERROR;
10095 
10096 /* Successful compile. If the anchored option was not passed, set it if
10097 we can determine that the pattern is anchored by virtue of ^ characters or \A
10098 or anything else, such as starting with non-atomic .* when DOTALL is set and
10099 there are no occurrences of *PRUNE or *SKIP (though there is an option to
10100 disable this case). */
10101 
10102 if ((re->overall_options & PCRE2_ANCHORED) == 0 &&
10103      is_anchored(codestart, 0, &cb, 0, FALSE))
10104   re->overall_options |= PCRE2_ANCHORED;
10105 
10106 /* Set up the first code unit or startline flag, the required code unit, and
10107 then study the pattern. This code need not be obeyed if PCRE2_NO_START_OPTIMIZE
10108 is set, as the data it would create will not be used. Note that a first code
10109 unit (but not the startline flag) is useful for anchored patterns because it
10110 can still give a quick "no match" and also avoid searching for a last code
10111 unit. */
10112 
10113 if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0)
10114   {
10115   /* If we do not have a first code unit, see if there is one that is asserted
10116   (these are not saved during the compile because they can cause conflicts with
10117   actual literals that follow). */
10118 
10119   if (firstcuflags < 0)
10120     firstcu = find_firstassertedcu(codestart, &firstcuflags, 0);
10121 
10122   /* Save the data for a first code unit. */
10123 
10124   if (firstcuflags >= 0)
10125     {
10126     re->first_codeunit = firstcu;
10127     re->flags |= PCRE2_FIRSTSET;
10128 
10129     /* Handle caseless first code units. */
10130 
10131     if ((firstcuflags & REQ_CASELESS) != 0)
10132       {
10133       if (firstcu < 128 || (!utf && firstcu < 255))
10134         {
10135         if (cb.fcc[firstcu] != firstcu) re->flags |= PCRE2_FIRSTCASELESS;
10136         }
10137 
10138       /* The first code unit is >= 128 in UTF mode, or >= 255 otherwise. In
10139       8-bit UTF mode, codepoints in the range 128-255 are introductory code
10140       points and cannot have another case. In 16-bit and 32-bit modes, we can
10141       check wide characters when UTF (and therefore UCP) is supported. */
10142 
10143 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
10144       else if (firstcu <= MAX_UTF_CODE_POINT &&
10145                UCD_OTHERCASE(firstcu) != firstcu)
10146         re->flags |= PCRE2_FIRSTCASELESS;
10147 #endif
10148       }
10149     }
10150 
10151   /* When there is no first code unit, for non-anchored patterns, see if we can
10152   set the PCRE2_STARTLINE flag. This is helpful for multiline matches when all
10153   branches start with ^ and also when all branches start with non-atomic .* for
10154   non-DOTALL matches when *PRUNE and *SKIP are not present. (There is an option
10155   that disables this case.) */
10156 
10157   else if ((re->overall_options & PCRE2_ANCHORED) == 0 &&
10158            is_startline(codestart, 0, &cb, 0, FALSE))
10159     re->flags |= PCRE2_STARTLINE;
10160 
10161   /* Handle the "required code unit", if one is set. In the case of an anchored
10162   pattern, do this only if it follows a variable length item in the pattern. */
10163 
10164   if (reqcuflags >= 0 &&
10165        ((re->overall_options & PCRE2_ANCHORED) == 0 ||
10166         (reqcuflags & REQ_VARY) != 0))
10167     {
10168     re->last_codeunit = reqcu;
10169     re->flags |= PCRE2_LASTSET;
10170 
10171     /* Handle caseless required code units as for first code units (above). */
10172 
10173     if ((reqcuflags & REQ_CASELESS) != 0)
10174       {
10175       if (reqcu < 128 || (!utf && reqcu < 255))
10176         {
10177         if (cb.fcc[reqcu] != reqcu) re->flags |= PCRE2_LASTCASELESS;
10178         }
10179 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
10180       else if (reqcu <= MAX_UTF_CODE_POINT && UCD_OTHERCASE(reqcu) != reqcu)
10181         re->flags |= PCRE2_LASTCASELESS;
10182 #endif
10183       }
10184     }
10185 
10186   /* Finally, study the compiled pattern to set up information such as a bitmap
10187   of starting code units and a minimum matching length. */
10188 
10189   if (PRIV(study)(re) != 0)
10190     {
10191     errorcode = ERR31;
10192     goto HAD_CB_ERROR;
10193     }
10194   }   /* End of start-of-match optimizations. */
10195 
10196 /* Control ends up here in all cases. When running under valgrind, make a
10197 pattern's terminating zero defined again. If memory was obtained for the parsed
10198 version of the pattern, free it before returning. Also free the list of named
10199 groups if a larger one had to be obtained, and likewise the group information
10200 vector. */
10201 
10202 EXIT:
10203 #ifdef SUPPORT_VALGRIND
10204 if (zero_terminated) VALGRIND_MAKE_MEM_DEFINED(pattern + patlen, CU2BYTES(1));
10205 #endif
10206 if (cb.parsed_pattern != stack_parsed_pattern)
10207   ccontext->memctl.free(cb.parsed_pattern, ccontext->memctl.memory_data);
10208 if (cb.named_group_list_size > NAMED_GROUP_LIST_SIZE)
10209   ccontext->memctl.free((void *)cb.named_groups, ccontext->memctl.memory_data);
10210 if (cb.groupinfo != stack_groupinfo)
10211   ccontext->memctl.free((void *)cb.groupinfo, ccontext->memctl.memory_data);
10212 return re;    /* Will be NULL after an error */
10213 
10214 /* Errors discovered in parse_regex() set the offset value in the compile
10215 block. Errors discovered before it is called must compute it from the ptr
10216 value. After parse_regex() is called, the offset in the compile block is set to
10217 the end of the pattern, but certain errors in compile_regex() may reset it if
10218 an offset is available in the parsed pattern. */
10219 
10220 HAD_CB_ERROR:
10221 ptr = pattern + cb.erroroffset;
10222 
10223 HAD_EARLY_ERROR:
10224 *erroroffset = ptr - pattern;
10225 
10226 HAD_ERROR:
10227 *errorptr = errorcode;
10228 pcre2_code_free(re);
10229 re = NULL;
10230 goto EXIT;
10231 }
10232 
10233 /* End of pcre2_compile.c */
10234