• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*************************************************
2 *      Perl-Compatible Regular Expressions       *
3 *************************************************/
4 
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7 
8                        Written by Philip Hazel
9      Original API code Copyright (c) 1997-2012 University of Cambridge
10           New API code Copyright (c) 2016-2020 University of Cambridge
11 
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
15 
16     * Redistributions of source code must retain the above copyright notice,
17       this list of conditions and the following disclaimer.
18 
19     * Redistributions in binary form must reproduce the above copyright
20       notice, this list of conditions and the following disclaimer in the
21       documentation and/or other materials provided with the distribution.
22 
23     * Neither the name of the University of Cambridge nor the names of its
24       contributors may be used to endorse or promote products derived from
25       this software without specific prior written permission.
26 
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
39 */
40 
41 
42 #ifdef HAVE_CONFIG_H
43 #include "config.h"
44 #endif
45 
46 #define NLBLOCK cb             /* Block containing newline information */
47 #define PSSTART start_pattern  /* Field containing processed string start */
48 #define PSEND   end_pattern    /* Field containing processed string end */
49 
50 #include "pcre2_internal.h"
51 
52 /* In rare error cases debugging might require calling pcre2_printint(). */
53 
54 #if 0
55 #ifdef EBCDIC
56 #define PRINTABLE(c) ((c) >= 64 && (c) < 255)
57 #else
58 #define PRINTABLE(c) ((c) >= 32 && (c) < 127)
59 #endif
60 #include "pcre2_printint.c"
61 #define DEBUG_CALL_PRINTINT
62 #endif
63 
64 /* Other debugging code can be enabled by these defines. */
65 
66 /* #define DEBUG_SHOW_CAPTURES */
67 /* #define DEBUG_SHOW_PARSED */
68 
69 /* There are a few things that vary with different code unit sizes. Handle them
70 by defining macros in order to minimize #if usage. */
71 
72 #if PCRE2_CODE_UNIT_WIDTH == 8
73 #define STRING_UTFn_RIGHTPAR     STRING_UTF8_RIGHTPAR, 5
74 #define XDIGIT(c)                xdigitab[c]
75 
76 #else  /* Either 16-bit or 32-bit */
77 #define XDIGIT(c)                (MAX_255(c)? xdigitab[c] : 0xff)
78 
79 #if PCRE2_CODE_UNIT_WIDTH == 16
80 #define STRING_UTFn_RIGHTPAR     STRING_UTF16_RIGHTPAR, 6
81 
82 #else  /* 32-bit */
83 #define STRING_UTFn_RIGHTPAR     STRING_UTF32_RIGHTPAR, 6
84 #endif
85 #endif
86 
87 /* Macros to store and retrieve a PCRE2_SIZE value in the parsed pattern, which
88 consists of uint32_t elements. Assume that if uint32_t can't hold it, two of
89 them will be able to (i.e. assume a 64-bit world). */
90 
91 #if PCRE2_SIZE_MAX <= UINT32_MAX
92 #define PUTOFFSET(s,p) *p++ = s
93 #define GETOFFSET(s,p) s = *p++
94 #define GETPLUSOFFSET(s,p) s = *(++p)
95 #define READPLUSOFFSET(s,p) s = p[1]
96 #define SKIPOFFSET(p) p++
97 #define SIZEOFFSET 1
98 #else
99 #define PUTOFFSET(s,p) \
100   { *p++ = (uint32_t)(s >> 32); *p++ = (uint32_t)(s & 0xffffffff); }
101 #define GETOFFSET(s,p) \
102   { s = ((PCRE2_SIZE)p[0] << 32) | (PCRE2_SIZE)p[1]; p += 2; }
103 #define GETPLUSOFFSET(s,p) \
104   { s = ((PCRE2_SIZE)p[1] << 32) | (PCRE2_SIZE)p[2]; p += 2; }
105 #define READPLUSOFFSET(s,p) \
106   { s = ((PCRE2_SIZE)p[1] << 32) | (PCRE2_SIZE)p[2]; }
107 #define SKIPOFFSET(p) p += 2
108 #define SIZEOFFSET 2
109 #endif
110 
111 /* Macros for manipulating parsed pattern elements: META_CODE keeps the top 16
112 bits, META_DATA the low 16, and META_DIFF is the gap between two codes. */
113 #define META_CODE(x)   ((x) & 0xffff0000u)
114 #define META_DATA(x)   ((x) & 0x0000ffffu)
115 #define META_DIFF(x,y) (((x)-(y))>>16)
116 
117 /* Forward declarations (prototypes, not definitions) to allow mutual recursion */
118 
119 #ifdef SUPPORT_UNICODE
120 static unsigned int
121   add_list_to_class_internal(uint8_t *, PCRE2_UCHAR **, uint32_t,
122     compile_block *, const uint32_t *, unsigned int);
123 #endif
124 
125 static int
126   compile_regex(uint32_t, PCRE2_UCHAR **, uint32_t **, int *, uint32_t,
127     uint32_t *, int32_t *, uint32_t *, int32_t *, branch_chain *,
128     compile_block *, PCRE2_SIZE *);
129 
130 static int
131   get_branchlength(uint32_t **, int *, int *, parsed_recurse_check *,
132     compile_block *);
133 
134 static BOOL
135   set_lookbehind_lengths(uint32_t **, int *, int *, parsed_recurse_check *,
136     compile_block *);
137 
138 static int
139   check_lookbehinds(uint32_t *, uint32_t **, parsed_recurse_check *,
140     compile_block *);
141 
142 
143 /*************************************************
144 *      Code parameters and static tables         *
145 *************************************************/
146 
147 #define MAX_GROUP_NUMBER   65535u
148 #define MAX_REPEAT_COUNT   65535u
149 #define REPEAT_UNLIMITED   (MAX_REPEAT_COUNT+1)
150 
151 /* COMPILE_WORK_SIZE specifies the size of stack workspace, which is used in
152 different ways in the different pattern scans. The parsing and group-
153 identifying pre-scan uses it to handle nesting, and needs it to be 16-bit
154 aligned for this. Having defined the size in code units, we set up
155 C16_WORK_SIZE as the number of elements in the 16-bit vector.
156 
157 During the first compiling phase, when determining how much memory is required,
158 the regex is partly compiled into this space, but the compiled parts are
159 discarded as soon as they can be, so that hopefully there will never be an
160 overrun. The code does, however, check for an overrun, which can occur for
161 pathological patterns. The size of the workspace depends on LINK_SIZE because
162 the length of compiled items varies with this.
163 
164 In the real compile phase, this workspace is not currently used. */
165 
166 #define COMPILE_WORK_SIZE (3000*LINK_SIZE)   /* Size in code units */
167 
168 #define C16_WORK_SIZE \
169   ((COMPILE_WORK_SIZE * sizeof(PCRE2_UCHAR))/sizeof(uint16_t))
170 
171 /* A uint32_t vector is used for caching information about the size of
172 capturing groups, to improve performance. A default is created on the stack of
173 this size. */
174 
175 #define GROUPINFO_DEFAULT_SIZE 256
176 
177 /* The overrun tests check for a slightly smaller size so that they detect the
178 overrun before it actually does run off the end of the data block. */
179 
180 #define WORK_SIZE_SAFETY_MARGIN (100)
181 
182 /* This value determines the size of the initial vector that is used for
183 remembering named groups during the pre-compile. It is allocated on the stack,
184 but if it is too small, it is expanded, in a similar way to the workspace. The
185 value is the number of slots in the list. */
186 
187 #define NAMED_GROUP_LIST_SIZE  20
188 
189 /* The pre-compiling pass over the pattern creates a parsed pattern in a vector
190 of uint32_t. For short patterns this lives on the stack, with this size. Heap
191 memory is used for longer patterns. */
192 
193 #define PARSED_PATTERN_DEFAULT_SIZE 1024
194 
195 /* Maximum length value to check against when making sure that the variable
196 that holds the compiled pattern length does not overflow. We make it a bit less
197 than INT_MAX to allow for adding in group terminating code units, so that we
198 don't have to check them every time. */
199 
200 #define OFLOW_MAX (INT_MAX - 20)
201 
202 /* Code values for parsed patterns, which are stored in a vector of 32-bit
203 unsigned ints. Values less than META_END are literal data values. The coding
204 for identifying the item is in the top 16-bits, leaving 16 bits for the
205 additional data that some of them need. The META_CODE, META_DATA, and META_DIFF
206 macros are used to manipulate parsed pattern elements.
207 
208 NOTE: When these definitions are changed, the table of extra lengths for each
209 code (meta_extra_lengths, just below) must be updated to remain in step. */
210 
211 #define META_END              0x80000000u  /* End of pattern */
212 
213 #define META_ALT              0x80010000u  /* alternation */
214 #define META_ATOMIC           0x80020000u  /* atomic group */
215 #define META_BACKREF          0x80030000u  /* Back ref */
216 #define META_BACKREF_BYNAME   0x80040000u  /* \k'name' */
217 #define META_BIGVALUE         0x80050000u  /* Next is a literal > META_END */
218 #define META_CALLOUT_NUMBER   0x80060000u  /* (?C with numerical argument */
219 #define META_CALLOUT_STRING   0x80070000u  /* (?C with string argument */
220 #define META_CAPTURE          0x80080000u  /* Capturing parenthesis */
221 #define META_CIRCUMFLEX       0x80090000u  /* ^ metacharacter */
222 #define META_CLASS            0x800a0000u  /* start non-empty class */
223 #define META_CLASS_EMPTY      0x800b0000u  /* empty class */
224 #define META_CLASS_EMPTY_NOT  0x800c0000u  /* negative empty class */
225 #define META_CLASS_END        0x800d0000u  /* end of non-empty class */
226 #define META_CLASS_NOT        0x800e0000u  /* start non-empty negative class */
227 #define META_COND_ASSERT      0x800f0000u  /* (?(?assertion)... */
228 #define META_COND_DEFINE      0x80100000u  /* (?(DEFINE)... */
229 #define META_COND_NAME        0x80110000u  /* (?(<name>)... */
230 #define META_COND_NUMBER      0x80120000u  /* (?(digits)... */
231 #define META_COND_RNAME       0x80130000u  /* (?(R&name)... */
232 #define META_COND_RNUMBER     0x80140000u  /* (?(Rdigits)... */
233 #define META_COND_VERSION     0x80150000u  /* (?(VERSION<op>x.y)... */
234 #define META_DOLLAR           0x80160000u  /* $ metacharacter */
235 #define META_DOT              0x80170000u  /* . metacharacter */
236 #define META_ESCAPE           0x80180000u  /* \d and friends */
237 #define META_KET              0x80190000u  /* closing parenthesis */
238 #define META_NOCAPTURE        0x801a0000u  /* no capture parens */
239 #define META_OPTIONS          0x801b0000u  /* (?i) and friends */
240 #define META_POSIX            0x801c0000u  /* POSIX class item */
241 #define META_POSIX_NEG        0x801d0000u  /* negative POSIX class item */
242 #define META_RANGE_ESCAPED    0x801e0000u  /* range with at least one escape */
243 #define META_RANGE_LITERAL    0x801f0000u  /* range defined literally */
244 #define META_RECURSE          0x80200000u  /* Recursion */
245 #define META_RECURSE_BYNAME   0x80210000u  /* (?&name) */
246 #define META_SCRIPT_RUN       0x80220000u  /* (*script_run:...) */
247 
248 /* These must be kept together to make it easy to check that an assertion
249 is present where expected in a conditional group. */
250 
251 #define META_LOOKAHEAD        0x80230000u  /* (?= */
252 #define META_LOOKAHEADNOT     0x80240000u  /* (?! */
253 #define META_LOOKBEHIND       0x80250000u  /* (?<= */
254 #define META_LOOKBEHINDNOT    0x80260000u  /* (?<! */
255 
256 /* These cannot be conditions */
257 
258 #define META_LOOKAHEAD_NA     0x80270000u  /* (*napla: */
259 #define META_LOOKBEHIND_NA    0x80280000u  /* (*naplb: */
260 
261 /* These must be kept in this order, with consecutive values, and the _ARG
262 versions of COMMIT, PRUNE, SKIP, and THEN immediately after their non-argument
263 versions. */
264 
265 #define META_MARK             0x80290000u  /* (*MARK) */
266 #define META_ACCEPT           0x802a0000u  /* (*ACCEPT) */
267 #define META_FAIL             0x802b0000u  /* (*FAIL) */
268 #define META_COMMIT           0x802c0000u  /* These               */
269 #define META_COMMIT_ARG       0x802d0000u  /*   pairs             */
270 #define META_PRUNE            0x802e0000u  /*     must            */
271 #define META_PRUNE_ARG        0x802f0000u  /*       be            */
272 #define META_SKIP             0x80300000u  /*         kept        */
273 #define META_SKIP_ARG         0x80310000u  /*           in        */
274 #define META_THEN             0x80320000u  /*             this    */
275 #define META_THEN_ARG         0x80330000u  /*               order */
276 
277 /* These must be kept in groups of adjacent 3 values, and all together. */
278 
279 #define META_ASTERISK         0x80340000u  /* *  */
280 #define META_ASTERISK_PLUS    0x80350000u  /* *+ */
281 #define META_ASTERISK_QUERY   0x80360000u  /* *? */
282 #define META_PLUS             0x80370000u  /* +  */
283 #define META_PLUS_PLUS        0x80380000u  /* ++ */
284 #define META_PLUS_QUERY       0x80390000u  /* +? */
285 #define META_QUERY            0x803a0000u  /* ?  */
286 #define META_QUERY_PLUS       0x803b0000u  /* ?+ */
287 #define META_QUERY_QUERY      0x803c0000u  /* ?? */
288 #define META_MINMAX           0x803d0000u  /* {n,m}  repeat */
289 #define META_MINMAX_PLUS      0x803e0000u  /* {n,m}+ repeat */
290 #define META_MINMAX_QUERY     0x803f0000u  /* {n,m}? repeat */
291 
292 #define META_FIRST_QUANTIFIER META_ASTERISK
293 #define META_LAST_QUANTIFIER  META_MINMAX_QUERY
294 
295 /* This is a special "meta code" that is used only to distinguish (*asr: from
296 (*sr: in the table of alphabetic assertions. It is never stored in the parsed
297 pattern because (*asr: is turned into (*sr:(*atomic: at that stage. There is
298 therefore no need for it to have a length entry, so use a high value. */
299 
300 #define META_ATOMIC_SCRIPT_RUN 0x8fff0000u
301 
302 /* Table of extra lengths for each of the meta codes, in the same order as the
303 definitions above, with which it must be kept in step. For some items these
304 values are a basic length to which a variable amount has to be added. */
305 
306 static const unsigned char meta_extra_lengths[] = {
307   0,             /* META_END */
308   0,             /* META_ALT */
309   0,             /* META_ATOMIC */
310   0,             /* META_BACKREF - more if group is >= 10 */
311   1+SIZEOFFSET,  /* META_BACKREF_BYNAME */
312   1,             /* META_BIGVALUE */
313   3,             /* META_CALLOUT_NUMBER */
314   3+SIZEOFFSET,  /* META_CALLOUT_STRING */
315   0,             /* META_CAPTURE */
316   0,             /* META_CIRCUMFLEX */
317   0,             /* META_CLASS */
318   0,             /* META_CLASS_EMPTY */
319   0,             /* META_CLASS_EMPTY_NOT */
320   0,             /* META_CLASS_END */
321   0,             /* META_CLASS_NOT */
322   0,             /* META_COND_ASSERT */
323   SIZEOFFSET,    /* META_COND_DEFINE */
324   1+SIZEOFFSET,  /* META_COND_NAME */
325   1+SIZEOFFSET,  /* META_COND_NUMBER */
326   1+SIZEOFFSET,  /* META_COND_RNAME */
327   1+SIZEOFFSET,  /* META_COND_RNUMBER */
328   3,             /* META_COND_VERSION */
329   0,             /* META_DOLLAR */
330   0,             /* META_DOT */
331   0,             /* META_ESCAPE - more for ESC_P, ESC_p, ESC_g, ESC_k */
332   0,             /* META_KET */
333   0,             /* META_NOCAPTURE */
334   1,             /* META_OPTIONS */
335   1,             /* META_POSIX */
336   1,             /* META_POSIX_NEG */
337   0,             /* META_RANGE_ESCAPED */
338   0,             /* META_RANGE_LITERAL */
339   SIZEOFFSET,    /* META_RECURSE */
340   1+SIZEOFFSET,  /* META_RECURSE_BYNAME */
341   0,             /* META_SCRIPT_RUN */
342   0,             /* META_LOOKAHEAD */
343   0,             /* META_LOOKAHEADNOT */
344   SIZEOFFSET,    /* META_LOOKBEHIND */
345   SIZEOFFSET,    /* META_LOOKBEHINDNOT */
346   0,             /* META_LOOKAHEAD_NA */
347   SIZEOFFSET,    /* META_LOOKBEHIND_NA */
348   1,             /* META_MARK - plus the string length */
349   0,             /* META_ACCEPT */
350   0,             /* META_FAIL */
351   0,             /* META_COMMIT */
352   1,             /* META_COMMIT_ARG - plus the string length */
353   0,             /* META_PRUNE */
354   1,             /* META_PRUNE_ARG - plus the string length */
355   0,             /* META_SKIP */
356   1,             /* META_SKIP_ARG - plus the string length */
357   0,             /* META_THEN */
358   1,             /* META_THEN_ARG - plus the string length */
359   0,             /* META_ASTERISK */
360   0,             /* META_ASTERISK_PLUS */
361   0,             /* META_ASTERISK_QUERY */
362   0,             /* META_PLUS */
363   0,             /* META_PLUS_PLUS */
364   0,             /* META_PLUS_QUERY */
365   0,             /* META_QUERY */
366   0,             /* META_QUERY_PLUS */
367   0,             /* META_QUERY_QUERY */
368   2,             /* META_MINMAX */
369   2,             /* META_MINMAX_PLUS */
370   2              /* META_MINMAX_QUERY */
371 };
372 
373 /* Types for skipping parts of a parsed pattern. */
374 
375 enum { PSKIP_ALT, PSKIP_CLASS, PSKIP_KET };
376 
377 /* Macro for setting individual bits in class bitmaps. It took some
378 experimenting to figure out how to stop gcc 5.3.0 from warning with
379 -Wconversion. This version gets a warning:
380 
381   #define SETBIT(a,b) a[(b)/8] |= (uint8_t)(1u << ((b)&7))
382 
383 Let's hope the apparently less efficient version isn't actually so bad if the
384 compiler is clever with identical subexpressions. */
385 
386 #define SETBIT(a,b) a[(b)/8] = (uint8_t)(a[(b)/8] | (1u << ((b)&7)))
387 
388 /* Private flags added to firstcu and reqcu. */
389 
390 #define REQ_CASELESS    (1u << 0)       /* Indicates caselessness */
391 #define REQ_VARY        (1u << 1)       /* reqcu followed non-literal item */
392 /* Negative values for the firstcu and reqcu flags */
393 #define REQ_UNSET       (-2)            /* Not yet found anything */
394 #define REQ_NONE        (-1)            /* Found not fixed char */
395 
396 /* These flags are used in the groupinfo vector. */
397 
398 #define GI_SET_FIXED_LENGTH    0x80000000u
399 #define GI_NOT_FIXED_LENGTH    0x40000000u
400 #define GI_FIXED_LENGTH_MASK   0x0000ffffu
401 
402 /* This simple test for a decimal digit works for both ASCII/Unicode and EBCDIC
403 and is fast (a good compiler can turn it into a subtraction and unsigned
404 comparison). */
405 
406 #define IS_DIGIT(x) ((x) >= CHAR_0 && (x) <= CHAR_9)
407 
408 /* Table to identify hex digits. The tables in chartables are dependent on the
409 locale, and may mark arbitrary characters as digits. We want to recognize only
410 0-9, a-f, and A-F as hex digits, which is why we have a private table here. It
411 costs 256 bytes, but it is a lot faster than doing character value tests (at
412 least in some simple cases I timed), and in some applications one wants PCRE2
413 to compile efficiently as well as match efficiently. The value in the table is
414 the binary hex digit value, or 0xff for non-hex digits. */
415 
416 /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
417 UTF-8 mode. Indexed by character code 0-255; 0xff marks a non-hex digit. */
418 
419 #ifndef EBCDIC
420 static const uint8_t xdigitab[] =
421   {
422   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*   0-  7 */
423   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*   8- 15 */
424   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  16- 23 */
425   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  24- 31 */
426   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*    - '  */
427   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  ( - /  */
428   0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, /*  0 - 7  */
429   0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff, /*  8 - ?  */
430   0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /*  @ - G  */
431   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  H - O  */
432   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  P - W  */
433   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  X - _  */
434   0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /*  ` - g  */
435   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  h - o  */
436   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  p - w  */
437   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  x -127 */
438   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 128-135 */
439   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 136-143 */
440   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 144-151 */
441   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 152-159 */
442   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 160-167 */
443   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 168-175 */
444   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 176-183 */
445   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 184-191 */
446   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 192-199 */
447   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 200-207 */
448   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 208-215 */
449   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 216-223 */
450   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 224-231 */
451   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 232-239 */
452   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 240-247 */
453   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff};/* 248-255 */
454 
455 #else
456 
457 /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
458 
459 static const uint8_t xdigitab[] =
460   {
461   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*   0-  7  0 */
462   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*   8- 15    */
463   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  16- 23 10 */
464   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  24- 31    */
465   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  32- 39 20 */
466   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  40- 47    */
467   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  48- 55 30 */
468   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  56- 63    */
469   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*    - 71 40 */
470   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  72- |     */
471   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  & - 87 50 */
472   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  88- 95    */
473   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  - -103 60 */
474   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 104- ?     */
475   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 112-119 70 */
476   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 120- "     */
477   0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /* 128- g  80 */
478   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  h -143    */
479   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 144- p  90 */
480   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  q -159    */
481   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 160- x  A0 */
482   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  y -175    */
483   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  ^ -183 B0 */
484   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 184-191    */
485   0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /*  { - G  C0 */
486   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  H -207    */
487   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  } - P  D0 */
488   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  Q -223    */
489   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  \ - X  E0 */
490   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  Y -239    */
491   0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, /*  0 - 7  F0 */
492   0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff};/*  8 -255    */
493 #endif  /* EBCDIC */
494 
495 
496 /* Table for handling alphanumeric escaped characters. Positive returns are
497 simple data values; negative values are for special things like \d and so on.
498 Zero means further processing is needed (for things like \x), or the escape is
499 invalid. */
500 
501 /* This is the "normal" table for ASCII systems or for EBCDIC systems running
502 in UTF-8 mode. It runs from '0' to 'z'. */
503 
504 #ifndef EBCDIC
505 #define ESCAPES_FIRST       CHAR_0
506 #define ESCAPES_LAST        CHAR_z
507 #define UPPER_CASE(c)       (c-32)
508 
509 static const short int escapes[] = {
510      0,                       0,
511      0,                       0,
512      0,                       0,
513      0,                       0,
514      0,                       0,
515      CHAR_COLON,              CHAR_SEMICOLON,
516      CHAR_LESS_THAN_SIGN,     CHAR_EQUALS_SIGN,
517      CHAR_GREATER_THAN_SIGN,  CHAR_QUESTION_MARK,
518      CHAR_COMMERCIAL_AT,      -ESC_A,
519      -ESC_B,                  -ESC_C,
520      -ESC_D,                  -ESC_E,
521      0,                       -ESC_G,
522      -ESC_H,                  0,
523      0,                       -ESC_K,
524      0,                       0,
525      -ESC_N,                  0,
526      -ESC_P,                  -ESC_Q,
527      -ESC_R,                  -ESC_S,
528      0,                       0,
529      -ESC_V,                  -ESC_W,
530      -ESC_X,                  0,
531      -ESC_Z,                  CHAR_LEFT_SQUARE_BRACKET,
532      CHAR_BACKSLASH,          CHAR_RIGHT_SQUARE_BRACKET,
533      CHAR_CIRCUMFLEX_ACCENT,  CHAR_UNDERSCORE,
534      CHAR_GRAVE_ACCENT,       CHAR_BEL,
535      -ESC_b,                  0,
536      -ESC_d,                  CHAR_ESC,
537      CHAR_FF,                 0,
538      -ESC_h,                  0,
539      0,                       -ESC_k,
540      0,                       0,
541      CHAR_LF,                 0,
542      -ESC_p,                  0,
543      CHAR_CR,                 -ESC_s,
544      CHAR_HT,                 0,
545      -ESC_v,                  -ESC_w,
546      0,                       0,
547      -ESC_z
548 };
549 
550 #else
551 
552 /* This is the "abnormal" table for EBCDIC systems without UTF-8 support.
553 It runs from 'a' to '9'. For some minimal testing of EBCDIC features, the code
554 is sometimes compiled on an ASCII system. In this case, we must not use CHAR_a
555 because it is defined as 'a', which of course picks up the ASCII value. */
556 
557 #if 'a' == 0x81                    /* Check for a real EBCDIC environment */
558 #define ESCAPES_FIRST       CHAR_a
559 #define ESCAPES_LAST        CHAR_9
560 #define UPPER_CASE(c)       (c+64)
561 #else                              /* Testing in an ASCII environment */
562 #define ESCAPES_FIRST  ((unsigned char)'\x81')   /* EBCDIC 'a' */
563 #define ESCAPES_LAST   ((unsigned char)'\xf9')   /* EBCDIC '9' */
564 #define UPPER_CASE(c)  (c-32)
565 #endif
566 
567 static const short int escapes[] = {
568 /*  80 */         CHAR_BEL, -ESC_b,       0, -ESC_d, CHAR_ESC, CHAR_FF,      0,
569 /*  88 */ -ESC_h,        0,      0,     '{',      0,        0,       0,      0,
570 /*  90 */      0,        0, -ESC_k,       0,      0,  CHAR_LF,       0, -ESC_p,
571 /*  98 */      0,  CHAR_CR,      0,     '}',      0,        0,       0,      0,
572 /*  A0 */      0,      '~', -ESC_s, CHAR_HT,      0,   -ESC_v,  -ESC_w,      0,
573 /*  A8 */      0,   -ESC_z,      0,       0,      0,      '[',       0,      0,
574 /*  B0 */      0,        0,      0,       0,      0,        0,       0,      0,
575 /*  B8 */      0,        0,      0,       0,      0,      ']',     '=',    '-',
576 /*  C0 */    '{',   -ESC_A, -ESC_B,  -ESC_C, -ESC_D,   -ESC_E,       0, -ESC_G,
577 /*  C8 */ -ESC_H,        0,      0,       0,      0,        0,       0,      0,
578 /*  D0 */    '}',        0, -ESC_K,       0,      0,   -ESC_N,       0, -ESC_P,
579 /*  D8 */ -ESC_Q,   -ESC_R,      0,       0,      0,        0,       0,      0,
580 /*  E0 */   '\\',        0, -ESC_S,       0,      0,   -ESC_V,  -ESC_W, -ESC_X,
581 /*  E8 */      0,   -ESC_Z,      0,       0,      0,        0,       0,      0,
582 /*  F0 */      0,        0,      0,       0,      0,        0,       0,      0,
583 /*  F8 */      0,        0
584 };
585 
586 /* We also need a read-only table of the characters that may follow \c in an
587 EBCDIC environment for characters 0-31. */
588 static const unsigned char ebcdic_escape_c[] =
589   "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_";
590 
591 #endif   /* EBCDIC */
592 
593 
594 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
595 searched linearly. Put all the names into a single string, in order to reduce
596 the number of relocations when a shared library is dynamically linked. The
597 string is built from string macros so that it works in UTF-8 mode on EBCDIC
598 platforms. */
599 
600 typedef struct verbitem {
601   unsigned int len;          /* Length of verb name */
602   uint32_t meta;             /* Base META_ code */
603   int has_arg;               /* Argument requirement */
604 } verbitem;
605 
606 static const char verbnames[] =
607   "\0"                       /* Empty name is a shorthand for MARK */
608   STRING_MARK0
609   STRING_ACCEPT0
610   STRING_F0
611   STRING_FAIL0
612   STRING_COMMIT0
613   STRING_PRUNE0
614   STRING_SKIP0
615   STRING_THEN;
616 
617 static const verbitem verbs[] = {
618   { 0, META_MARK,   +1 },  /* > 0 => must have an argument */
619   { 4, META_MARK,   +1 },
620   { 6, META_ACCEPT, -1 },  /* < 0 => Optional argument, convert to pre-MARK */
621   { 1, META_FAIL,   -1 },
622   { 4, META_FAIL,   -1 },
623   { 6, META_COMMIT,  0 },
624   { 5, META_PRUNE,   0 },  /* Optional argument; bump META code if found */
625   { 4, META_SKIP,    0 },
626   { 4, META_THEN,    0 }
627 };
628 
629 static const int verbcount = sizeof(verbs)/sizeof(verbitem);
630 
631 /* Verb opcodes, indexed by their META code offset from META_MARK. */
632 
633 static const uint32_t verbops[] = {
634   OP_MARK, OP_ACCEPT, OP_FAIL, OP_COMMIT, OP_COMMIT_ARG, OP_PRUNE,
635   OP_PRUNE_ARG, OP_SKIP, OP_SKIP_ARG, OP_THEN, OP_THEN_ARG };
636 
637 /* Table of "alpha assertions" like (*pla:...), similar to the (*VERB) table. */
638 
639 typedef struct alasitem {
640   unsigned int len;          /* Length of name */
641   uint32_t meta;             /* Base META_ code */
642 } alasitem;
643 
644 static const char alasnames[] =
645   STRING_pla0
646   STRING_plb0
647   STRING_napla0
648   STRING_naplb0
649   STRING_nla0
650   STRING_nlb0
651   STRING_positive_lookahead0
652   STRING_positive_lookbehind0
653   STRING_non_atomic_positive_lookahead0
654   STRING_non_atomic_positive_lookbehind0
655   STRING_negative_lookahead0
656   STRING_negative_lookbehind0
657   STRING_atomic0
658   STRING_sr0
659   STRING_asr0
660   STRING_script_run0
661   STRING_atomic_script_run;
662 
663 static const alasitem alasmeta[] = {
664   {  3, META_LOOKAHEAD         },
665   {  3, META_LOOKBEHIND        },
666   {  5, META_LOOKAHEAD_NA      },
667   {  5, META_LOOKBEHIND_NA     },
668   {  3, META_LOOKAHEADNOT      },
669   {  3, META_LOOKBEHINDNOT     },
670   { 18, META_LOOKAHEAD         },
671   { 19, META_LOOKBEHIND        },
672   { 29, META_LOOKAHEAD_NA      },
673   { 30, META_LOOKBEHIND_NA     },
674   { 18, META_LOOKAHEADNOT      },
675   { 19, META_LOOKBEHINDNOT     },
676   {  6, META_ATOMIC            },
677   {  2, META_SCRIPT_RUN        }, /* sr = script run */
678   {  3, META_ATOMIC_SCRIPT_RUN }, /* asr = atomic script run */
679   { 10, META_SCRIPT_RUN        }, /* script run */
680   { 17, META_ATOMIC_SCRIPT_RUN }  /* atomic script run */
681 };
682 
683 static const int alascount = sizeof(alasmeta)/sizeof(alasitem);
684 
685 /* Offsets from OP_STAR for case-independent and negative repeat opcodes. This
686 table is never modified, so it is declared const. */
687 static const uint32_t chartypeoffset[] = {
688   OP_STAR - OP_STAR,    OP_STARI - OP_STAR,
689   OP_NOTSTAR - OP_STAR, OP_NOTSTARI - OP_STAR };
690 
691 /* Tables of names of POSIX character classes and their lengths. The names are
692 now all in a single string, to reduce the number of relocations when a shared
693 library is dynamically loaded. The list of lengths is terminated by a zero
694 length entry. The first three must be alpha, lower, upper, as this is assumed
695 for handling case independence. The indices for graph, print, and punct are
696 needed, so identify them. */
697 
/* The fourteen POSIX class names in one string. The position of a name in
this list is its class index; posix_name_lengths[] holds the matching lengths
in the same order. */

static const char posix_names[] =
  STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
  STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
  STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
  STRING_word0  STRING_xdigit;
703 
/* Length of each POSIX class name, in the order of the names in posix_names[].
The final zero entry marks the end of the list. */

static const uint8_t posix_name_lengths[] = {
  5,    /*  0 alpha  */
  5,    /*  1 lower  */
  5,    /*  2 upper  */
  5,    /*  3 alnum  */
  5,    /*  4 ascii  */
  5,    /*  5 blank  */
  5,    /*  6 cntrl  */
  5,    /*  7 digit  */
  5,    /*  8 graph  */
  5,    /*  9 print  */
  5,    /* 10 punct  */
  5,    /* 11 space  */
  4,    /* 12 word   */
  6,    /* 13 xdigit */
  0     /* terminator */
};

/* Indices of the graph, print, and punct classes in the lists above. */

#define PC_GRAPH  8
#define PC_PRINT  9
#define PC_PUNCT 10
710 
711 /* Table of class bit maps for each POSIX class. Each class is formed from a
712 base map, with an optional addition or removal of another map. Then, for some
713 classes, there is some additional tweaking: for [:blank:] the vertical space
714 characters are removed, and for [:alpha:] and [:alnum:] the underscore
715 character is removed. The triples in the table consist of the base map offset,
716 second map offset or -1 if no second map, and a non-negative value for map
717 addition or a negative value for map subtraction (if there are two maps). The
718 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
719 remove vertical space characters, 2 => remove underscore. */
720 
/* Three values per POSIX class, in class-name order:
     base map offset, second map offset (or -1), sign/tweak value */

static const int posix_class_maps[] = {
  cbit_word,  cbit_digit, -2,             /* alpha */
  cbit_lower, -1,          0,             /* lower */
  cbit_upper, -1,          0,             /* upper */
  cbit_word,  -1,          2,             /* alnum - word without underscore */
  cbit_print, cbit_cntrl,  0,             /* ascii */
  cbit_space, -1,          1,             /* blank - a GNU extension */
  cbit_cntrl, -1,          0,             /* cntrl */
  cbit_digit, -1,          0,             /* digit */
  cbit_graph, -1,          0,             /* graph */
  cbit_print, -1,          0,             /* print */
  cbit_punct, -1,          0,             /* punct */
  cbit_space, -1,          0,             /* space */
  cbit_word,  -1,          0,             /* word - a Perl extension */
  cbit_xdigit,-1,          0              /* xdigit */
};
737 
738 #ifdef SUPPORT_UNICODE
739 
740 /* The POSIX class Unicode property substitutes that are used in UCP mode must
741 be in the order of the POSIX class names, defined above. */
742 
743 static int posix_substitutes[] = {
744   PT_GC, ucp_L,     /* alpha */
745   PT_PC, ucp_Ll,    /* lower */
746   PT_PC, ucp_Lu,    /* upper */
747   PT_ALNUM, 0,      /* alnum */
748   -1, 0,            /* ascii, treat as non-UCP */
749   -1, 1,            /* blank, treat as \h */
750   PT_PC, ucp_Cc,    /* cntrl */
751   PT_PC, ucp_Nd,    /* digit */
752   PT_PXGRAPH, 0,    /* graph */
753   PT_PXPRINT, 0,    /* print */
754   PT_PXPUNCT, 0,    /* punct */
755   PT_PXSPACE, 0,    /* space */   /* Xps is POSIX space, but from 8.34 */
756   PT_WORD, 0,       /* word  */   /* Perl and POSIX space are the same */
757   -1, 0             /* xdigit, treat as non-UCP */
758 };
759 #define POSIX_SUBSIZE (sizeof(posix_substitutes) / (2*sizeof(uint32_t)))
760 #endif  /* SUPPORT_UNICODE */
761 
762 /* Masks for checking option settings. When PCRE2_LITERAL is set, only a subset
763 are allowed. */
764 
/* The subset of compile options that remain allowed with PCRE2_LITERAL. */

#define PUBLIC_LITERAL_COMPILE_OPTIONS \
  (PCRE2_ANCHORED|PCRE2_AUTO_CALLOUT|PCRE2_CASELESS|PCRE2_ENDANCHORED| \
   PCRE2_FIRSTLINE|PCRE2_LITERAL|PCRE2_MATCH_INVALID_UTF| \
   PCRE2_NO_START_OPTIMIZE|PCRE2_NO_UTF_CHECK|PCRE2_USE_OFFSET_LIMIT|PCRE2_UTF)

/* The full set of compile options: the literal subset plus all the others. */

#define PUBLIC_COMPILE_OPTIONS \
  (PUBLIC_LITERAL_COMPILE_OPTIONS| \
   PCRE2_ALLOW_EMPTY_CLASS|PCRE2_ALT_BSUX|PCRE2_ALT_CIRCUMFLEX| \
   PCRE2_ALT_VERBNAMES|PCRE2_DOLLAR_ENDONLY|PCRE2_DOTALL|PCRE2_DUPNAMES| \
   PCRE2_EXTENDED|PCRE2_EXTENDED_MORE|PCRE2_MATCH_UNSET_BACKREF| \
   PCRE2_MULTILINE|PCRE2_NEVER_BACKSLASH_C|PCRE2_NEVER_UCP| \
   PCRE2_NEVER_UTF|PCRE2_NO_AUTO_CAPTURE|PCRE2_NO_AUTO_POSSESS| \
   PCRE2_NO_DOTSTAR_ANCHOR|PCRE2_UCP|PCRE2_UNGREEDY)

/* The same split for the PCRE2_EXTRA_ options: first the literal subset. */

#define PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS \
   (PCRE2_EXTRA_MATCH_LINE|PCRE2_EXTRA_MATCH_WORD)

/* The full set of PCRE2_EXTRA_ options. */

#define PUBLIC_COMPILE_EXTRA_OPTIONS \
   (PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS| \
    PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES|PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL| \
    PCRE2_EXTRA_ESCAPED_CR_IS_LF|PCRE2_EXTRA_ALT_BSUX)
786 
787 /* Compile time error code numbers. They are given names so that they can more
788 easily be tracked. When a new number is added, the tables called eint1 and
789 eint2 in pcre2posix.c may need to be updated, and a new error text must be
790 added to compile_error_texts in pcre2_error.c. */
791 
/* ERR0 is COMPILE_ERROR_BASE and the enumerators then count upwards, so ERRn
has the value COMPILE_ERROR_BASE + n. */

enum { ERR0 = COMPILE_ERROR_BASE,
       ERR1,  ERR2,  ERR3,  ERR4,  ERR5,  ERR6,  ERR7,  ERR8,  ERR9,  ERR10,
       ERR11, ERR12, ERR13, ERR14, ERR15, ERR16, ERR17, ERR18, ERR19, ERR20,
       ERR21, ERR22, ERR23, ERR24, ERR25, ERR26, ERR27, ERR28, ERR29, ERR30,
       ERR31, ERR32, ERR33, ERR34, ERR35, ERR36, ERR37, ERR38, ERR39, ERR40,
       ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49, ERR50,
       ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59, ERR60,
       ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68, ERR69, ERR70,
       ERR71, ERR72, ERR73, ERR74, ERR75, ERR76, ERR77, ERR78, ERR79, ERR80,
       ERR81, ERR82, ERR83, ERR84, ERR85, ERR86, ERR87, ERR88, ERR89, ERR90,
       ERR91, ERR92, ERR93, ERR94, ERR95, ERR96, ERR97, ERR98 };
803 
804 /* This is a table of start-of-pattern options such as (*UTF) and settings such
805 as (*LIMIT_MATCH=nnnn) and (*CRLF). For completeness and backward
806 compatibility, (*UTFn) is supported in the relevant libraries, but (*UTF) is
807 generic and always supported. */
808 
/* Kinds of start-of-pattern item. One of these is stored in the type field of
each pso_list[] entry and says how that entry's value field is to be used. */

enum { PSO_OPT,     /* Value is an option bit */
       PSO_FLG,     /* Value is a flag bit */
       PSO_NL,      /* Value is a newline type */
       PSO_BSR,     /* Value is a \R type */
       PSO_LIMH,    /* Read integer value for heap limit */
       PSO_LIMM,    /* Read integer value for match limit */
       PSO_LIMD };  /* Read integer value for depth limit */
816 
/* Description of one start-of-pattern item. */

struct pso {
  const uint8_t *name;       /* Text of the item */
  uint16_t length;           /* Length of that text */
  uint16_t type;             /* One of the PSO_ values */
  uint32_t value;            /* Option bit, flag bit, or setting value */
};

typedef struct pso pso;
823 
824 /* NB: STRING_UTFn_RIGHTPAR contains the length as well */
825 
/* Each row is: name text, name length, PSO_ type, value. The first row has no
explicit length because its name macro supplies the length too. */

static pso pso_list[] = {
  /* Option bits and flag bits */
  { (uint8_t *)STRING_UTFn_RIGHTPAR,                  PSO_OPT, PCRE2_UTF },
  { (uint8_t *)STRING_UTF_RIGHTPAR,                4, PSO_OPT, PCRE2_UTF },
  { (uint8_t *)STRING_UCP_RIGHTPAR,                4, PSO_OPT, PCRE2_UCP },
  { (uint8_t *)STRING_NOTEMPTY_RIGHTPAR,           9, PSO_FLG, PCRE2_NOTEMPTY_SET },
  { (uint8_t *)STRING_NOTEMPTY_ATSTART_RIGHTPAR,  17, PSO_FLG, PCRE2_NE_ATST_SET },
  { (uint8_t *)STRING_NO_AUTO_POSSESS_RIGHTPAR,   16, PSO_OPT, PCRE2_NO_AUTO_POSSESS },
  { (uint8_t *)STRING_NO_DOTSTAR_ANCHOR_RIGHTPAR, 18, PSO_OPT, PCRE2_NO_DOTSTAR_ANCHOR },
  { (uint8_t *)STRING_NO_JIT_RIGHTPAR,             7, PSO_FLG, PCRE2_NOJIT },
  { (uint8_t *)STRING_NO_START_OPT_RIGHTPAR,      13, PSO_OPT, PCRE2_NO_START_OPTIMIZE },
  /* Limits: an integer is read for these, so the table value is unused */
  { (uint8_t *)STRING_LIMIT_HEAP_EQ,              11, PSO_LIMH, 0 },
  { (uint8_t *)STRING_LIMIT_MATCH_EQ,             12, PSO_LIMM, 0 },
  { (uint8_t *)STRING_LIMIT_DEPTH_EQ,             12, PSO_LIMD, 0 },
  { (uint8_t *)STRING_LIMIT_RECURSION_EQ,         16, PSO_LIMD, 0 },
  /* Newline types */
  { (uint8_t *)STRING_CR_RIGHTPAR,                 3, PSO_NL,  PCRE2_NEWLINE_CR },
  { (uint8_t *)STRING_LF_RIGHTPAR,                 3, PSO_NL,  PCRE2_NEWLINE_LF },
  { (uint8_t *)STRING_CRLF_RIGHTPAR,               5, PSO_NL,  PCRE2_NEWLINE_CRLF },
  { (uint8_t *)STRING_ANY_RIGHTPAR,                4, PSO_NL,  PCRE2_NEWLINE_ANY },
  { (uint8_t *)STRING_NUL_RIGHTPAR,                4, PSO_NL,  PCRE2_NEWLINE_NUL },
  { (uint8_t *)STRING_ANYCRLF_RIGHTPAR,            8, PSO_NL,  PCRE2_NEWLINE_ANYCRLF },
  /* \R types */
  { (uint8_t *)STRING_BSR_ANYCRLF_RIGHTPAR,       12, PSO_BSR, PCRE2_BSR_ANYCRLF },
  { (uint8_t *)STRING_BSR_UNICODE_RIGHTPAR,       12, PSO_BSR, PCRE2_BSR_UNICODE }
};
849 
850 /* This table is used when converting repeating opcodes into possessified
851 versions as a result of an explicit possessive quantifier such as ++. A zero
852 value means there is no possessified version - in those cases the item in
853 question must be wrapped in ONCE brackets. The table is truncated at OP_CALLOUT
854 because all relevant opcodes are less than that. */
855 
/* Each entry is either the possessive opcode for the opcode named in the
comment beside it, or zero when there is none. The first 32 entries are all
zero; after that the entries are laid out in groups, one group per family of
repeat opcodes. */

static const uint8_t opcode_possessify[] = {
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0 - 15  */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 16 - 31 */

  0,                       /* NOTI */
  OP_POSSTAR, 0,           /* STAR, MINSTAR */
  OP_POSPLUS, 0,           /* PLUS, MINPLUS */
  OP_POSQUERY, 0,          /* QUERY, MINQUERY */
  OP_POSUPTO, 0,           /* UPTO, MINUPTO */
  0,                       /* EXACT */
  0, 0, 0, 0,              /* POS{STAR,PLUS,QUERY,UPTO} */

  OP_POSSTARI, 0,          /* STARI, MINSTARI */
  OP_POSPLUSI, 0,          /* PLUSI, MINPLUSI */
  OP_POSQUERYI, 0,         /* QUERYI, MINQUERYI */
  OP_POSUPTOI, 0,          /* UPTOI, MINUPTOI */
  0,                       /* EXACTI */
  0, 0, 0, 0,              /* POS{STARI,PLUSI,QUERYI,UPTOI} */

  OP_NOTPOSSTAR, 0,        /* NOTSTAR, NOTMINSTAR */
  OP_NOTPOSPLUS, 0,        /* NOTPLUS, NOTMINPLUS */
  OP_NOTPOSQUERY, 0,       /* NOTQUERY, NOTMINQUERY */
  OP_NOTPOSUPTO, 0,        /* NOTUPTO, NOTMINUPTO */
  0,                       /* NOTEXACT */
  0, 0, 0, 0,              /* NOTPOS{STAR,PLUS,QUERY,UPTO} */

  OP_NOTPOSSTARI, 0,       /* NOTSTARI, NOTMINSTARI */
  OP_NOTPOSPLUSI, 0,       /* NOTPLUSI, NOTMINPLUSI */
  OP_NOTPOSQUERYI, 0,      /* NOTQUERYI, NOTMINQUERYI */
  OP_NOTPOSUPTOI, 0,       /* NOTUPTOI, NOTMINUPTOI */
  0,                       /* NOTEXACTI */
  0, 0, 0, 0,              /* NOTPOS{STARI,PLUSI,QUERYI,UPTOI} */

  OP_TYPEPOSSTAR, 0,       /* TYPESTAR, TYPEMINSTAR */
  OP_TYPEPOSPLUS, 0,       /* TYPEPLUS, TYPEMINPLUS */
  OP_TYPEPOSQUERY, 0,      /* TYPEQUERY, TYPEMINQUERY */
  OP_TYPEPOSUPTO, 0,       /* TYPEUPTO, TYPEMINUPTO */
  0,                       /* TYPEEXACT */
  0, 0, 0, 0,              /* TYPEPOS{STAR,PLUS,QUERY,UPTO} */

  OP_CRPOSSTAR, 0,         /* CRSTAR, CRMINSTAR */
  OP_CRPOSPLUS, 0,         /* CRPLUS, CRMINPLUS */
  OP_CRPOSQUERY, 0,        /* CRQUERY, CRMINQUERY */
  OP_CRPOSRANGE, 0,        /* CRRANGE, CRMINRANGE */
  0, 0, 0, 0,              /* CRPOS{STAR,PLUS,QUERY,RANGE} */

  0, 0, 0,                 /* CLASS, NCLASS, XCLASS */
  0, 0,                    /* REF, REFI */
  0, 0,                    /* DNREF, DNREFI */
  0, 0                     /* RECURSE, CALLOUT */
};
907 
908 
909 #ifdef DEBUG_SHOW_PARSED
910 /*************************************************
911 *     Show the parsed pattern for debugging      *
912 *************************************************/
913 
914 /* For debugging the pre-scan, this code, which outputs the parsed data vector,
915 can be enabled. */
916 
/* Write the parsed pattern vector to stderr, one item per line. Each line
starts with the item's index in the vector and its raw value in hex, followed
by a readable form of the item and any data words that belong to it. Output
stops at META_END, or at the first META code that is not recognized.

The values printed are uint32_t (printed with unsigned conversions), PCRE2_SIZE
offsets (printed with %zu), and characters (converted to int for %c), so that
every conversion specifier agrees with the type of its argument.

Argument:
  cb          the compile block whose parsed_pattern is to be shown

Returns:      nothing
*/

static void show_parsed(compile_block *cb)
{
uint32_t *pptr = cb->parsed_pattern;

for (;;)
  {
  int max, min;
  PCRE2_SIZE offset;
  uint32_t i;
  uint32_t length;
  uint32_t meta_arg = META_DATA(*pptr);

  fprintf(stderr, "+++ %02d %.8x ", (int)(pptr - cb->parsed_pattern), *pptr);

  /* Values below META_END are literal characters; show the printable ones. */

  if (*pptr < META_END)
    {
    if (*pptr > 32 && *pptr < 128) fprintf(stderr, "%c", (int)*pptr);
    pptr++;
    }

  else switch (META_CODE(*pptr++))
    {
    default:
    fprintf(stderr, "**** OOPS - unknown META value - giving up ****\n");
    return;

    case META_END:
    fprintf(stderr, "META_END\n");
    return;

    case META_CAPTURE:
    fprintf(stderr, "META_CAPTURE %u", meta_arg);
    break;

    case META_RECURSE:
    GETOFFSET(offset, pptr);
    fprintf(stderr, "META_RECURSE %u %zu", meta_arg, offset);
    break;

    /* For group numbers below 10 the offset comes from the compile block;
    otherwise it follows in the vector. */

    case META_BACKREF:
    if (meta_arg < 10)
      offset = cb->small_ref_offset[meta_arg];
    else
      GETOFFSET(offset, pptr);
    fprintf(stderr, "META_BACKREF %u %zu", meta_arg, offset);
    break;

    case META_ESCAPE:
    if (meta_arg == ESC_P || meta_arg == ESC_p)
      {
      uint32_t ptype = *pptr >> 16;
      uint32_t pvalue = *pptr++ & 0xffff;
      fprintf(stderr, "META \\%c %u %u", (meta_arg == ESC_P)? 'P':'p',
        ptype, pvalue);
      }
    else
      {
      uint32_t cc;
      /* There's just one escape we might have here that isn't negated in the
      escapes table. */
      if (meta_arg == ESC_g) cc = CHAR_g;
      else for (cc = ESCAPES_FIRST; cc <= ESCAPES_LAST; cc++)
        {
        if (meta_arg == (uint32_t)(-escapes[cc - ESCAPES_FIRST])) break;
        }
      if (cc > ESCAPES_LAST) cc = CHAR_QUESTION_MARK;
      fprintf(stderr, "META \\%c", (int)cc);
      }
    break;

    case META_MINMAX:
    min = *pptr++;
    max = *pptr++;
    if (max != REPEAT_UNLIMITED)
      fprintf(stderr, "META {%d,%d}", min, max);
    else
      fprintf(stderr, "META {%d,}", min);
    break;

    case META_MINMAX_QUERY:
    min = *pptr++;
    max = *pptr++;
    if (max != REPEAT_UNLIMITED)
      fprintf(stderr, "META {%d,%d}?", min, max);
    else
      fprintf(stderr, "META {%d,}?", min);
    break;

    case META_MINMAX_PLUS:
    min = *pptr++;
    max = *pptr++;
    if (max != REPEAT_UNLIMITED)
      fprintf(stderr, "META {%d,%d}+", min, max);
    else
      fprintf(stderr, "META {%d,}+", min);
    break;

    case META_BIGVALUE: fprintf(stderr, "META_BIGVALUE %.8x", *pptr++); break;
    case META_CIRCUMFLEX: fprintf(stderr, "META_CIRCUMFLEX"); break;
    case META_COND_ASSERT: fprintf(stderr, "META_COND_ASSERT"); break;
    case META_DOLLAR: fprintf(stderr, "META_DOLLAR"); break;
    case META_DOT: fprintf(stderr, "META_DOT"); break;
    case META_ASTERISK: fprintf(stderr, "META *"); break;
    case META_ASTERISK_QUERY: fprintf(stderr, "META *?"); break;
    case META_ASTERISK_PLUS: fprintf(stderr, "META *+"); break;
    case META_PLUS: fprintf(stderr, "META +"); break;
    case META_PLUS_QUERY: fprintf(stderr, "META +?"); break;
    case META_PLUS_PLUS: fprintf(stderr, "META ++"); break;
    case META_QUERY: fprintf(stderr, "META ?"); break;
    case META_QUERY_QUERY: fprintf(stderr, "META ??"); break;
    case META_QUERY_PLUS: fprintf(stderr, "META ?+"); break;

    case META_ATOMIC: fprintf(stderr, "META (?>"); break;
    case META_NOCAPTURE: fprintf(stderr, "META (?:"); break;
    case META_LOOKAHEAD: fprintf(stderr, "META (?="); break;
    case META_LOOKAHEADNOT: fprintf(stderr, "META (?!"); break;
    case META_LOOKAHEAD_NA: fprintf(stderr, "META (*napla:"); break;
    case META_SCRIPT_RUN: fprintf(stderr, "META (*sr:"); break;
    case META_KET: fprintf(stderr, "META )"); break;
    case META_ALT: fprintf(stderr, "META | %u", meta_arg); break;

    case META_CLASS: fprintf(stderr, "META ["); break;
    case META_CLASS_NOT: fprintf(stderr, "META [^"); break;
    case META_CLASS_END: fprintf(stderr, "META ]"); break;
    case META_CLASS_EMPTY: fprintf(stderr, "META []"); break;
    case META_CLASS_EMPTY_NOT: fprintf(stderr, "META [^]"); break;

    case META_RANGE_LITERAL: fprintf(stderr, "META - (literal)"); break;
    case META_RANGE_ESCAPED: fprintf(stderr, "META - (escaped)"); break;

    case META_POSIX: fprintf(stderr, "META_POSIX %u", *pptr++); break;
    case META_POSIX_NEG: fprintf(stderr, "META_POSIX_NEG %u", *pptr++); break;

    case META_ACCEPT: fprintf(stderr, "META (*ACCEPT)"); break;
    case META_FAIL: fprintf(stderr, "META (*FAIL)"); break;
    case META_COMMIT: fprintf(stderr, "META (*COMMIT)"); break;
    case META_PRUNE: fprintf(stderr, "META (*PRUNE)"); break;
    case META_SKIP: fprintf(stderr, "META (*SKIP)"); break;
    case META_THEN: fprintf(stderr, "META (*THEN)"); break;

    case META_OPTIONS: fprintf(stderr, "META_OPTIONS 0x%02x", *pptr++); break;

    case META_LOOKBEHIND:
    fprintf(stderr, "META (?<= %u offset=", meta_arg);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    case META_LOOKBEHIND_NA:
    fprintf(stderr, "META (*naplb: %u offset=", meta_arg);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    case META_LOOKBEHINDNOT:
    fprintf(stderr, "META (?<! %u offset=", meta_arg);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    /* Three data words follow: the two "next" values, then the number. */

    case META_CALLOUT_NUMBER:
    fprintf(stderr, "META (?C%u) next=%u/%u", pptr[2], pptr[0],
       pptr[1]);
    pptr += 3;
    break;

    case META_CALLOUT_STRING:
      {
      uint32_t patoffset = *pptr++;    /* Offset of next pattern item */
      uint32_t patlength = *pptr++;    /* Length of next pattern item */
      fprintf(stderr, "META (?Cstring) length=%u offset=", *pptr++);
      GETOFFSET(offset, pptr);
      fprintf(stderr, "%zu next=%u/%u", offset, patoffset, patlength);
      }
    break;

    case META_RECURSE_BYNAME:
    fprintf(stderr, "META (?(&name) length=%u offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    case META_BACKREF_BYNAME:
    fprintf(stderr, "META_BACKREF_BYNAME length=%u offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    /* Here the offset comes first and the number is the word after it. */

    case META_COND_NUMBER:
    fprintf(stderr, "META_COND_NUMBER %u offset=", pptr[SIZEOFFSET]);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    pptr++;
    break;

    case META_COND_DEFINE:
    fprintf(stderr, "META (?(DEFINE) offset=");
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    case META_COND_VERSION:
    fprintf(stderr, "META (?(VERSION%s", (*pptr++ == 0)? "=" : ">=");
    fprintf(stderr, "%u.", *pptr++);
    fprintf(stderr, "%u)", *pptr++);
    break;

    case META_COND_NAME:
    fprintf(stderr, "META (?(<name>) length=%u offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    case META_COND_RNAME:
    fprintf(stderr, "META (?(R&name) length=%u offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    /* This is kept as a name, because it might be. */

    case META_COND_RNUMBER:
    fprintf(stderr, "META (?(Rnumber) length=%u offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    /* The verbs with arguments share the code that prints the argument: a
    length word followed by that many code points. */

    case META_MARK:
    fprintf(stderr, "META (*MARK:");
    goto SHOWARG;

    case META_COMMIT_ARG:
    fprintf(stderr, "META (*COMMIT:");
    goto SHOWARG;

    case META_PRUNE_ARG:
    fprintf(stderr, "META (*PRUNE:");
    goto SHOWARG;

    case META_SKIP_ARG:
    fprintf(stderr, "META (*SKIP:");
    goto SHOWARG;

    case META_THEN_ARG:
    fprintf(stderr, "META (*THEN:");
    SHOWARG:
    length = *pptr++;
    for (i = 0; i < length; i++)
      {
      uint32_t cc = *pptr++;
      if (cc > 32 && cc < 128) fprintf(stderr, "%c", (int)cc);
        else fprintf(stderr, "\\x{%x}", cc);
      }
    fprintf(stderr, ") length=%u", length);
    break;
    }
  fprintf(stderr, "\n");
  }

/* Control never gets here: the loop is left only by the returns above. */
}
1177 #endif  /* DEBUG_SHOW_PARSED */
1178 
1179 
1180 
1181 /*************************************************
1182 *               Copy compiled code               *
1183 *************************************************/
1184 
1185 /* Compiled JIT code cannot be copied, so the new compiled block has no
1186 associated JIT data. */
1187 
1188 PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
pcre2_code_copy(const pcre2_code * code)1189 pcre2_code_copy(const pcre2_code *code)
1190 {
1191 PCRE2_SIZE* ref_count;
1192 pcre2_code *newcode;
1193 
1194 if (code == NULL) return NULL;
1195 newcode = code->memctl.malloc(code->blocksize, code->memctl.memory_data);
1196 if (newcode == NULL) return NULL;
1197 memcpy(newcode, code, code->blocksize);
1198 newcode->executable_jit = NULL;
1199 
1200 /* If the code is one that has been deserialized, increment the reference count
1201 in the decoded tables. */
1202 
1203 if ((code->flags & PCRE2_DEREF_TABLES) != 0)
1204   {
1205   ref_count = (PCRE2_SIZE *)(code->tables + TABLES_LENGTH);
1206   (*ref_count)++;
1207   }
1208 
1209 return newcode;
1210 }
1211 
1212 
1213 
1214 /*************************************************
1215 *     Copy compiled code and character tables    *
1216 *************************************************/
1217 
1218 /* Compiled JIT code cannot be copied, so the new compiled block has no
1219 associated JIT data. This version of code_copy also makes a separate copy of
1220 the character tables. */
1221 
1222 PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
pcre2_code_copy_with_tables(const pcre2_code * code)1223 pcre2_code_copy_with_tables(const pcre2_code *code)
1224 {
1225 PCRE2_SIZE* ref_count;
1226 pcre2_code *newcode;
1227 uint8_t *newtables;
1228 
1229 if (code == NULL) return NULL;
1230 newcode = code->memctl.malloc(code->blocksize, code->memctl.memory_data);
1231 if (newcode == NULL) return NULL;
1232 memcpy(newcode, code, code->blocksize);
1233 newcode->executable_jit = NULL;
1234 
1235 newtables = code->memctl.malloc(TABLES_LENGTH + sizeof(PCRE2_SIZE),
1236   code->memctl.memory_data);
1237 if (newtables == NULL)
1238   {
1239   code->memctl.free((void *)newcode, code->memctl.memory_data);
1240   return NULL;
1241   }
1242 memcpy(newtables, code->tables, TABLES_LENGTH);
1243 ref_count = (PCRE2_SIZE *)(newtables + TABLES_LENGTH);
1244 *ref_count = 1;
1245 
1246 newcode->tables = newtables;
1247 newcode->flags |= PCRE2_DEREF_TABLES;
1248 return newcode;
1249 }
1250 
1251 
1252 
1253 /*************************************************
1254 *               Free compiled code               *
1255 *************************************************/
1256 
1257 PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION
pcre2_code_free(pcre2_code * code)1258 pcre2_code_free(pcre2_code *code)
1259 {
1260 PCRE2_SIZE* ref_count;
1261 
1262 if (code != NULL)
1263   {
1264   if (code->executable_jit != NULL)
1265     PRIV(jit_free)(code->executable_jit, &code->memctl);
1266 
1267   if ((code->flags & PCRE2_DEREF_TABLES) != 0)
1268     {
1269     /* Decoded tables belong to the codes after deserialization, and they must
1270     be freed when there are no more references to them. The *ref_count should
1271     always be > 0. */
1272 
1273     ref_count = (PCRE2_SIZE *)(code->tables + TABLES_LENGTH);
1274     if (*ref_count > 0)
1275       {
1276       (*ref_count)--;
1277       if (*ref_count == 0)
1278         code->memctl.free((void *)code->tables, code->memctl.memory_data);
1279       }
1280     }
1281 
1282   code->memctl.free(code, code->memctl.memory_data);
1283   }
1284 }
1285 
1286 
1287 
1288 /*************************************************
1289 *         Read a number, possibly signed         *
1290 *************************************************/
1291 
1292 /* This function is used to read numbers in the pattern. The initial pointer
1293 must be the sign or first digit of the number. When relative values (introduced
1294 by + or -) are allowed, they are relative group numbers, and the result must be
1295 greater than zero.
1296 
1297 Arguments:
1298   ptrptr      points to the character pointer variable
1299   ptrend      points to the end of the input string
1300   allow_sign  if < 0, sign not allowed; if >= 0, sign is relative to this
1301   max_value   the largest number allowed
1302   max_error   the error to give for an over-large number
1303   intptr      where to put the result
  errorcodeptr where to put an error code
1305 
1306 Returns:      TRUE  - a number was read
1307               FALSE - errorcode == 0 => no number was found
1308                       errorcode != 0 => an error occurred
1309 */
1310 
1311 static BOOL
read_number(PCRE2_SPTR * ptrptr,PCRE2_SPTR ptrend,int32_t allow_sign,uint32_t max_value,uint32_t max_error,int * intptr,int * errorcodeptr)1312 read_number(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, int32_t allow_sign,
1313   uint32_t max_value, uint32_t max_error, int *intptr, int *errorcodeptr)
1314 {
1315 int sign = 0;
1316 uint32_t n = 0;
1317 PCRE2_SPTR ptr = *ptrptr;
1318 BOOL yield = FALSE;
1319 
1320 *errorcodeptr = 0;
1321 
1322 if (allow_sign >= 0 && ptr < ptrend)
1323   {
1324   if (*ptr == CHAR_PLUS)
1325     {
1326     sign = +1;
1327     max_value -= allow_sign;
1328     ptr++;
1329     }
1330   else if (*ptr == CHAR_MINUS)
1331     {
1332     sign = -1;
1333     ptr++;
1334     }
1335   }
1336 
1337 if (ptr >= ptrend || !IS_DIGIT(*ptr)) return FALSE;
1338 while (ptr < ptrend && IS_DIGIT(*ptr))
1339   {
1340   n = n * 10 + *ptr++ - CHAR_0;
1341   if (n > max_value)
1342     {
1343     *errorcodeptr = max_error;
1344     goto EXIT;
1345     }
1346   }
1347 
1348 if (allow_sign >= 0 && sign != 0)
1349   {
1350   if (n == 0)
1351     {
1352     *errorcodeptr = ERR26;  /* +0 and -0 are not allowed */
1353     goto EXIT;
1354     }
1355 
1356   if (sign > 0) n += allow_sign;
1357   else if ((int)n > allow_sign)
1358     {
1359     *errorcodeptr = ERR15;  /* Non-existent subpattern */
1360     goto EXIT;
1361     }
1362   else n = allow_sign + 1 - n;
1363   }
1364 
1365 yield = TRUE;
1366 
1367 EXIT:
1368 *intptr = n;
1369 *ptrptr = ptr;
1370 return yield;
1371 }
1372 
1373 
1374 
1375 /*************************************************
1376 *         Read repeat counts                     *
1377 *************************************************/
1378 
1379 /* Read an item of the form {n,m} and return the values if non-NULL pointers
1380 are supplied. Repeat counts must be less than 65536 (MAX_REPEAT_COUNT); a
1381 larger value is used for "unlimited". We have to use signed arguments for
1382 read_number() because it is capable of returning a signed value.
1383 
1384 Arguments:
1385   ptrptr         points to pointer to character after'{'
1386   ptrend         pointer to end of input
  minp           if not NULL, pointer to uint32_t for min
  maxp           if not NULL, pointer to uint32_t for max; set to
                 REPEAT_UNLIMITED if no max is given
1390   errorcodeptr   points to error code variable
1391 
1392 Returns:         FALSE if not a repeat quantifier, errorcode set zero
1393                  FALSE on error, with errorcode set non-zero
1394                  TRUE on success, with pointer updated to point after '}'
1395 */
1396 
1397 static BOOL
read_repeat_counts(PCRE2_SPTR * ptrptr,PCRE2_SPTR ptrend,uint32_t * minp,uint32_t * maxp,int * errorcodeptr)1398 read_repeat_counts(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *minp,
1399   uint32_t *maxp, int *errorcodeptr)
1400 {
1401 PCRE2_SPTR p = *ptrptr;
1402 BOOL yield = FALSE;
1403 int32_t min = 0;
1404 int32_t max = REPEAT_UNLIMITED; /* This value is larger than MAX_REPEAT_COUNT */
1405 
1406 /* NB read_number() initializes the error code to zero. The only error is for a
1407 number that is too big. */
1408 
1409 if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &min, errorcodeptr))
1410   goto EXIT;
1411 
1412 if (p >= ptrend) goto EXIT;
1413 
1414 if (*p == CHAR_RIGHT_CURLY_BRACKET)
1415   {
1416   p++;
1417   max = min;
1418   }
1419 
1420 else
1421   {
1422   if (*p++ != CHAR_COMMA || p >= ptrend) goto EXIT;
1423   if (*p != CHAR_RIGHT_CURLY_BRACKET)
1424     {
1425     if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &max,
1426         errorcodeptr) || p >= ptrend ||  *p != CHAR_RIGHT_CURLY_BRACKET)
1427       goto EXIT;
1428     if (max < min)
1429       {
1430       *errorcodeptr = ERR4;
1431       goto EXIT;
1432       }
1433     }
1434   p++;
1435   }
1436 
1437 yield = TRUE;
1438 if (minp != NULL) *minp = (uint32_t)min;
1439 if (maxp != NULL) *maxp = (uint32_t)max;
1440 
1441 /* Update the pattern pointer on success, or after an error, but not when
1442 the result is "not a repeat quantifier". */
1443 
1444 EXIT:
1445 if (yield || *errorcodeptr != 0) *ptrptr = p;
1446 return yield;
1447 }
1448 
1449 
1450 
1451 /*************************************************
1452 *            Handle escapes                      *
1453 *************************************************/
1454 
1455 /* This function is called when a \ has been encountered. It either returns a
1456 positive value for a simple escape such as \d, or 0 for a data character, which
1457 is placed in chptr. A backreference to group n is returned as negative n. On
1458 entry, ptr is pointing at the character after \. On exit, it points after the
1459 final code unit of the escape sequence.
1460 
1461 This function is also called from pcre2_substitute() to handle escape sequences
1462 in replacement strings. In this case, the cb argument is NULL, and in the case
1463 of escapes that have further processing, only sequences that define a data
1464 character are recognised. The isclass argument is not relevant; the options
1465 argument is the final value of the compiled pattern's options.
1466 
1467 Arguments:
1468   ptrptr         points to the input position pointer
1469   ptrend         points to the end of the input
1470   chptr          points to a returned data character
1471   errorcodeptr   points to the errorcode variable (containing zero)
  options        the current options bits
  extra_options  the extra options bits (e.g. PCRE2_EXTRA_ESCAPED_CR_IS_LF)
1473   isclass        TRUE if inside a character class
1474   cb             compile data block or NULL when called from pcre2_substitute()
1475 
1476 Returns:         zero => a data character
1477                  positive => a special escape sequence
1478                  negative => a numerical back reference
1479                  on error, errorcodeptr is set non-zero
1480 */
1481 
1482 int
PRIV(check_escape)1483 PRIV(check_escape)(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *chptr,
1484   int *errorcodeptr, uint32_t options, uint32_t extra_options, BOOL isclass,
1485   compile_block *cb)
1486 {
/* Overview: decode the escape whose first character is at *ptrptr. There are
three paths: a character outside the ESCAPES_FIRST..ESCAPES_LAST range is a
literal; a non-zero entry in the escapes[] table is returned more or less
directly (with extra checks for \N); a zero entry is decoded by the switch
further down. All paths except the two early "return 0" statements finish at
the common exit, which writes back *ptrptr and *chptr. */
1487 BOOL utf = (options & PCRE2_UTF) != 0;
1488 PCRE2_SPTR ptr = *ptrptr;
1489 uint32_t c, cc;
1490 int escape = 0;
1491 int i;
1492 
1493 /* If backslash is at the end of the string, it's an error. */
1494 
/* Note that neither *ptrptr nor *chptr is written on this early return. */
1495 if (ptr >= ptrend)
1496   {
1497   *errorcodeptr = ERR1;
1498   return 0;
1499   }
1500 
1501 GETCHARINCTEST(c, ptr);         /* Get character value, increment pointer */
1502 *errorcodeptr = 0;              /* Be optimistic */
1503 
1504 /* Non-alphanumerics are literals, so we just leave the value in c. An initial
1505 value test saves a memory lookup for code points outside the alphanumeric
1506 range. */
1507 
1508 if (c < ESCAPES_FIRST || c > ESCAPES_LAST) {}  /* Definitely literal */
1509 
1510 /* Otherwise, do a table lookup. Non-zero values need little processing here. A
1511 positive value is a literal value for something like \n. A negative value is
1512 the negation of one of the ESC_ macros that is passed back for handling by the
1513 calling function. Some extra checking is needed for \N because only \N{U+dddd}
1514 is supported. If the value is zero, further processing is handled below. */
1515 
1516 else if ((i = escapes[c - ESCAPES_FIRST]) != 0)
1517   {
1518   if (i > 0)
1519     {
1520     c = (uint32_t)i;
    /* Optionally map an escaped CR to LF, as requested via extra_options. */
1521     if (c == CHAR_CR && (extra_options & PCRE2_EXTRA_ESCAPED_CR_IS_LF) != 0)
1522       c = CHAR_LF;
1523     }
1524   else  /* Negative table entry */
1525     {
1526     escape = -i;                    /* Else return a special escape */
1527     if (cb != NULL && (escape == ESC_P || escape == ESC_p || escape == ESC_X))
1528       cb->external_flags |= PCRE2_HASBKPORX;   /* Note \P, \p, or \X */
1529 
1530     /* Perl supports \N{name} for character names and \N{U+dddd} for numerical
1531     Unicode code points, as well as plain \N for "not newline". PCRE does not
1532     support \N{name}. However, it does support quantification such as \N{2,3},
1533     so if \N{ is not followed by U+dddd we check for a quantifier. */
1534 
1535     if (escape == ESC_N && ptr < ptrend && *ptr == CHAR_LEFT_CURLY_BRACKET)
1536       {
1537       PCRE2_SPTR p = ptr + 1;
1538 
1539       /* \N{U+ can be handled by the \x{ code. However, this construction is
1540       not valid in EBCDIC environments because it specifies a Unicode
1541       character, not a codepoint in the local code. For example \N{U+0041}
1542       must be "A" in all environments. Also, in Perl, \N{U+ forces Unicode
1543       casing semantics for the entire pattern, so allow it only in UTF (i.e.
1544       Unicode) mode. */
1545 
1546       if (ptrend - p > 1 && *p == CHAR_U && p[1] == CHAR_PLUS)
1547         {
1548 #ifdef EBCDIC
1549         *errorcodeptr = ERR93;
1550 #else
1551         if (utf)
1552           {
          /* p points at 'U', so p + 1 is the '+'. The code at COME_FROM_NU
          begins with a pre-increment of ptr, which steps onto the first
          character after the '+'. */
1553           ptr = p + 1;
1554           escape = 0;   /* Not a fancy escape after all */
1555           goto COME_FROM_NU;
1556           }
1557         else *errorcodeptr = ERR93;
1558 #endif
1559         }
1560 
1561       /* Give an error if what follows is not a quantifier, but don't override
1562       an error set by the quantifier reader (e.g. number overflow). */
1563 
      /* The reader is given the local copy p, so ptr itself is not advanced
      here and still points at the '{'. The two NULL arguments are presumably
      the unused minimum/maximum outputs - confirm against
      read_repeat_counts(). */
1564       else
1565         {
1566         if (!read_repeat_counts(&p, ptrend, NULL, NULL, errorcodeptr) &&
1567              *errorcodeptr == 0)
1568           *errorcodeptr = ERR37;
1569         }
1570       }
1571     }
1572   }
1573 
1574 /* Escapes that need further processing, including those that are unknown, have
1575 a zero entry in the lookup table. When called from pcre2_substitute(), only \c,
1576 \o, and \x are recognized (\u and \U can never appear as they are used for case
1577 forcing). */
1578 
1579 else
1580   {
1581   int s;
1582   PCRE2_SPTR oldptr;
1583   BOOL overflow;
1584   BOOL alt_bsux =
1585     ((options & PCRE2_ALT_BSUX) | (extra_options & PCRE2_EXTRA_ALT_BSUX)) != 0;
1586 
1587   /* Filter calls from pcre2_substitute(). */
1588 
  /* Because everything except \c, \o and \x is rejected here when cb is NULL,
  the cases below that dereference cb (\g and the digit cases) are reached only
  with a non-NULL cb. This early return does not write *ptrptr or *chptr. */
1589   if (cb == NULL)
1590     {
1591     if (c != CHAR_c && c != CHAR_o && c != CHAR_x)
1592       {
1593       *errorcodeptr = ERR3;
1594       return 0;
1595       }
1596     alt_bsux = FALSE;   /* Do not modify \x handling */
1597     }
1598 
1599   switch (c)
1600     {
1601     /* A number of Perl escapes are not handled by PCRE. We give an explicit
1602     error. */
1603 
1604     case CHAR_F:
1605     case CHAR_l:
1606     case CHAR_L:
1607     *errorcodeptr = ERR37;
1608     break;
1609 
1610     /* \u is unrecognized when neither PCRE2_ALT_BSUX nor PCRE2_EXTRA_ALT_BSUX
1611     is set. Otherwise, \u must be followed by exactly four hex digits or, if
1612     PCRE2_EXTRA_ALT_BSUX is set, by any number of hex digits in braces.
1613     Otherwise it is a lowercase u letter. This gives some compatibility with
1614     ECMAScript (aka JavaScript). */
1615 
    /* Every "break" below that is taken before c is reassigned leaves c holding
    the letter u and escape at zero, so the result is a literal 'u' with ptr
    still just after it. */
1616     case CHAR_u:
1617     if (!alt_bsux) *errorcodeptr = ERR37; else
1618       {
1619       uint32_t xc;
1620 
1621       if (ptr >= ptrend) break;
1622       if (*ptr == CHAR_LEFT_CURLY_BRACKET &&
1623           (extra_options & PCRE2_EXTRA_ALT_BSUX) != 0)
1624         {
1625         PCRE2_SPTR hptr = ptr + 1;
1626         cc = 0;
1627 
        /* Accumulate hex digits in cc using the scratch pointer hptr; ptr is
        moved only once the whole braced sequence has been accepted (or to
        mark the position of an overflow). */
1628         while (hptr < ptrend && (xc = XDIGIT(*hptr)) != 0xff)
1629           {
1630           if ((cc & 0xf0000000) != 0)  /* Test for 32-bit overflow */
1631             {
1632             *errorcodeptr = ERR77;
1633             ptr = hptr;   /* Show where */
1634             break;        /* *hptr != } will cause another break below */
1635             }
1636           cc = (cc << 4) | xc;
1637           hptr++;
1638           }
1639 
1640         if (hptr == ptr + 1 ||   /* No hex digits */
1641             hptr >= ptrend ||    /* Hit end of input */
1642             *hptr != CHAR_RIGHT_CURLY_BRACKET)  /* No } terminator */
1643           break;         /* Hex escape not recognized */
1644 
1645         c = cc;          /* Accept the code point */
1646         ptr = hptr + 1;
1647         }
1648 
1649       else  /* Must be exactly 4 hex digits */
1650         {
        /* c and ptr are updated only after all four digits have validated. */
1651         if (ptrend - ptr < 4) break;               /* Less than 4 chars */
1652         if ((cc = XDIGIT(ptr[0])) == 0xff) break;  /* Not a hex digit */
1653         if ((xc = XDIGIT(ptr[1])) == 0xff) break;  /* Not a hex digit */
1654         cc = (cc << 4) | xc;
1655         if ((xc = XDIGIT(ptr[2])) == 0xff) break;  /* Not a hex digit */
1656         cc = (cc << 4) | xc;
1657         if ((xc = XDIGIT(ptr[3])) == 0xff) break;  /* Not a hex digit */
1658         c = (cc << 4) | xc;
1659         ptr += 4;
1660         }
1661 
      /* Range checks on the accepted code point: in UTF mode it must not
      exceed 0x10ffff and must not be a surrogate unless surrogate escapes are
      allowed; otherwise it must not exceed MAX_NON_UTF_CHAR. */
1662       if (utf)
1663         {
1664         if (c > 0x10ffffU) *errorcodeptr = ERR77;
1665         else
1666           if (c >= 0xd800 && c <= 0xdfff &&
1667               (extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
1668                 *errorcodeptr = ERR73;
1669         }
1670       else if (c > MAX_NON_UTF_CHAR) *errorcodeptr = ERR77;
1671       }
1672     break;
1673 
1674     /* \U is unrecognized unless PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX is set,
1675     in which case it is an upper case letter. */
1676 
1677     case CHAR_U:
1678     if (!alt_bsux) *errorcodeptr = ERR37;
1679     break;
1680 
1681     /* In a character class, \g is just a literal "g". Outside a character
1682     class, \g must be followed by one of a number of specific things:
1683 
1684     (1) A number, either plain or braced. If positive, it is an absolute
1685     backreference. If negative, it is a relative backreference. This is a Perl
1686     5.10 feature.
1687 
1688     (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
1689     is part of Perl's movement towards a unified syntax for back references. As
1690     this is synonymous with \k{name}, we fudge it up by pretending it really
1691     was \k{name}.
1692 
1693     (3) For Oniguruma compatibility we also support \g followed by a name or a
1694     number either in angle brackets or in single quotes. However, these are
1695     (possibly recursive) subroutine calls, _not_ backreferences. We return
1696     the ESC_g code.
1697 
1698     Summary: Return a negative number for a numerical back reference, ESC_k for
1699     a named back reference, and ESC_g for a named or numbered subroutine call.
1700     */
1701 
1702     case CHAR_g:
1703     if (isclass) break;
1704 
1705     if (ptr >= ptrend)
1706       {
1707       *errorcodeptr = ERR57;
1708       break;
1709       }
1710 
    /* For the angle-bracket and quote forms ptr is not advanced: it is left
    on the delimiter when ESC_g is returned. */
1711     if (*ptr == CHAR_LESS_THAN_SIGN || *ptr == CHAR_APOSTROPHE)
1712       {
1713       escape = ESC_g;
1714       break;
1715       }
1716 
1717     /* If there is a brace delimiter, try to read a numerical reference. If
1718     there isn't one, assume we have a name and treat it as \k. */
1719 
    /* read_number() is defined elsewhere; presumably it stores the value it
    reads in s and returns FALSE when no number is present - confirm. In the
    braced form the local copy p is used, so when ESC_k is returned ptr still
    points at the '{'. */
1720     if (*ptr == CHAR_LEFT_CURLY_BRACKET)
1721       {
1722       PCRE2_SPTR p = ptr + 1;
1723       if (!read_number(&p, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &s,
1724           errorcodeptr))
1725         {
1726         if (*errorcodeptr == 0) escape = ESC_k;  /* No number found */
1727         break;
1728         }
1729       if (p >= ptrend || *p != CHAR_RIGHT_CURLY_BRACKET)
1730         {
1731         *errorcodeptr = ERR57;
1732         break;
1733         }
1734       ptr = p + 1;
1735       }
1736 
1737     /* Read an undelimited number */
1738 
1739     else
1740       {
1741       if (!read_number(&ptr, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &s,
1742           errorcodeptr))
1743         {
1744         if (*errorcodeptr == 0) *errorcodeptr = ERR57;  /* No number found */
1745         break;
1746         }
1747       }
1748 
    /* A group number that is zero or negative at this point is an error. */
1749     if (s <= 0)
1750       {
1751       *errorcodeptr = ERR15;
1752       break;
1753       }
1754 
1755     escape = -s;
1756     break;
1757 
1758     /* The handling of escape sequences consisting of a string of digits
1759     starting with one that is not zero is not straightforward. Perl has changed
1760     over the years. Nowadays \g{} for backreferences and \o{} for octal are
1761     recommended to avoid the ambiguities in the old syntax.
1762 
1763     Outside a character class, the digits are read as a decimal number. If the
1764     number is less than 10, or if there are that many previous extracting left
1765     brackets, it is a back reference. Otherwise, up to three octal digits are
1766     read to form an escaped character code. Thus \123 is likely to be octal 123
1767     (cf \0123, which is octal 012 followed by the literal 3).
1768 
1769     Inside a character class, \ followed by a digit is always either a literal
1770     8 or 9 or an octal number. */
1771 
1772     case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
1773     case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
1774 
1775     if (!isclass)
1776       {
      /* oldptr remembers the position just after the first digit, so that
      oldptr[-1] is that digit and ptr can be restored if the number turns out
      not to be a back reference. */
1777       oldptr = ptr;
1778       ptr--;   /* Back to the digit */
1779       if (!read_number(&ptr, ptrend, -1, INT_MAX/10 - 1, ERR61, &s,
1780           errorcodeptr))
1781         break;
1782 
1783       /* \1 to \9 are always back references. \8x and \9x are too; \1x to \7x
1784       are octal escapes if there are not that many previous captures. */
1785 
1786       if (s < 10 || oldptr[-1] >= CHAR_8 || s <= (int)cb->bracount)
1787         {
1788         if (s > (int)MAX_GROUP_NUMBER) *errorcodeptr = ERR61;
1789           else escape = -s;     /* Indicates a back reference */
1790         break;
1791         }
1792       ptr = oldptr;      /* Put the pointer back and fall through */
1793       }
1794 
1795     /* Handle a digit following \ when the number is not a back reference, or
1796     we are within a character class. If the first digit is 8 or 9, Perl used to
1797     generate a binary zero and then treat the digit as a following literal. At
1798     least by Perl 5.18 this changed so as not to insert the binary zero. */
1799 
1800     if (c >= CHAR_8) break;
1801 
1802     /* Fall through */
1803 
1804     /* \0 always starts an octal number, but we may drop through to here with a
1805     larger first octal digit. The original code used just to take the least
1806     significant 8 bits of octal numbers (I think this is what early Perls used
1807     to do). Nowadays we allow for larger numbers in UTF-8 mode and 16-bit mode,
1808     but no more than 3 octal digits. */
1809 
    /* The variable i is known to be zero here: this switch is reached only
    when the escapes[] lookup above stored zero in it. The loop therefore reads
    at most two further octal digits after the one already in c. */
1810     case CHAR_0:
1811     c -= CHAR_0;
1812     while(i++ < 2 && ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7)
1813         c = c * 8 + *ptr++ - CHAR_0;
1814 #if PCRE2_CODE_UNIT_WIDTH == 8
1815     if (!utf && c > 0xff) *errorcodeptr = ERR51;
1816 #endif
1817     break;
1818 
1819     /* \o is a relatively new Perl feature, supporting a more general way of
1820     specifying character codes in octal. The only supported form is \o{ddd}. */
1821 
    /* In the first test below, when ptr >= ptrend the post-increment is skipped
    by short-circuit evaluation, yet the ptr-- in the error branch still runs,
    so ptr then ends up one code unit before ptrend. The same applies to the
    final "else" branch that sets ERR64. */
1822     case CHAR_o:
1823     if (ptr >= ptrend || *ptr++ != CHAR_LEFT_CURLY_BRACKET)
1824       {
1825       ptr--;
1826       *errorcodeptr = ERR55;
1827       }
1828     else if (ptr >= ptrend || *ptr == CHAR_RIGHT_CURLY_BRACKET)
1829       *errorcodeptr = ERR78;
1830     else
1831       {
1832       c = 0;
1833       overflow = FALSE;
1834       while (ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7)
1835         {
1836         cc = *ptr++;
1837         if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
        /* In 32-bit mode, check before shifting that the top three bits are
        clear, so that the shift below cannot lose bits. */
1838 #if PCRE2_CODE_UNIT_WIDTH == 32
1839         if (c >= 0x20000000l) { overflow = TRUE; break; }
1840 #endif
1841         c = (c << 3) + (cc - CHAR_0);
1842 #if PCRE2_CODE_UNIT_WIDTH == 8
1843         if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1844 #elif PCRE2_CODE_UNIT_WIDTH == 16
1845         if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1846 #elif PCRE2_CODE_UNIT_WIDTH == 32
1847         if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
1848 #endif
1849         }
      /* On overflow, skip the remaining octal digits before setting the
      error, so that ptr is left after the run of digits. */
1850       if (overflow)
1851         {
1852         while (ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7) ptr++;
1853         *errorcodeptr = ERR34;
1854         }
1855       else if (ptr < ptrend && *ptr++ == CHAR_RIGHT_CURLY_BRACKET)
1856         {
1857         if (utf && c >= 0xd800 && c <= 0xdfff &&
1858             (extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
1859           {
1860           ptr--;
1861           *errorcodeptr = ERR73;
1862           }
1863         }
1864       else
1865         {
1866         ptr--;
1867         *errorcodeptr = ERR64;
1868         }
1869       }
1870     break;
1871 
1872     /* When PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX is set, \x must be followed
1873     by two hexadecimal digits. Otherwise it is a lowercase x letter. */
1874 
1875     case CHAR_x:
1876     if (alt_bsux)
1877       {
      /* Each "break" here leaves c holding the letter x and ptr unchanged. */
1878       uint32_t xc;
1879       if (ptrend - ptr < 2) break;               /* Less than 2 characters */
1880       if ((cc = XDIGIT(ptr[0])) == 0xff) break;  /* Not a hex digit */
1881       if ((xc = XDIGIT(ptr[1])) == 0xff) break;  /* Not a hex digit */
1882       c = (cc << 4) | xc;
1883       ptr += 2;
1884       }
1885 
1886     /* Handle \x in Perl's style. \x{ddd} is a character code which can be
1887     greater than 0xff in UTF-8 or non-8bit mode, but only if the ddd are hex
1888     digits. If not, { used to be treated as a data character. However, Perl
1889     seems to read hex digits up to the first non-such, and ignore the rest, so
1890     that, for example \x{zz} matches a binary zero. This seems crazy, so PCRE
1891     now gives an error. */
1892 
1893     else
1894       {
1895       if (ptr < ptrend && *ptr == CHAR_LEFT_CURLY_BRACKET)
1896         {
        /* The \N{U+ code above jumps to this label with ptr on the '+', which
        takes the place of the '{' that ptr is on when arriving via \x{. The
        label is compiled only when EBCDIC is not defined, matching the goto. */
1897 #ifndef EBCDIC
1898         COME_FROM_NU:
1899 #endif
1900         if (++ptr >= ptrend || *ptr == CHAR_RIGHT_CURLY_BRACKET)
1901           {
1902           *errorcodeptr = ERR78;
1903           break;
1904           }
1905         c = 0;
1906         overflow = FALSE;
1907 
1908         while (ptr < ptrend && (cc = XDIGIT(*ptr)) != 0xff)
1909           {
1910           ptr++;
1911           if (c == 0 && cc == 0) continue;   /* Leading zeroes */
          /* In 32-bit mode, check before shifting that the top four bits are
          clear, so that the shift below cannot lose bits. */
1912 #if PCRE2_CODE_UNIT_WIDTH == 32
1913           if (c >= 0x10000000l) { overflow = TRUE; break; }
1914 #endif
1915           c = (c << 4) | cc;
1916           if ((utf && c > 0x10ffffU) || (!utf && c > MAX_NON_UTF_CHAR))
1917             {
1918             overflow = TRUE;
1919             break;
1920             }
1921           }
1922 
        /* On overflow, skip the remaining hex digits before setting the
        error, so that ptr is left after the run of digits. */
1923         if (overflow)
1924           {
1925           while (ptr < ptrend && XDIGIT(*ptr) != 0xff) ptr++;
1926           *errorcodeptr = ERR34;
1927           }
1928         else if (ptr < ptrend && *ptr++ == CHAR_RIGHT_CURLY_BRACKET)
1929           {
1930           if (utf && c >= 0xd800 && c <= 0xdfff &&
1931               (extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
1932             {
1933             ptr--;
1934             *errorcodeptr = ERR73;
1935             }
1936           }
1937 
1938         /* If the sequence of hex digits does not end with '}', give an error.
1939         We used just to recognize this construct and fall through to the normal
1940         \x handling, but nowadays Perl gives an error, which seems much more
1941         sensible, so we do too. */
1942 
1943         else
1944           {
1945           ptr--;
1946           *errorcodeptr = ERR67;
1947           }
1948         }   /* End of \x{} processing */
1949 
1950       /* Read up to two hex digits after \x. Because c is zeroed first, \x
      followed by no hex digit at all yields the value zero. */
1951 
1952       else
1953         {
1954         c = 0;
1955         if (ptr >= ptrend || (cc = XDIGIT(*ptr)) == 0xff) break;  /* Not a hex digit */
1956         ptr++;
1957         c = cc;
1958         if (ptr >= ptrend || (cc = XDIGIT(*ptr)) == 0xff) break;  /* Not a hex digit */
1959         ptr++;
1960         c = (c << 4) | cc;
1961         }     /* End of \xdd handling */
1962       }       /* End of Perl-style \x handling */
1963     break;
1964 
1965     /* The handling of \c is different in ASCII and EBCDIC environments. In an
1966     ASCII (or Unicode) environment, an error is given if the character
1967     following \c is not a printable ASCII character. Otherwise, the following
1968     character is upper-cased if it is a letter, and after that the 0x40 bit is
1969     flipped. The result is the value of the escape.
1970 
1971     In an EBCDIC environment the handling of \c is compatible with the
1972     specification in the perlebcdic document. The following character must be
1973     a letter or one of small number of special characters. These provide a
1974     means of defining the character values 0-31.
1975 
1976     For testing the EBCDIC handling of \c in an ASCII environment, recognize
1977     the EBCDIC value of 'c' explicitly. */
1978 
1979 #if defined EBCDIC && 'a' != 0x81
1980     case 0x83:
1981 #else
1982     case CHAR_c:
1983 #endif
1984     if (ptr >= ptrend)
1985       {
1986       *errorcodeptr = ERR2;
1987       break;
1988       }
    /* Only a single code unit is read here (no UTF decoding); ptr is advanced
    past it by the ptr++ at the end of this case. */
1989     c = *ptr;
1990     if (c >= CHAR_a && c <= CHAR_z) c = UPPER_CASE(c);
1991 
1992     /* Handle \c in an ASCII/Unicode environment. */
1993 
1994 #ifndef EBCDIC    /* ASCII/UTF-8 coding */
1995     if (c < 32 || c > 126)  /* Excludes all non-printable ASCII */
1996       {
1997       *errorcodeptr = ERR68;
1998       break;
1999       }
2000     c ^= 0x40;
2001 
2002     /* Handle \c in an EBCDIC environment. The special case \c? is converted to
2003     255 (0xff) or 95 (0x5f) if other characters suggest we are using the
2004     POSIX-BC encoding. (This is the way Perl indicates that it handles \c?.)
2005     The other valid sequences correspond to a list of specific characters. */
2006 
2007 #else
2008     if (c == CHAR_QUESTION_MARK)
2009       c = ('\\' == 188 && '`' == 74)? 0x5f : 0xff;
2010     else
2011       {
      /* The index of the matching entry in ebcdic_escape_c[] becomes the
      resulting character value (0-31). */
2012       for (i = 0; i < 32; i++)
2013         {
2014         if (c == ebcdic_escape_c[i]) break;
2015         }
2016       if (i < 32) c = i; else *errorcodeptr = ERR68;
2017       }
2018 #endif  /* EBCDIC */
2019 
2020     ptr++;
2021     break;
2022 
2023     /* Any other alphanumeric following \ is an error. Perl gives an error only
2024     if in warning mode, but PCRE doesn't have a warning mode. */
2025 
    /* This path bypasses the common exit: *ptrptr is set here and *chptr is
    not written. */
2026     default:
2027     *errorcodeptr = ERR3;
2028     *ptrptr = ptr - 1;     /* Point to the character at fault */
2029     return 0;
2030     }
2031   }
2032 
2033 /* Set the pointer to the next character before returning. */
2034 
/* Common exit, also taken on the error paths that "break" out of the switch:
the caller must inspect *errorcodeptr, because escape may still be zero. */
2035 *ptrptr = ptr;
2036 *chptr = c;
2037 return escape;
2038 }
2039 
2040 
2041 
2042 #ifdef SUPPORT_UNICODE
2043 /*************************************************
2044 *               Handle \P and \p                 *
2045 *************************************************/
2046 
2047 /* This function is called after \P or \p has been encountered, provided that
2048 PCRE2 is compiled with support for UTF and Unicode properties. On entry, the
2049 contents of ptrptr are pointing after the P or p. On exit, it is left pointing
2050 after the final code unit of the escape sequence.
2051 
2052 Arguments:
2053   ptrptr         the pattern position pointer
2054   negptr         a boolean that is set TRUE for negation else FALSE
2055   ptypeptr       an unsigned int that is set to the type value
2056   pdataptr       an unsigned int that is set to the detailed property value
2057   errorcodeptr   the error code variable
2058   cb             the compile data
2059 
2060 Returns:         TRUE if the type value was found, or FALSE for an invalid type
2061 */
2062 
2063 static BOOL
get_ucp(PCRE2_SPTR * ptrptr,BOOL * negptr,uint16_t * ptypeptr,uint16_t * pdataptr,int * errorcodeptr,compile_block * cb)2064 get_ucp(PCRE2_SPTR *ptrptr, BOOL *negptr, uint16_t *ptypeptr,
2065   uint16_t *pdataptr, int *errorcodeptr, compile_block *cb)
2066 {
2067 PCRE2_UCHAR c;
2068 PCRE2_SIZE i, bot, top;
2069 PCRE2_SPTR ptr = *ptrptr;
2070 PCRE2_UCHAR name[32];
2071 
2072 if (ptr >= cb->end_pattern) goto ERROR_RETURN;
2073 c = *ptr++;
2074 *negptr = FALSE;
2075 
2076 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
2077 negation. */
2078 
2079 if (c == CHAR_LEFT_CURLY_BRACKET)
2080   {
2081   if (ptr >= cb->end_pattern) goto ERROR_RETURN;
2082   if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
2083     {
2084     *negptr = TRUE;
2085     ptr++;
2086     }
2087   for (i = 0; i < (int)(sizeof(name) / sizeof(PCRE2_UCHAR)) - 1; i++)
2088     {
2089     if (ptr >= cb->end_pattern) goto ERROR_RETURN;
2090     c = *ptr++;
2091     if (c == CHAR_NUL) goto ERROR_RETURN;
2092     if (c == CHAR_RIGHT_CURLY_BRACKET) break;
2093     name[i] = c;
2094     }
2095   if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
2096   name[i] = 0;
2097   }
2098 
2099 /* Otherwise there is just one following character, which must be an ASCII
2100 letter. */
2101 
2102 else if (MAX_255(c) && (cb->ctypes[c] & ctype_letter) != 0)
2103   {
2104   name[0] = c;
2105   name[1] = 0;
2106   }
2107 else goto ERROR_RETURN;
2108 
2109 *ptrptr = ptr;
2110 
2111 /* Search for a recognized property name using binary chop. */
2112 
2113 bot = 0;
2114 top = PRIV(utt_size);
2115 
2116 while (bot < top)
2117   {
2118   int r;
2119   i = (bot + top) >> 1;
2120   r = PRIV(strcmp_c8)(name, PRIV(utt_names) + PRIV(utt)[i].name_offset);
2121   if (r == 0)
2122     {
2123     *ptypeptr = PRIV(utt)[i].type;
2124     *pdataptr = PRIV(utt)[i].value;
2125     return TRUE;
2126     }
2127   if (r > 0) bot = i + 1; else top = i;
2128   }
2129 *errorcodeptr = ERR47;   /* Unrecognized name */
2130 return FALSE;
2131 
2132 ERROR_RETURN:            /* Malformed \P or \p */
2133 *errorcodeptr = ERR46;
2134 *ptrptr = ptr;
2135 return FALSE;
2136 }
2137 #endif
2138 
2139 
2140 
2141 /*************************************************
2142 *           Check for POSIX class syntax         *
2143 *************************************************/
2144 
2145 /* This function is called when the sequence "[:" or "[." or "[=" is
2146 encountered in a character class. It checks whether this is followed by a
2147 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
2148 reach an unescaped ']' without the special preceding character, return FALSE.
2149 
2150 Originally, this function only recognized a sequence of letters between the
2151 terminators, but it seems that Perl recognizes any sequence of characters,
2152 though of course unknown POSIX names are subsequently rejected. Perl gives an
2153 "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
2154 didn't consider this to be a POSIX class. Likewise for [:1234:].
2155 
2156 The problem in trying to be exactly like Perl is in the handling of escapes. We
2157 have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
2158 class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
2159 below handles the special cases \\ and \], but does not try to do any other
2160 escape processing. This makes it different from Perl for cases such as
2161 [:l\ower:] where Perl recognizes it as the POSIX class "lower" but PCRE does
2162 not recognize "l\ower". This is a lesser evil than not diagnosing bad classes
2163 when Perl does, I think.
2164 
2165 A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
2166 It seems that the appearance of a nested POSIX class supersedes an apparent
2167 external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
2168 a digit. This is handled by returning FALSE if the start of a new group with
2169 the same terminator is encountered, since the next closing sequence must close
2170 the nested group, not the outer one.
2171 
2172 In Perl, unescaped square brackets may also appear as part of class names. For
2173 example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
2174 [:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
2175 seem right at all. PCRE does not allow closing square brackets in POSIX class
2176 names.
2177 
2178 Arguments:
2179   ptr      pointer to the character after the initial [ (colon, dot, equals)
2180   ptrend   pointer to the end of the pattern
2181   endptr   where to return a pointer to the terminating ':', '.', or '='
2182 
2183 Returns:   TRUE or FALSE
2184 */
2185 
2186 static BOOL
check_posix_syntax(PCRE2_SPTR ptr,PCRE2_SPTR ptrend,PCRE2_SPTR * endptr)2187 check_posix_syntax(PCRE2_SPTR ptr, PCRE2_SPTR ptrend, PCRE2_SPTR *endptr)
2188 {
2189 PCRE2_UCHAR terminator;  /* Don't combine these lines; the Solaris cc */
2190 terminator = *ptr++;     /* compiler warns about "non-constant" initializer. */
2191 
2192 for (; ptrend - ptr >= 2; ptr++)
2193   {
2194   if (*ptr == CHAR_BACKSLASH &&
2195       (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET || ptr[1] == CHAR_BACKSLASH))
2196     ptr++;
2197 
2198   else if ((*ptr == CHAR_LEFT_SQUARE_BRACKET && ptr[1] == terminator) ||
2199             *ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
2200 
2201   else if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2202     {
2203     *endptr = ptr;
2204     return TRUE;
2205     }
2206   }
2207 
2208 return FALSE;
2209 }
2210 
2211 
2212 
2213 /*************************************************
2214 *          Check POSIX class name                *
2215 *************************************************/
2216 
2217 /* This function is called to check the name given in a POSIX-style class entry
2218 such as [:alnum:].
2219 
2220 Arguments:
2221   ptr        points to the first letter
2222   len        the length of the name
2223 
2224 Returns:     a value representing the name, or -1 if unknown
2225 */
2226 
2227 static int
check_posix_name(PCRE2_SPTR ptr,int len)2228 check_posix_name(PCRE2_SPTR ptr, int len)
2229 {
2230 const char *pn = posix_names;
2231 int yield = 0;
2232 while (posix_name_lengths[yield] != 0)
2233   {
2234   if (len == posix_name_lengths[yield] &&
2235     PRIV(strncmp_c8)(ptr, pn, (unsigned int)len) == 0) return yield;
2236   pn += posix_name_lengths[yield] + 1;
2237   yield++;
2238   }
2239 return -1;
2240 }
2241 
2242 
2243 
2244 /*************************************************
2245 *       Read a subpattern or VERB name           *
2246 *************************************************/
2247 
2248 /* This function is called from parse_regex() below whenever it needs to read
2249 the name of a subpattern or a (*VERB) or an (*alpha_assertion). The initial
2250 pointer must be to the character before the name. If that character is '*' we
2251 are reading a verb or alpha assertion name. The pointer is updated to point
2252 after the name, for a VERB or alpha assertion name, or after the name's
2253 terminator for a subpattern name. Returning both the offset and the name
2254 pointer is redundant information, but some callers use one and some the other,
2255 so it is simplest just to return both.
2256 
2257 Arguments:
2258   ptrptr      points to the character pointer variable
2259   ptrend      points to the end of the input string
2260   utf         true if the input is UTF-encoded
2261   terminator  the terminator of a subpattern name must be this
2262   offsetptr   where to put the offset from the start of the pattern
2263   nameptr     where to put a pointer to the name in the input
2264   namelenptr  where to put the length of the name
2265   errcodeptr  where to put an error code
2266   cb          pointer to the compile data block
2267 
2268 Returns:    TRUE if a name was read
2269             FALSE otherwise, with error code set
2270 */
2271 
2272 static BOOL
read_name(PCRE2_SPTR * ptrptr,PCRE2_SPTR ptrend,BOOL utf,uint32_t terminator,PCRE2_SIZE * offsetptr,PCRE2_SPTR * nameptr,uint32_t * namelenptr,int * errorcodeptr,compile_block * cb)2273 read_name(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, BOOL utf, uint32_t terminator,
2274   PCRE2_SIZE *offsetptr, PCRE2_SPTR *nameptr, uint32_t *namelenptr,
2275   int *errorcodeptr, compile_block *cb)
2276 {
2277 PCRE2_SPTR ptr = *ptrptr;
2278 BOOL is_group = (*ptr != CHAR_ASTERISK);
2279 
2280 if (++ptr >= ptrend)               /* No characters in name */
2281   {
2282   *errorcodeptr = is_group? ERR62: /* Subpattern name expected */
2283                             ERR60; /* Verb not recognized or malformed */
2284   goto FAILED;
2285   }
2286 
2287 *nameptr = ptr;
2288 *offsetptr = (PCRE2_SIZE)(ptr - cb->start_pattern);
2289 
2290 /* In UTF mode, a group name may contain letters and decimal digits as defined
2291 by Unicode properties, and underscores, but must not start with a digit. */
2292 
2293 #ifdef SUPPORT_UNICODE
2294 if (utf && is_group)
2295   {
2296   uint32_t c, type;
2297 
2298   GETCHAR(c, ptr);
2299   type = UCD_CHARTYPE(c);
2300 
2301   if (type == ucp_Nd)
2302     {
2303     *errorcodeptr = ERR44;
2304     goto FAILED;
2305     }
2306 
2307   for(;;)
2308     {
2309     if (type != ucp_Nd && PRIV(ucp_gentype)[type] != ucp_L &&
2310         c != CHAR_UNDERSCORE) break;
2311     ptr++;
2312     FORWARDCHARTEST(ptr, ptrend);
2313     if (ptr >= ptrend) break;
2314     GETCHAR(c, ptr);
2315     type = UCD_CHARTYPE(c);
2316     }
2317   }
2318 else
2319 #else
2320 (void)utf;  /* Avoid compiler warning */
2321 #endif      /* SUPPORT_UNICODE */
2322 
2323 /* Handle non-group names and group names in non-UTF modes. A group name must
2324 not start with a digit. If either of the others start with a digit it just
2325 won't be recognized. */
2326 
2327   {
2328   if (is_group && IS_DIGIT(*ptr))
2329     {
2330     *errorcodeptr = ERR44;
2331     goto FAILED;
2332     }
2333 
2334   while (ptr < ptrend && MAX_255(*ptr) && (cb->ctypes[*ptr] & ctype_word) != 0)
2335     {
2336     ptr++;
2337     }
2338   }
2339 
2340 /* Check name length */
2341 
2342 if (ptr > *nameptr + MAX_NAME_SIZE)
2343   {
2344   *errorcodeptr = ERR48;
2345   goto FAILED;
2346   }
2347 *namelenptr = (uint32_t)(ptr - *nameptr);
2348 
2349 /* Subpattern names must not be empty, and their terminator is checked here.
2350 (What follows a verb or alpha assertion name is checked separately.) */
2351 
2352 if (is_group)
2353   {
2354   if (ptr == *nameptr)
2355     {
2356     *errorcodeptr = ERR62;   /* Subpattern name expected */
2357     goto FAILED;
2358     }
2359   if (ptr >= ptrend || *ptr != (PCRE2_UCHAR)terminator)
2360     {
2361     *errorcodeptr = ERR42;
2362     goto FAILED;
2363     }
2364   ptr++;
2365   }
2366 
2367 *ptrptr = ptr;
2368 return TRUE;
2369 
2370 FAILED:
2371 *ptrptr = ptr;
2372 return FALSE;
2373 }
2374 
2375 
2376 
2377 /*************************************************
2378 *          Manage callouts at start of cycle     *
2379 *************************************************/
2380 
2381 /* At the start of a new item in parse_regex() we are able to record the
2382 details of the previous item in a prior callout, and also to set up an
2383 automatic callout if enabled. Avoid having two adjacent automatic callouts,
2384 which would otherwise happen for items such as \Q that contribute nothing to
2385 the parsed pattern.
2386 
2387 Arguments:
2388   ptr              current pattern pointer
2389   pcalloutptr      points to a pointer to previous callout, or NULL
2390   auto_callout     TRUE if auto_callouts are enabled
2391   parsed_pattern   the parsed pattern pointer
2392   cb               compile block
2393 
2394 Returns: possibly updated parsed_pattern pointer.
2395 */
2396 
2397 static uint32_t *
manage_callouts(PCRE2_SPTR ptr,uint32_t ** pcalloutptr,BOOL auto_callout,uint32_t * parsed_pattern,compile_block * cb)2398 manage_callouts(PCRE2_SPTR ptr, uint32_t **pcalloutptr, BOOL auto_callout,
2399   uint32_t *parsed_pattern, compile_block *cb)
2400 {
2401 uint32_t *previous_callout = *pcalloutptr;
2402 
2403 if (previous_callout != NULL) previous_callout[2] = (uint32_t)(ptr -
2404   cb->start_pattern - (PCRE2_SIZE)previous_callout[1]);
2405 
2406 if (!auto_callout) previous_callout = NULL; else
2407   {
2408   if (previous_callout == NULL ||
2409       previous_callout != parsed_pattern - 4 ||
2410       previous_callout[3] != 255)
2411     {
2412     previous_callout = parsed_pattern;  /* Set up new automatic callout */
2413     parsed_pattern += 4;
2414     previous_callout[0] = META_CALLOUT_NUMBER;
2415     previous_callout[2] = 0;
2416     previous_callout[3] = 255;
2417     }
2418   previous_callout[1] = (uint32_t)(ptr - cb->start_pattern);
2419   }
2420 
2421 *pcalloutptr = previous_callout;
2422 return parsed_pattern;
2423 }
2424 
2425 
2426 
2427 /*************************************************
2428 *      Parse regex and identify named groups     *
2429 *************************************************/
2430 
2431 /* This function is called first of all. It scans the pattern and does two
2432 things: (1) It identifies capturing groups and makes a table of named capturing
2433 groups so that information about them is fully available to both the compiling
2434 scans. (2) It writes a parsed version of the pattern with comments omitted and
2435 escapes processed into the parsed_pattern vector.
2436 
2437 Arguments:
2438   ptr             points to the start of the pattern
2439   options         compiling dynamic options (may change during the scan)
2440   has_lookbehind  points to a boolean, set TRUE if a lookbehind is found
2441   cb              pointer to the compile data block
2442 
2443 Returns:   zero on success or a non-zero error code, with the
2444              error offset placed in the cb field
2445 */
2446 
2447 /* A structure and some flags for dealing with nested groups. */
2448 
/* One saved record per tracked nesting level: four 16-bit fields followed by
one 32-bit field. NOTE(review): the per-field descriptions below are inferred
from the field names only; confirm them against the uses in parse_regex(). */

typedef struct nest_save {
  uint16_t nest_depth;    /* presumably the nesting depth of this record */
  uint16_t reset_group;   /* presumably a saved group number - TODO confirm */
  uint16_t max_group;     /* presumably a highest group number - TODO confirm */
  uint16_t flags;         /* bit set; see the NSF_ values below */
  uint32_t options;       /* presumably saved option bits - TODO confirm */
} nest_save;

/* Single-bit values for the flags field. */

#define NSF_RESET          (1u << 0)
#define NSF_CONDASSERT     (1u << 1)
#define NSF_ATOMICSR       (1u << 2)
2460 
2461 /* Options that are changeable within the pattern must be tracked during
2462 parsing. Some (e.g. PCRE2_EXTENDED) are implemented entirely during parsing,
2463 but all must be tracked so that META_OPTIONS items set the correct values for
2464 the main compiling phase. */
2465 
/* The set of option bits included in the tracked-options mask, one per line. */

#define PARSE_TRACKED_OPTIONS \
  ( PCRE2_CASELESS        | \
    PCRE2_DOTALL          | \
    PCRE2_DUPNAMES        | \
    PCRE2_EXTENDED        | \
    PCRE2_EXTENDED_MORE   | \
    PCRE2_MULTILINE       | \
    PCRE2_NO_AUTO_CAPTURE | \
    PCRE2_UNGREEDY )
2469 
2470 /* States used for analyzing ranges in character classes. The two OK values
2471 must be last. */
2472 
enum {
  RANGE_NO         = 0,
  RANGE_STARTED    = 1,
  RANGE_OK_ESCAPED = 2,   /* The two OK values have the highest numbers */
  RANGE_OK_LITERAL = 3
};
2474 
2475 /* Only in 32-bit mode can there be literals > META_END. A macro encapsulates
2476 the storing of literal values in the main parsed pattern, where they can always
2477 be quantified. */
2478 
/* PARSED_LITERAL(c, p) stores the literal value c through the pointer p,
advancing p, and sets the okquantifier variable of the enclosing function to
TRUE. In 32-bit mode a value that is >= META_END is preceded by META_BIGVALUE.

Both variants expand to a single braced block, and the arguments are
parenthesized. Previously the non-32-bit variant expanded to two bare
statements, so under an unbraced "if" the okquantifier assignment would have
run unconditionally. Braces (rather than do/while) are used so that the
expansion is accepted with or without a following semicolon, exactly as the
32-bit variant always was. Note that c is evaluated twice in 32-bit mode, so it
must not have side effects. */

#if PCRE2_CODE_UNIT_WIDTH == 32
#define PARSED_LITERAL(c, p) \
  { \
  if ((c) >= META_END) *(p)++ = META_BIGVALUE; \
  *(p)++ = (c); \
  okquantifier = TRUE; \
  }
#else
#define PARSED_LITERAL(c, p) \
  { \
  *(p)++ = (c); \
  okquantifier = TRUE; \
  }
#endif
2489 
2490 /* Here's the actual function. */
2491 
parse_regex(PCRE2_SPTR ptr,uint32_t options,BOOL * has_lookbehind,compile_block * cb)2492 static int parse_regex(PCRE2_SPTR ptr, uint32_t options, BOOL *has_lookbehind,
2493   compile_block *cb)
2494 {
2495 uint32_t c;
2496 uint32_t delimiter;
2497 uint32_t namelen;
2498 uint32_t class_range_state;
2499 uint32_t *verblengthptr = NULL;     /* Value avoids compiler warning */
2500 uint32_t *verbstartptr = NULL;
2501 uint32_t *previous_callout = NULL;
2502 uint32_t *parsed_pattern = cb->parsed_pattern;
2503 uint32_t *parsed_pattern_end = cb->parsed_pattern_end;
2504 uint32_t meta_quantifier = 0;
2505 uint32_t add_after_mark = 0;
2506 uint32_t extra_options = cb->cx->extra_options;
2507 uint16_t nest_depth = 0;
2508 int after_manual_callout = 0;
2509 int expect_cond_assert = 0;
2510 int errorcode = 0;
2511 int escape;
2512 int i;
2513 BOOL inescq = FALSE;
2514 BOOL inverbname = FALSE;
2515 BOOL utf = (options & PCRE2_UTF) != 0;
2516 BOOL auto_callout = (options & PCRE2_AUTO_CALLOUT) != 0;
2517 BOOL isdupname;
2518 BOOL negate_class;
2519 BOOL okquantifier = FALSE;
2520 PCRE2_SPTR thisptr;
2521 PCRE2_SPTR name;
2522 PCRE2_SPTR ptrend = cb->end_pattern;
2523 PCRE2_SPTR verbnamestart = NULL;    /* Value avoids compiler warning */
2524 named_group *ng;
2525 nest_save *top_nest, *end_nests;
2526 
2527 /* Insert leading items for word and line matching (features provided for the
2528 benefit of pcre2grep). */
2529 
2530 if ((extra_options & PCRE2_EXTRA_MATCH_LINE) != 0)
2531   {
2532   *parsed_pattern++ = META_CIRCUMFLEX;
2533   *parsed_pattern++ = META_NOCAPTURE;
2534   }
2535 else if ((extra_options & PCRE2_EXTRA_MATCH_WORD) != 0)
2536   {
2537   *parsed_pattern++ = META_ESCAPE + ESC_b;
2538   *parsed_pattern++ = META_NOCAPTURE;
2539   }
2540 
2541 /* If the pattern is actually a literal string, process it separately to avoid
2542 cluttering up the main loop. */
2543 
2544 if ((options & PCRE2_LITERAL) != 0)
2545   {
2546   while (ptr < ptrend)
2547     {
2548     if (parsed_pattern >= parsed_pattern_end)
2549       {
2550       errorcode = ERR63;  /* Internal error (parsed pattern overflow) */
2551       goto FAILED;
2552       }
2553     thisptr = ptr;
2554     GETCHARINCTEST(c, ptr);
2555     if (auto_callout)
2556       parsed_pattern = manage_callouts(thisptr, &previous_callout,
2557         auto_callout, parsed_pattern, cb);
2558     PARSED_LITERAL(c, parsed_pattern);
2559     }
2560   goto PARSED_END;
2561   }
2562 
2563 /* Process a real regex which may contain meta-characters. */
2564 
2565 top_nest = NULL;
2566 end_nests = (nest_save *)(cb->start_workspace + cb->workspace_size);
2567 
2568 /* The size of the nest_save structure might not be a factor of the size of the
2569 workspace. Therefore we must round down end_nests so as to correctly avoid
2570 creating a nest_save that spans the end of the workspace. */
2571 
2572 end_nests = (nest_save *)((char *)end_nests -
2573   ((cb->workspace_size * sizeof(PCRE2_UCHAR)) % sizeof(nest_save)));
2574 
2575 /* PCRE2_EXTENDED_MORE implies PCRE2_EXTENDED */
2576 
2577 if ((options & PCRE2_EXTENDED_MORE) != 0) options |= PCRE2_EXTENDED;
2578 
2579 /* Now scan the pattern */
2580 
2581 while (ptr < ptrend)
2582   {
2583   int prev_expect_cond_assert;
2584   uint32_t min_repeat, max_repeat;
2585   uint32_t set, unset, *optset;
2586   uint32_t terminator;
2587   uint32_t prev_meta_quantifier;
2588   BOOL prev_okquantifier;
2589   PCRE2_SPTR tempptr;
2590   PCRE2_SIZE offset;
2591 
2592   if (parsed_pattern >= parsed_pattern_end)
2593     {
2594     errorcode = ERR63;  /* Internal error (parsed pattern overflow) */
2595     goto FAILED;
2596     }
2597 
2598   if (nest_depth > cb->cx->parens_nest_limit)
2599     {
2600     errorcode = ERR19;
2601     goto FAILED;        /* Parentheses too deeply nested */
2602     }
2603 
2604   /* Get next input character, save its position for callout handling. */
2605 
2606   thisptr = ptr;
2607   GETCHARINCTEST(c, ptr);
2608 
2609   /* Copy quoted literals until \E, allowing for the possibility of automatic
2610   callouts, except when processing a (*VERB) "name".  */
2611 
2612   if (inescq)
2613     {
2614     if (c == CHAR_BACKSLASH && ptr < ptrend && *ptr == CHAR_E)
2615       {
2616       inescq = FALSE;
2617       ptr++;   /* Skip E */
2618       }
2619     else
2620       {
2621       if (expect_cond_assert > 0)   /* A literal is not allowed if we are */
2622         {                           /* expecting a conditional assertion, */
2623         ptr--;                      /* but an empty \Q\E sequence is OK.  */
2624         errorcode = ERR28;
2625         goto FAILED;
2626         }
2627       if (inverbname)
2628         {                          /* Don't use PARSED_LITERAL() because it */
2629 #if PCRE2_CODE_UNIT_WIDTH == 32    /* sets okquantifier. */
2630         if (c >= META_END) *parsed_pattern++ = META_BIGVALUE;
2631 #endif
2632         *parsed_pattern++ = c;
2633         }
2634       else
2635         {
2636         if (after_manual_callout-- <= 0)
2637           parsed_pattern = manage_callouts(thisptr, &previous_callout,
2638             auto_callout, parsed_pattern, cb);
2639         PARSED_LITERAL(c, parsed_pattern);
2640         }
2641       meta_quantifier = 0;
2642       }
2643     continue;  /* Next character */
2644     }
2645 
2646   /* If we are processing the "name" part of a (*VERB:NAME) item, all
2647   characters up to the closing parenthesis are literals except when
2648   PCRE2_ALT_VERBNAMES is set. That causes backslash interpretation, but only \Q
2649   and \E and escaped characters are allowed (no character types such as \d). If
2650   PCRE2_EXTENDED is also set, we must ignore white space and # comments. Do
2651   this by not entering the special (*VERB:NAME) processing - they are then
2652   picked up below. Note that c is a character, not a code unit, so we must not
2653   use MAX_255 to test its size because MAX_255 tests code units and is assumed
2654   TRUE in 8-bit mode. */
2655 
2656   if (inverbname &&
2657        (
2658         /* EITHER: not both options set */
2659         ((options & (PCRE2_EXTENDED | PCRE2_ALT_VERBNAMES)) !=
2660                     (PCRE2_EXTENDED | PCRE2_ALT_VERBNAMES)) ||
2661 #ifdef SUPPORT_UNICODE
2662         /* OR: character > 255 AND not Unicode Pattern White Space */
2663         (c > 255 && (c|1) != 0x200f && (c|1) != 0x2029) ||
2664 #endif
2665         /* OR: not a # comment or isspace() white space */
2666         (c < 256 && c != CHAR_NUMBER_SIGN && (cb->ctypes[c] & ctype_space) == 0
2667 #ifdef SUPPORT_UNICODE
2668         /* and not CHAR_NEL when Unicode is supported */
2669           && c != CHAR_NEL
2670 #endif
2671        )))
2672     {
2673     PCRE2_SIZE verbnamelength;
2674 
2675     switch(c)
2676       {
2677       default:                     /* Don't use PARSED_LITERAL() because it */
2678 #if PCRE2_CODE_UNIT_WIDTH == 32    /* sets okquantifier. */
2679       if (c >= META_END) *parsed_pattern++ = META_BIGVALUE;
2680 #endif
2681       *parsed_pattern++ = c;
2682       break;
2683 
2684       case CHAR_RIGHT_PARENTHESIS:
2685       inverbname = FALSE;
2686       /* This is the length in characters */
2687       verbnamelength = (PCRE2_SIZE)(parsed_pattern - verblengthptr - 1);
2688       /* But the limit on the length is in code units */
2689       if (ptr - verbnamestart - 1 > (int)MAX_MARK)
2690         {
2691         ptr--;
2692         errorcode = ERR76;
2693         goto FAILED;
2694         }
2695       *verblengthptr = (uint32_t)verbnamelength;
2696 
2697       /* If this name was on a verb such as (*ACCEPT) which does not continue,
2698       a (*MARK) was generated for the name. We now add the original verb as the
2699       next item. */
2700 
2701       if (add_after_mark != 0)
2702         {
2703         *parsed_pattern++ = add_after_mark;
2704         add_after_mark = 0;
2705         }
2706       break;
2707 
2708       case CHAR_BACKSLASH:
2709       if ((options & PCRE2_ALT_VERBNAMES) != 0)
2710         {
2711         escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
2712           cb->cx->extra_options, FALSE, cb);
2713         if (errorcode != 0) goto FAILED;
2714         }
2715       else escape = 0;   /* Treat all as literal */
2716 
2717       switch(escape)
2718         {
2719         case 0:                    /* Don't use PARSED_LITERAL() because it */
2720 #if PCRE2_CODE_UNIT_WIDTH == 32    /* sets okquantifier. */
2721         if (c >= META_END) *parsed_pattern++ = META_BIGVALUE;
2722 #endif
2723         *parsed_pattern++ = c;
2724         break;
2725 
2726         case ESC_Q:
2727         inescq = TRUE;
2728         break;
2729 
2730         case ESC_E:           /* Ignore */
2731         break;
2732 
2733         default:
2734         errorcode = ERR40;    /* Invalid in verb name */
2735         goto FAILED;
2736         }
2737       }
2738     continue;   /* Next character in pattern */
2739     }
2740 
2741   /* Not a verb name character. At this point we must process everything that
2742   must not change the quantification state. This is mainly comments, but we
2743   handle \Q and \E here as well, so that an item such as A\Q\E+ is treated as
2744   A+, as in Perl. An isolated \E is ignored. */
2745 
2746   if (c == CHAR_BACKSLASH && ptr < ptrend)
2747     {
2748     if (*ptr == CHAR_Q || *ptr == CHAR_E)
2749       {
2750       inescq = *ptr == CHAR_Q;
2751       ptr++;
2752       continue;
2753       }
2754     }
2755 
2756   /* Skip over whitespace and # comments in extended mode. Note that c is a
2757   character, not a code unit, so we must not use MAX_255 to test its size
2758   because MAX_255 tests code units and is assumed TRUE in 8-bit mode. The
2759   whitespace characters are those designated as "Pattern White Space" by
2760   Unicode, which are the isspace() characters plus CHAR_NEL (newline), which is
2761   U+0085 in Unicode, plus U+200E, U+200F, U+2028, and U+2029. These are a
2762   subset of space characters that match \h and \v. */
2763 
2764   if ((options & PCRE2_EXTENDED) != 0)
2765     {
2766     if (c < 256 && (cb->ctypes[c] & ctype_space) != 0) continue;
2767 #ifdef SUPPORT_UNICODE
2768     if (c == CHAR_NEL || (c|1) == 0x200f || (c|1) == 0x2029) continue;
2769 #endif
2770     if (c == CHAR_NUMBER_SIGN)
2771       {
2772       while (ptr < ptrend)
2773         {
2774         if (IS_NEWLINE(ptr))      /* For non-fixed-length newline cases, */
2775           {                       /* IS_NEWLINE sets cb->nllen. */
2776           ptr += cb->nllen;
2777           break;
2778           }
2779         ptr++;
2780 #ifdef SUPPORT_UNICODE
2781         if (utf) FORWARDCHARTEST(ptr, ptrend);
2782 #endif
2783         }
2784       continue;  /* Next character in pattern */
2785       }
2786     }
2787 
2788   /* Skip over bracketed comments */
2789 
2790   if (c == CHAR_LEFT_PARENTHESIS && ptrend - ptr >= 2 &&
2791       ptr[0] == CHAR_QUESTION_MARK && ptr[1] == CHAR_NUMBER_SIGN)
2792     {
2793     while (++ptr < ptrend && *ptr != CHAR_RIGHT_PARENTHESIS);
2794     if (ptr >= ptrend)
2795       {
2796       errorcode = ERR18;  /* A special error for missing ) in a comment */
2797       goto FAILED;        /* to make it easier to debug. */
2798       }
2799     ptr++;
2800     continue;  /* Next character in pattern */
2801     }
2802 
2803   /* If the next item is not a quantifier, fill in length of any previous
2804   callout and create an auto callout if required. */
2805 
2806   if (c != CHAR_ASTERISK && c != CHAR_PLUS && c != CHAR_QUESTION_MARK &&
2807        (c != CHAR_LEFT_CURLY_BRACKET ||
2808          (tempptr = ptr,
2809          !read_repeat_counts(&tempptr, ptrend, NULL, NULL, &errorcode))))
2810     {
2811     if (after_manual_callout-- <= 0)
2812       parsed_pattern = manage_callouts(thisptr, &previous_callout, auto_callout,
2813         parsed_pattern, cb);
2814     }
2815 
2816   /* If expect_cond_assert is 2, we have just passed (?( and are expecting an
2817   assertion, possibly preceded by a callout. If the value is 1, we have just
2818   had the callout and expect an assertion. There must be at least 3 more
2819   characters in all cases. When expect_cond_assert is 2, we know that the
2820   current character is an opening parenthesis, as otherwise we wouldn't be
2821   here. However, when it is 1, we need to check, and it's easiest just to check
2822   always. Note that expect_cond_assert may be negative, since all callouts just
2823   decrement it. */
2824 
2825   if (expect_cond_assert > 0)
2826     {
2827     BOOL ok = c == CHAR_LEFT_PARENTHESIS && ptrend - ptr >= 3 &&
2828               (ptr[0] == CHAR_QUESTION_MARK || ptr[0] == CHAR_ASTERISK);
2829     if (ok)
2830       {
2831       if (ptr[0] == CHAR_ASTERISK)  /* New alpha assertion format, possibly */
2832         {
2833         ok = MAX_255(ptr[1]) && (cb->ctypes[ptr[1]] & ctype_lcletter) != 0;
2834         }
2835       else switch(ptr[1])  /* Traditional symbolic format */
2836         {
2837         case CHAR_C:
2838         ok = expect_cond_assert == 2;
2839         break;
2840 
2841         case CHAR_EQUALS_SIGN:
2842         case CHAR_EXCLAMATION_MARK:
2843         break;
2844 
2845         case CHAR_LESS_THAN_SIGN:
2846         ok = ptr[2] == CHAR_EQUALS_SIGN || ptr[2] == CHAR_EXCLAMATION_MARK;
2847         break;
2848 
2849         default:
2850         ok = FALSE;
2851         }
2852       }
2853 
2854     if (!ok)
2855       {
2856       ptr--;   /* Adjust error offset */
2857       errorcode = ERR28;
2858       goto FAILED;
2859       }
2860     }
2861 
2862   /* Remember whether we are expecting a conditional assertion, and set the
2863   default for this item. */
2864 
2865   prev_expect_cond_assert = expect_cond_assert;
2866   expect_cond_assert = 0;
2867 
2868   /* Remember quantification status for the previous significant item, then set
2869   default for this item. */
2870 
2871   prev_okquantifier = okquantifier;
2872   prev_meta_quantifier = meta_quantifier;
2873   okquantifier = FALSE;
2874   meta_quantifier = 0;
2875 
2876   /* If the previous significant item was a quantifier, adjust the parsed code
2877   if there is a following modifier. The base meta value is always followed by
2878   the PLUS and QUERY values, in that order. We do this here rather than after
2879   reading a quantifier so that intervening comments and /x whitespace can be
2880   ignored without having to replicate code. */
2881 
2882   if (prev_meta_quantifier != 0 && (c == CHAR_QUESTION_MARK || c == CHAR_PLUS))
2883     {
2884     parsed_pattern[(prev_meta_quantifier == META_MINMAX)? -3 : -1] =
2885       prev_meta_quantifier + ((c == CHAR_QUESTION_MARK)?
2886         0x00020000u : 0x00010000u);
2887     continue;  /* Next character in pattern */
2888     }
2889 
2890 
2891   /* Process the next item in the main part of a pattern. */
2892 
2893   switch(c)
2894     {
2895     default:              /* Non-special character */
2896     PARSED_LITERAL(c, parsed_pattern);
2897     break;
2898 
2899 
2900     /* ---- Escape sequence ---- */
2901 
2902     case CHAR_BACKSLASH:
2903     tempptr = ptr;
2904     escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
2905       cb->cx->extra_options, FALSE, cb);
2906     if (errorcode != 0)
2907       {
2908       ESCAPE_FAILED:
2909       if ((extra_options & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) == 0)
2910         goto FAILED;
2911       ptr = tempptr;
2912       if (ptr >= ptrend) c = CHAR_BACKSLASH; else
2913         {
2914         GETCHARINCTEST(c, ptr);   /* Get character value, increment pointer */
2915         }
2916       escape = 0;                 /* Treat as literal character */
2917       }
2918 
2919     /* The escape was a data escape or literal character. */
2920 
2921     if (escape == 0)
2922       {
2923       PARSED_LITERAL(c, parsed_pattern);
2924       }
2925 
2926     /* The escape was a back (or forward) reference. We keep the offset in
2927     order to give a more useful diagnostic for a bad forward reference. For
2928     references to groups numbered less than 10 we can't use more than two items
2929     in parsed_pattern because they may be just two characters in the input (and
2930     in a 64-bit world an offset may need two elements). So for them, the offset
2931     of the first occurrence is held in a special vector. */
2932 
2933     else if (escape < 0)
2934       {
2935       offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 1);
2936       escape = -escape;
2937       *parsed_pattern++ = META_BACKREF | (uint32_t)escape;
2938       if (escape < 10)
2939         {
2940         if (cb->small_ref_offset[escape] == PCRE2_UNSET)
2941           cb->small_ref_offset[escape] = offset;
2942         }
2943       else
2944         {
2945         PUTOFFSET(offset, parsed_pattern);
2946         }
2947       okquantifier = TRUE;
2948       }
2949 
2950     /* The escape was a character class such as \d etc. or other special
2951     escape indicator such as \A or \X. Most of them generate just a single
2952     parsed item, but \P and \p are followed by a 16-bit type and a 16-bit
2953     value. They are supported only when Unicode is available. The type and
2954     value are packed into a single 32-bit value so that the whole sequences
2955     uses only two elements in the parsed_vector. This is because the same
2956     coding is used if \d (for example) is turned into \p{Nd} when PCRE2_UCP is
2957     set.
2958 
2959     There are also some cases where the escape sequence is followed by a name:
2960     \k{name}, \k<name>, and \k'name' are backreferences by name, and \g<name>
2961     and \g'name' are subroutine calls by name; \g{name} is a synonym for
2962     \k{name}. Note that \g<number> and \g'number' are handled by check_escape()
2963     and returned as a negative value (handled above). A name is coded as an
2964     offset into the pattern and a length. */
2965 
2966     else switch (escape)
2967       {
2968       case ESC_C:
2969 #ifdef NEVER_BACKSLASH_C
2970       errorcode = ERR85;
2971       goto ESCAPE_FAILED;
2972 #else
2973       if ((options & PCRE2_NEVER_BACKSLASH_C) != 0)
2974         {
2975         errorcode = ERR83;
2976         goto ESCAPE_FAILED;
2977         }
2978 #endif
2979       okquantifier = TRUE;
2980       *parsed_pattern++ = META_ESCAPE + escape;
2981       break;
2982 
2983       case ESC_X:
2984 #ifndef SUPPORT_UNICODE
2985       errorcode = ERR45;   /* Supported only with Unicode support */
2986       goto ESCAPE_FAILED;
2987 #endif
2988       case ESC_H:
2989       case ESC_h:
2990       case ESC_N:
2991       case ESC_R:
2992       case ESC_V:
2993       case ESC_v:
2994       okquantifier = TRUE;
2995       *parsed_pattern++ = META_ESCAPE + escape;
2996       break;
2997 
2998       default:  /* \A, \B, \b, \G, \K, \Z, \z cannot be quantified. */
2999       *parsed_pattern++ = META_ESCAPE + escape;
3000       break;
3001 
3002       /* Escapes that change in UCP mode. Note that PCRE2_UCP will never be set
3003       without Unicode support because it is checked when pcre2_compile() is
3004       called. */
3005 
3006       case ESC_d:
3007       case ESC_D:
3008       case ESC_s:
3009       case ESC_S:
3010       case ESC_w:
3011       case ESC_W:
3012       okquantifier = TRUE;
3013       if ((options & PCRE2_UCP) == 0)
3014         {
3015         *parsed_pattern++ = META_ESCAPE + escape;
3016         }
3017       else
3018         {
3019         *parsed_pattern++ = META_ESCAPE +
3020           ((escape == ESC_d || escape == ESC_s || escape == ESC_w)?
3021             ESC_p : ESC_P);
3022         switch(escape)
3023           {
3024           case ESC_d:
3025           case ESC_D:
3026           *parsed_pattern++ = (PT_PC << 16) | ucp_Nd;
3027           break;
3028 
3029           case ESC_s:
3030           case ESC_S:
3031           *parsed_pattern++ = PT_SPACE << 16;
3032           break;
3033 
3034           case ESC_w:
3035           case ESC_W:
3036           *parsed_pattern++ = PT_WORD << 16;
3037           break;
3038           }
3039         }
3040       break;
3041 
3042       /* Unicode property matching */
3043 
3044       case ESC_P:
3045       case ESC_p:
3046 #ifdef SUPPORT_UNICODE
3047         {
3048         BOOL negated;
3049         uint16_t ptype = 0, pdata = 0;
3050         if (!get_ucp(&ptr, &negated, &ptype, &pdata, &errorcode, cb))
3051           goto ESCAPE_FAILED;
3052         if (negated) escape = (escape == ESC_P)? ESC_p : ESC_P;
3053         *parsed_pattern++ = META_ESCAPE + escape;
3054         *parsed_pattern++ = (ptype << 16) | pdata;
3055         okquantifier = TRUE;
3056         }
3057 #else
3058       errorcode = ERR45;
3059       goto ESCAPE_FAILED;
3060 #endif
3061       break;  /* End \P and \p */
3062 
3063       /* When \g is used with quotes or angle brackets as delimiters, it is a
3064       numerical or named subroutine call, and control comes here. When used
3065       with brace delimiters it is a numerical back reference and does not come
3066       here because check_escape() returns it directly as a reference. \k is
3067       always a named back reference. */
3068 
3069       case ESC_g:
3070       case ESC_k:
3071       if (ptr >= ptrend || (*ptr != CHAR_LEFT_CURLY_BRACKET &&
3072           *ptr != CHAR_LESS_THAN_SIGN && *ptr != CHAR_APOSTROPHE))
3073         {
3074         errorcode = (escape == ESC_g)? ERR57 : ERR69;
3075         goto ESCAPE_FAILED;
3076         }
3077       terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
3078         CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
3079         CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
3080 
3081       /* For a non-braced \g, check for a numerical recursion. */
3082 
3083       if (escape == ESC_g && terminator != CHAR_RIGHT_CURLY_BRACKET)
3084         {
3085         PCRE2_SPTR p = ptr + 1;
3086 
3087         if (read_number(&p, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &i,
3088             &errorcode))
3089           {
3090           if (p >= ptrend || *p != terminator)
3091             {
3092             errorcode = ERR57;
3093             goto ESCAPE_FAILED;
3094             }
3095           ptr = p;
3096           goto SET_RECURSION;
3097           }
3098         if (errorcode != 0) goto ESCAPE_FAILED;
3099         }
3100 
3101       /* Not a numerical recursion */
3102 
3103       if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,
3104           &errorcode, cb)) goto ESCAPE_FAILED;
3105 
3106       /* \k and \g when used with braces are back references, whereas \g used
3107       with quotes or angle brackets is a recursion */
3108 
3109       *parsed_pattern++ =
3110         (escape == ESC_k || terminator == CHAR_RIGHT_CURLY_BRACKET)?
3111           META_BACKREF_BYNAME : META_RECURSE_BYNAME;
3112       *parsed_pattern++ = namelen;
3113 
3114       PUTOFFSET(offset, parsed_pattern);
3115       okquantifier = TRUE;
3116       break;  /* End special escape processing */
3117       }
3118     break;    /* End escape sequence processing */
3119 
3120 
3121     /* ---- Single-character special items ---- */
3122 
3123     case CHAR_CIRCUMFLEX_ACCENT:
3124     *parsed_pattern++ = META_CIRCUMFLEX;
3125     break;
3126 
3127     case CHAR_DOLLAR_SIGN:
3128     *parsed_pattern++ = META_DOLLAR;
3129     break;
3130 
3131     case CHAR_DOT:
3132     *parsed_pattern++ = META_DOT;
3133     okquantifier = TRUE;
3134     break;
3135 
3136 
3137     /* ---- Single-character quantifiers ---- */
3138 
3139     case CHAR_ASTERISK:
3140     meta_quantifier = META_ASTERISK;
3141     goto CHECK_QUANTIFIER;
3142 
3143     case CHAR_PLUS:
3144     meta_quantifier = META_PLUS;
3145     goto CHECK_QUANTIFIER;
3146 
3147     case CHAR_QUESTION_MARK:
3148     meta_quantifier = META_QUERY;
3149     goto CHECK_QUANTIFIER;
3150 
3151 
3152     /* ---- Potential {n,m} quantifier ---- */
3153 
3154     case CHAR_LEFT_CURLY_BRACKET:
3155     if (!read_repeat_counts(&ptr, ptrend, &min_repeat, &max_repeat,
3156         &errorcode))
3157       {
3158       if (errorcode != 0) goto FAILED;     /* Error in quantifier. */
3159       PARSED_LITERAL(c, parsed_pattern);   /* Not a quantifier */
3160       break;                               /* No more quantifier processing */
3161       }
3162     meta_quantifier = META_MINMAX;
3163     /* Fall through */
3164 
3165 
3166     /* ---- Quantifier post-processing ---- */
3167 
3168     /* Check that a quantifier is allowed after the previous item. */
3169 
3170     CHECK_QUANTIFIER:
3171     if (!prev_okquantifier)
3172       {
3173       errorcode = ERR9;
3174       goto FAILED_BACK;
3175       }
3176 
3177     /* Most (*VERB)s are not allowed to be quantified, but an ungreedy
3178     quantifier can be useful for (*ACCEPT) - meaning "succeed on backtrack", a
3179     sort of negated (*COMMIT). We therefore allow (*ACCEPT) to be quantified by
3180     wrapping it in non-capturing brackets, but we have to allow for a preceding
3181     (*MARK) for when (*ACCEPT) has an argument. */
3182 
3183     if (parsed_pattern[-1] == META_ACCEPT)
3184       {
3185       uint32_t *p;
3186       for (p = parsed_pattern - 1; p >= verbstartptr; p--) p[1] = p[0];
3187       *verbstartptr = META_NOCAPTURE;
3188       parsed_pattern[1] = META_KET;
3189       parsed_pattern += 2;
3190       }
3191 
3192     /* Now we can put the quantifier into the parsed pattern vector. At this
3193     stage, we have only the basic quantifier. The check for a following + or ?
3194     modifier happens at the top of the loop, after any intervening comments
3195     have been removed. */
3196 
3197     *parsed_pattern++ = meta_quantifier;
3198     if (c == CHAR_LEFT_CURLY_BRACKET)
3199       {
3200       *parsed_pattern++ = min_repeat;
3201       *parsed_pattern++ = max_repeat;
3202       }
3203     break;
3204 
3205 
3206     /* ---- Character class ---- */
3207 
3208     case CHAR_LEFT_SQUARE_BRACKET:
3209     okquantifier = TRUE;
3210 
3211     /* In another (POSIX) regex library, the ugly syntax [[:<:]] and [[:>:]] is
3212     used for "start of word" and "end of word". As these are otherwise illegal
3213     sequences, we don't break anything by recognizing them. They are replaced
3214     by \b(?=\w) and \b(?<=\w) respectively. Sequences like [a[:<:]] are
3215     erroneous and are handled by the normal code below. */
3216 
3217     if (ptrend - ptr >= 6 &&
3218          (PRIV(strncmp_c8)(ptr, STRING_WEIRD_STARTWORD, 6) == 0 ||
3219           PRIV(strncmp_c8)(ptr, STRING_WEIRD_ENDWORD, 6) == 0))
3220       {
3221       *parsed_pattern++ = META_ESCAPE + ESC_b;
3222 
3223       if (ptr[2] == CHAR_LESS_THAN_SIGN)
3224         {
3225         *parsed_pattern++ = META_LOOKAHEAD;
3226         }
3227       else
3228         {
3229         *parsed_pattern++ = META_LOOKBEHIND;
3230         *has_lookbehind = TRUE;
3231 
3232         /* The offset is used only for the "non-fixed length" error; this won't
3233         occur here, so just store zero. */
3234 
3235         PUTOFFSET((PCRE2_SIZE)0, parsed_pattern);
3236         }
3237 
3238       if ((options & PCRE2_UCP) == 0)
3239         *parsed_pattern++ = META_ESCAPE + ESC_w;
3240       else
3241         {
3242         *parsed_pattern++ = META_ESCAPE + ESC_p;
3243         *parsed_pattern++ = PT_WORD << 16;
3244         }
3245       *parsed_pattern++ = META_KET;
3246       ptr += 6;
3247       break;
3248       }
3249 
3250     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
3251     they are encountered at the top level, so we'll do that too. */
3252 
3253     if (ptr < ptrend && (*ptr == CHAR_COLON || *ptr == CHAR_DOT ||
3254          *ptr == CHAR_EQUALS_SIGN) &&
3255         check_posix_syntax(ptr, ptrend, &tempptr))
3256       {
3257       errorcode = (*ptr-- == CHAR_COLON)? ERR12 : ERR13;
3258       goto FAILED;
3259       }
3260 
3261     /* Process a regular character class. If the first character is '^', set
3262     the negation flag. If the first few characters (either before or after ^)
3263     are \Q\E or \E or space or tab in extended-more mode, we skip them too.
3264     This makes for compatibility with Perl. */
3265 
3266     negate_class = FALSE;
3267     while (ptr < ptrend)
3268       {
3269       GETCHARINCTEST(c, ptr);
3270       if (c == CHAR_BACKSLASH)
3271         {
3272         if (ptr < ptrend && *ptr == CHAR_E) ptr++;
3273         else if (ptrend - ptr >= 3 &&
3274              PRIV(strncmp_c8)(ptr, STR_Q STR_BACKSLASH STR_E, 3) == 0)
3275           ptr += 3;
3276         else
3277           break;
3278         }
3279       else if ((options & PCRE2_EXTENDED_MORE) != 0 &&
3280                (c == CHAR_SPACE || c == CHAR_HT))  /* Note: just these two */
3281         continue;
3282       else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
3283         negate_class = TRUE;
3284       else break;
3285       }
3286 
3287     /* Now the real contents of the class; c has the first "real" character.
3288     Empty classes are permitted only if the option is set. */
3289 
3290     if (c == CHAR_RIGHT_SQUARE_BRACKET &&
3291         (cb->external_options & PCRE2_ALLOW_EMPTY_CLASS) != 0)
3292       {
3293       *parsed_pattern++ = negate_class? META_CLASS_EMPTY_NOT : META_CLASS_EMPTY;
3294       break;  /* End of class processing */
3295       }
3296 
3297     /* Process a non-empty class. */
3298 
3299     *parsed_pattern++ = negate_class? META_CLASS_NOT : META_CLASS;
3300     class_range_state = RANGE_NO;
3301 
3302     /* In an EBCDIC environment, Perl treats alphabetic ranges specially
3303     because there are holes in the encoding, and simply using the range A-Z
3304     (for example) would include the characters in the holes. This applies only
3305     to ranges where both values are literal; [\xC1-\xE9] is different to [A-Z]
3306     in this respect. In order to accommodate this, we keep track of whether
3307     character values are literal or not, and a state variable for handling
3308     ranges. */
3309 
3310     /* Loop for the contents of the class */
3311 
3312     for (;;)
3313       {
3314       BOOL char_is_literal = TRUE;
3315 
3316       /* Inside \Q...\E everything is literal except \E */
3317 
3318       if (inescq)
3319         {
3320         if (c == CHAR_BACKSLASH && ptr < ptrend && *ptr == CHAR_E)
3321           {
3322           inescq = FALSE;                   /* Reset literal state */
3323           ptr++;                            /* Skip the 'E' */
3324           goto CLASS_CONTINUE;
3325           }
3326         goto CLASS_LITERAL;
3327         }
3328 
3329       /* Skip over space and tab (only) in extended-more mode. */
3330 
3331       if ((options & PCRE2_EXTENDED_MORE) != 0 &&
3332           (c == CHAR_SPACE || c == CHAR_HT))
3333         goto CLASS_CONTINUE;
3334 
3335       /* Handle POSIX class names. Perl allows a negation extension of the
3336       form [:^name:]. A square bracket that doesn't match the syntax is
3337       treated as a literal. We also recognize the POSIX constructions
3338       [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
3339       5.6 and 5.8 do. */
3340 
3341       if (c == CHAR_LEFT_SQUARE_BRACKET &&
3342           ptrend - ptr >= 3 &&
3343           (*ptr == CHAR_COLON || *ptr == CHAR_DOT ||
3344            *ptr == CHAR_EQUALS_SIGN) &&
3345           check_posix_syntax(ptr, ptrend, &tempptr))
3346         {
3347         BOOL posix_negate = FALSE;
3348         int posix_class;
3349 
3350         /* Perl treats a hyphen before a POSIX class as a literal, not the
3351         start of a range. However, it gives a warning in its warning mode. PCRE
3352         does not have a warning mode, so we give an error, because this is
3353         likely an error on the user's part. */
3354 
3355         if (class_range_state == RANGE_STARTED)
3356           {
3357           errorcode = ERR50;
3358           goto FAILED;
3359           }
3360 
3361         if (*ptr != CHAR_COLON)
3362           {
3363           errorcode = ERR13;
3364           goto FAILED_BACK;
3365           }
3366 
3367         if (*(++ptr) == CHAR_CIRCUMFLEX_ACCENT)
3368           {
3369           posix_negate = TRUE;
3370           ptr++;
3371           }
3372 
3373         posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
3374         if (posix_class < 0)
3375           {
3376           errorcode = ERR30;
3377           goto FAILED;
3378           }
3379         ptr = tempptr + 2;
3380 
3381         /* Perl treats a hyphen after a POSIX class as a literal, not the
3382         start of a range. However, it gives a warning in its warning mode
3383         unless the hyphen is the last character in the class. PCRE does not
3384         have a warning mode, so we give an error, because this is likely an
3385         error on the user's part. */
3386 
3387         if (ptr < ptrend - 1 && *ptr == CHAR_MINUS &&
3388             ptr[1] != CHAR_RIGHT_SQUARE_BRACKET)
3389           {
3390           errorcode = ERR50;
3391           goto FAILED;
3392           }
3393 
3394         /* Set "a hyphen is not the start of a range" for the -] case, and also
3395         in case the POSIX class is followed by \E or \Q\E (possibly repeated -
3396         fuzzers do that kind of thing) and *then* a hyphen. This causes that
3397         hyphen to be treated as a literal. I don't think it's worth setting up
3398         special apparatus to do otherwise. */
3399 
3400         class_range_state = RANGE_NO;
3401 
3402         /* When PCRE2_UCP is set, some of the POSIX classes are converted to
3403         use Unicode properties \p or \P or, in one case, \h or \H. The
3404         substitutes table has two values per class, containing the type and
3405         value of a \p or \P item. The special cases are specified with a
3406         negative type: a non-zero value causes \h or \H to be used, and a zero
3407         value falls through to behave like a non-UCP POSIX class. */
3408 
3409 #ifdef SUPPORT_UNICODE
3410         if ((options & PCRE2_UCP) != 0)
3411           {
3412           int ptype = posix_substitutes[2*posix_class];
3413           int pvalue = posix_substitutes[2*posix_class + 1];
3414           if (ptype >= 0)
3415             {
3416             *parsed_pattern++ = META_ESCAPE + (posix_negate? ESC_P : ESC_p);
3417             *parsed_pattern++ = (ptype << 16) | pvalue;
3418             goto CLASS_CONTINUE;
3419             }
3420 
3421           if (pvalue != 0)
3422             {
3423             *parsed_pattern++ = META_ESCAPE + (posix_negate? ESC_H : ESC_h);
3424             goto CLASS_CONTINUE;
3425             }
3426 
3427           /* Fall through */
3428           }
3429 #endif  /* SUPPORT_UNICODE */
3430 
3431         /* Non-UCP POSIX class */
3432 
3433         *parsed_pattern++ = posix_negate? META_POSIX_NEG : META_POSIX;
3434         *parsed_pattern++ = posix_class;
3435         }
3436 
3437       /* Handle potential start of range */
3438 
3439       else if (c == CHAR_MINUS && class_range_state >= RANGE_OK_ESCAPED)
3440         {
3441         *parsed_pattern++ = (class_range_state == RANGE_OK_LITERAL)?
3442           META_RANGE_LITERAL : META_RANGE_ESCAPED;
3443         class_range_state = RANGE_STARTED;
3444         }
3445 
3446       /* Handle a literal character */
3447 
3448       else if (c != CHAR_BACKSLASH)
3449         {
3450         CLASS_LITERAL:
3451         if (class_range_state == RANGE_STARTED)
3452           {
3453           if (c == parsed_pattern[-2])       /* Optimize one-char range */
3454             parsed_pattern--;
3455           else if (parsed_pattern[-2] > c)   /* Check range is in order */
3456             {
3457             errorcode = ERR8;
3458             goto FAILED_BACK;
3459             }
3460           else
3461             {
3462             if (!char_is_literal && parsed_pattern[-1] == META_RANGE_LITERAL)
3463               parsed_pattern[-1] = META_RANGE_ESCAPED;
3464             PARSED_LITERAL(c, parsed_pattern);
3465             }
3466           class_range_state = RANGE_NO;
3467           }
3468         else  /* Potential start of range */
3469           {
3470           class_range_state = char_is_literal?
3471             RANGE_OK_LITERAL : RANGE_OK_ESCAPED;
3472           PARSED_LITERAL(c, parsed_pattern);
3473           }
3474         }
3475 
3476       /* Handle escapes in a class */
3477 
3478       else
3479         {
3480         tempptr = ptr;
3481         escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
3482           cb->cx->extra_options, TRUE, cb);
3483 
3484         if (errorcode != 0)
3485           {
3486           if ((extra_options & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) == 0)
3487             goto FAILED;
3488           ptr = tempptr;
3489           if (ptr >= ptrend) c = CHAR_BACKSLASH; else
3490             {
3491             GETCHARINCTEST(c, ptr);   /* Get character value, increment pointer */
3492             }
3493           escape = 0;                 /* Treat as literal character */
3494           }
3495 
3496         switch(escape)
3497           {
3498           case 0:  /* Escaped character code point is in c */
3499           char_is_literal = FALSE;
3500           goto CLASS_LITERAL;
3501 
3502           case ESC_b:
3503           c = CHAR_BS;    /* \b is backspace in a class */
3504           char_is_literal = FALSE;
3505           goto CLASS_LITERAL;
3506 
3507           case ESC_Q:
3508           inescq = TRUE;  /* Enter literal mode */
3509           goto CLASS_CONTINUE;
3510 
3511           case ESC_E:     /* Ignore orphan \E */
3512           goto CLASS_CONTINUE;
3513 
3514           case ESC_B:     /* Always an error in a class */
3515           case ESC_R:
3516           case ESC_X:
3517           errorcode = ERR7;
3518           ptr--;
3519           goto FAILED;
3520           }
3521 
3522         /* The second part of a range can be a single-character escape
3523         sequence (detected above), but not any of the other escapes. Perl
3524         treats a hyphen as a literal in such circumstances. However, in Perl's
3525         warning mode, a warning is given, so PCRE now faults it, as it is
3526         almost certainly a mistake on the user's part. */
3527 
3528         if (class_range_state == RANGE_STARTED)
3529           {
3530           errorcode = ERR50;
3531           goto FAILED;  /* Not CLASS_ESCAPE_FAILED; always an error */
3532           }
3533 
3534         /* Of the remaining escapes, only those that define characters are
3535         allowed in a class. None may start a range. */
3536 
3537         class_range_state = RANGE_NO;
3538         switch(escape)
3539           {
3540           case ESC_N:
3541           errorcode = ERR71;
3542           goto FAILED;
3543 
3544           case ESC_H:
3545           case ESC_h:
3546           case ESC_V:
3547           case ESC_v:
3548           *parsed_pattern++ = META_ESCAPE + escape;
3549           break;
3550 
3551           /* These escapes are converted to Unicode property tests when
3552           PCRE2_UCP is set. */
3553 
3554           case ESC_d:
3555           case ESC_D:
3556           case ESC_s:
3557           case ESC_S:
3558           case ESC_w:
3559           case ESC_W:
3560           if ((options & PCRE2_UCP) == 0)
3561             {
3562             *parsed_pattern++ = META_ESCAPE + escape;
3563             }
3564           else
3565             {
3566             *parsed_pattern++ = META_ESCAPE +
3567               ((escape == ESC_d || escape == ESC_s || escape == ESC_w)?
3568                 ESC_p : ESC_P);
3569             switch(escape)
3570               {
3571               case ESC_d:
3572               case ESC_D:
3573               *parsed_pattern++ = (PT_PC << 16) | ucp_Nd;
3574               break;
3575 
3576               case ESC_s:
3577               case ESC_S:
3578               *parsed_pattern++ = PT_SPACE << 16;
3579               break;
3580 
3581               case ESC_w:
3582               case ESC_W:
3583               *parsed_pattern++ = PT_WORD << 16;
3584               break;
3585               }
3586             }
3587           break;
3588 
3589           /* Explicit Unicode property matching */
3590 
3591           case ESC_P:
3592           case ESC_p:
3593 #ifdef SUPPORT_UNICODE
3594             {
3595             BOOL negated;
3596             uint16_t ptype = 0, pdata = 0;
3597             if (!get_ucp(&ptr, &negated, &ptype, &pdata, &errorcode, cb))
3598               goto FAILED;
3599             if (negated) escape = (escape == ESC_P)? ESC_p : ESC_P;
3600             *parsed_pattern++ = META_ESCAPE + escape;
3601             *parsed_pattern++ = (ptype << 16) | pdata;
3602             }
3603 #else
3604           errorcode = ERR45;
3605           goto FAILED;
3606 #endif
3607           break;  /* End \P and \p */
3608 
3609           default:    /* All others are not allowed in a class */
3610           errorcode = ERR7;
3611           ptr--;
3612           goto FAILED;
3613           }
3614 
3615         /* Perl gives a warning unless a following hyphen is the last character
3616         in the class. PCRE throws an error. */
3617 
3618         if (ptr < ptrend - 1 && *ptr == CHAR_MINUS &&
3619             ptr[1] != CHAR_RIGHT_SQUARE_BRACKET)
3620           {
3621           errorcode = ERR50;
3622           goto FAILED;
3623           }
3624         }
3625 
3626       /* Proceed to next thing in the class. */
3627 
3628       CLASS_CONTINUE:
3629       if (ptr >= ptrend)
3630         {
3631         errorcode = ERR6;  /* Missing terminating ']' */
3632         goto FAILED;
3633         }
3634       GETCHARINCTEST(c, ptr);
3635       if (c == CHAR_RIGHT_SQUARE_BRACKET && !inescq) break;
3636       }     /* End of class-processing loop */
3637 
3638     /* -] at the end of a class is a literal '-' */
3639 
3640     if (class_range_state == RANGE_STARTED)
3641       {
3642       parsed_pattern[-1] = CHAR_MINUS;
3643       class_range_state = RANGE_NO;
3644       }
3645 
3646     *parsed_pattern++ = META_CLASS_END;
3647     break;  /* End of character class */
3648 
3649 
3650     /* ---- Opening parenthesis ---- */
3651 
3652     case CHAR_LEFT_PARENTHESIS:
3653     if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
3654 
3655     /* If ( is not followed by ? it is either a capture or a special verb or an
3656     alpha assertion or a positive non-atomic lookahead. */
3657 
3658     if (*ptr != CHAR_QUESTION_MARK)
3659       {
3660       const char *vn;
3661 
3662       /* Handle capturing brackets (or non-capturing if auto-capture is turned
3663       off). */
3664 
3665       if (*ptr != CHAR_ASTERISK)
3666         {
3667         nest_depth++;
3668         if ((options & PCRE2_NO_AUTO_CAPTURE) == 0)
3669           {
3670           if (cb->bracount >= MAX_GROUP_NUMBER)
3671             {
3672             errorcode = ERR97;
3673             goto FAILED;
3674             }
3675           cb->bracount++;
3676           *parsed_pattern++ = META_CAPTURE | cb->bracount;
3677           }
3678         else *parsed_pattern++ = META_NOCAPTURE;
3679         }
3680 
3681       /* Do nothing for (* followed by end of pattern or ) so it gives a "bad
3682       quantifier" error rather than "(*MARK) must have an argument". */
3683 
3684       else if (ptrend - ptr <= 1 || (c = ptr[1]) == CHAR_RIGHT_PARENTHESIS)
3685         break;
3686 
3687       /* Handle "alpha assertions" such as (*pla:...). Most of these are
3688       synonyms for the historical symbolic assertions, but the script run and
3689       non-atomic lookaround ones are new. They are distinguished by starting
3690       with a lower case letter. Testing via the character-type table (rather
3691       than a literal a-z range) makes this work in all character codes. */
3692 
3693       else if (CHMAX_255(c) && (cb->ctypes[c] & ctype_lcletter) != 0)
3694         {
3695         uint32_t meta;
3696 
3697         vn = alasnames;
3698         if (!read_name(&ptr, ptrend, utf, 0, &offset, &name, &namelen,
3699           &errorcode, cb)) goto FAILED;
3700         if (ptr >= ptrend || *ptr != CHAR_COLON)
3701           {
3702           errorcode = ERR95;  /* Malformed */
3703           goto FAILED;
3704           }
3705 
3706         /* Scan the table of alpha assertion names */
3707 
3708         for (i = 0; i < alascount; i++)
3709           {
3710           if (namelen == alasmeta[i].len &&
3711               PRIV(strncmp_c8)(name, vn, namelen) == 0)
3712             break;
3713           vn += alasmeta[i].len + 1;
3714           }
3715 
3716         if (i >= alascount)
3717           {
3718           errorcode = ERR95;  /* Alpha assertion not recognized */
3719           goto FAILED;
3720           }
3721 
3722         /* Check for expecting an assertion condition. If so, only atomic
3723         lookaround assertions are valid. */
3724 
3725         meta = alasmeta[i].meta;
3726         if (prev_expect_cond_assert > 0 &&
3727             (meta < META_LOOKAHEAD || meta > META_LOOKBEHINDNOT))
3728           {
3729           errorcode = (meta == META_LOOKAHEAD_NA || meta == META_LOOKBEHIND_NA)?
3730             ERR98 : ERR28;  /* (Atomic) assertion expected */
3731           goto FAILED;
3732           }
3733 
3734         /* The lookaround alphabetic synonyms can mostly be handled by jumping
3735         to the code that handles the traditional symbolic forms. */
3736 
3737         switch(meta)
3738           {
3739           default:
3740           errorcode = ERR89;  /* Unknown code; should never occur because */
3741           goto FAILED;        /* the meta values come from a table above. */
3742 
3743           case META_ATOMIC:
3744           goto ATOMIC_GROUP;
3745 
3746           case META_LOOKAHEAD:
3747           goto POSITIVE_LOOK_AHEAD;
3748 
3749           case META_LOOKAHEAD_NA:
3750           goto POSITIVE_NONATOMIC_LOOK_AHEAD;
3751 
3752           case META_LOOKAHEADNOT:
3753           goto NEGATIVE_LOOK_AHEAD;
3754 
3755           case META_LOOKBEHIND:
3756           case META_LOOKBEHINDNOT:
3757           case META_LOOKBEHIND_NA:
3758           *parsed_pattern++ = meta;
3759           ptr--;
3760           goto POST_LOOKBEHIND;
3761 
3762           /* The script run facilities are handled here. Unicode support is
3763           required (give an error if not, as this is a security issue). Always
3764           record a META_SCRIPT_RUN item. Then, for the atomic version, insert
3765           META_ATOMIC and remember that we need two META_KETs at the end. */
3766 
3767           case META_SCRIPT_RUN:
3768           case META_ATOMIC_SCRIPT_RUN:
3769 #ifdef SUPPORT_UNICODE
3770           *parsed_pattern++ = META_SCRIPT_RUN;
3771           nest_depth++;
3772           ptr++;
3773           if (meta == META_ATOMIC_SCRIPT_RUN)
3774             {
3775             *parsed_pattern++ = META_ATOMIC;
3776             if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
3777             else if (++top_nest >= end_nests)
3778               {
3779               errorcode = ERR84;
3780               goto FAILED;
3781               }
3782             top_nest->nest_depth = nest_depth;
3783             top_nest->flags = NSF_ATOMICSR;
3784             top_nest->options = options & PARSE_TRACKED_OPTIONS;
3785             }
3786           break;
3787 #else  /* SUPPORT_UNICODE */
3788           errorcode = ERR96;
3789           goto FAILED;
3790 #endif
3791           }
3792         }
3793 
3794 
3795       /* ---- Handle (*VERB) and (*VERB:NAME) ---- */
3796 
3797       else
3798         {
3799         vn = verbnames;
3800         if (!read_name(&ptr, ptrend, utf, 0, &offset, &name, &namelen,
3801           &errorcode, cb)) goto FAILED;
3802         if (ptr >= ptrend || (*ptr != CHAR_COLON &&
3803                               *ptr != CHAR_RIGHT_PARENTHESIS))
3804           {
3805           errorcode = ERR60;  /* Malformed */
3806           goto FAILED;
3807           }
3808 
3809         /* Scan the table of verb names */
3810 
3811         for (i = 0; i < verbcount; i++)
3812           {
3813           if (namelen == verbs[i].len &&
3814               PRIV(strncmp_c8)(name, vn, namelen) == 0)
3815             break;
3816           vn += verbs[i].len + 1;
3817           }
3818 
3819         if (i >= verbcount)
3820           {
3821           errorcode = ERR60;  /* Verb not recognized */
3822           goto FAILED;
3823           }
3824 
3825         /* An empty argument is treated as no argument. */
3826 
3827         if (*ptr == CHAR_COLON && ptr + 1 < ptrend &&
3828              ptr[1] == CHAR_RIGHT_PARENTHESIS)
3829           ptr++;    /* Advance to the closing parens */
3830 
3831         /* Check for mandatory non-empty argument; this is (*MARK) */
3832 
3833         if (verbs[i].has_arg > 0 && *ptr != CHAR_COLON)
3834           {
3835           errorcode = ERR66;
3836           goto FAILED;
3837           }
3838 
3839         /* Remember where this verb, possibly with a preceding (*MARK), starts,
3840         for handling quantified (*ACCEPT). */
3841 
3842         verbstartptr = parsed_pattern;
3843         okquantifier = (verbs[i].meta == META_ACCEPT);
3844 
3845         /* It appears that Perl allows any characters whatsoever, other than a
3846         closing parenthesis, to appear in arguments ("names"), so we no longer
3847         insist on letters, digits, and underscores. Perl does not, however, do
3848         any interpretation within arguments, and has no means of including a
3849         closing parenthesis. PCRE supports escape processing but only when it
3850         is requested by an option. We set inverbname TRUE here, and let the
3851         main loop take care of this so that escape and \x processing is done by
3852         the main code above. */
3853 
3854         if (*ptr++ == CHAR_COLON)   /* Skip past : or ) */
3855           {
3856           /* Some optional arguments can be treated as a preceding (*MARK) */
3857 
3858           if (verbs[i].has_arg < 0)
3859             {
3860             add_after_mark = verbs[i].meta;
3861             *parsed_pattern++ = META_MARK;
3862             }
3863 
3864           /* The remaining verbs with arguments (except *MARK) need a different
3865           opcode. */
3866 
3867           else
3868             {
3869             *parsed_pattern++ = verbs[i].meta +
3870               ((verbs[i].meta != META_MARK)? 0x00010000u:0);
3871             }
3872 
3873           /* Set up for reading the name in the main loop. */
3874 
3875           verblengthptr = parsed_pattern++;
3876           verbnamestart = ptr;
3877           inverbname = TRUE;
3878           }
3879         else  /* No verb "name" argument */
3880           {
3881           *parsed_pattern++ = verbs[i].meta;
3882           }
3883         }     /* End of (*VERB) handling */
3884       break;  /* Done with this parenthesis */
3885       }       /* End of groups that don't start with (? */
3886 
3887 
3888     /* ---- Items starting (? ---- */
3889 
3890     /* The type of item is determined by what follows (?. Handle (?| and option
3891     changes under "default" because both need a new block on the nest stack.
3892     Comments starting with (?# are handled above. Note that there is some
3893     ambiguity about the sequence (?- because if a digit follows it's a relative
3894     recursion or subroutine call whereas otherwise it's an option unsetting. */
3895 
3896     if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
3897 
3898     switch(*ptr)
3899       {
3900       default:
3901       if (*ptr == CHAR_MINUS && ptrend - ptr > 1 && IS_DIGIT(ptr[1]))
3902         goto RECURSION_BYNUMBER;  /* The + case is handled by CHAR_PLUS */
3903 
3904       /* We now have either (?| or a (possibly empty) option setting,
3905       optionally followed by a non-capturing group. */
3906 
3907       nest_depth++;
3908       if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
3909       else if (++top_nest >= end_nests)
3910         {
3911         errorcode = ERR84;
3912         goto FAILED;
3913         }
3914       top_nest->nest_depth = nest_depth;
3915       top_nest->flags = 0;
3916       top_nest->options = options & PARSE_TRACKED_OPTIONS;
3917 
3918       /* Start of non-capturing group that resets the capture count for each
3919       branch. */
3920 
3921       if (*ptr == CHAR_VERTICAL_LINE)
3922         {
3923         top_nest->reset_group = (uint16_t)cb->bracount;
3924         top_nest->max_group = (uint16_t)cb->bracount;
3925         top_nest->flags |= NSF_RESET;
3926         cb->external_flags |= PCRE2_DUPCAPUSED;
3927         *parsed_pattern++ = META_NOCAPTURE;
3928         ptr++;
3929         }
3930 
3931       /* Scan for options imnsxJU to be set or unset. */
3932 
3933       else
3934         {
3935         BOOL hyphenok = TRUE;
3936         uint32_t oldoptions = options;
3937 
3938         top_nest->reset_group = 0;
3939         top_nest->max_group = 0;
3940         set = unset = 0;
3941         optset = &set;
3942 
3943         /* ^ at the start unsets imnsx and disables the subsequent use of - */
3944 
3945         if (ptr < ptrend && *ptr == CHAR_CIRCUMFLEX_ACCENT)
3946           {
3947           options &= ~(PCRE2_CASELESS|PCRE2_MULTILINE|PCRE2_NO_AUTO_CAPTURE|
3948                        PCRE2_DOTALL|PCRE2_EXTENDED|PCRE2_EXTENDED_MORE);
3949           hyphenok = FALSE;
3950           ptr++;
3951           }
3952 
3953         while (ptr < ptrend && *ptr != CHAR_RIGHT_PARENTHESIS &&
3954                                *ptr != CHAR_COLON)
3955           {
3956           switch (*ptr++)
3957             {
3958             case CHAR_MINUS:
3959             if (!hyphenok)
3960               {
3961               errorcode = ERR94;
3962               ptr--;  /* Correct the offset */
3963               goto FAILED;
3964               }
3965             optset = &unset;
3966             hyphenok = FALSE;
3967             break;
3968 
3969             case CHAR_J:  /* Record that it changed in the external options */
3970             *optset |= PCRE2_DUPNAMES;
3971             cb->external_flags |= PCRE2_JCHANGED;
3972             break;
3973 
3974             case CHAR_i: *optset |= PCRE2_CASELESS; break;
3975             case CHAR_m: *optset |= PCRE2_MULTILINE; break;
3976             case CHAR_n: *optset |= PCRE2_NO_AUTO_CAPTURE; break;
3977             case CHAR_s: *optset |= PCRE2_DOTALL; break;
3978             case CHAR_U: *optset |= PCRE2_UNGREEDY; break;
3979 
3980             /* If x appears twice (xx) it also sets the extended-more option. */
3981 
3982             case CHAR_x:
3983             *optset |= PCRE2_EXTENDED;
3984             if (ptr < ptrend && *ptr == CHAR_x)
3985               {
3986               *optset |= PCRE2_EXTENDED_MORE;
3987               ptr++;
3988               }
3989             break;
3990 
3991             default:
3992             errorcode = ERR11;
3993             ptr--;    /* Correct the offset */
3994             goto FAILED;
3995             }
3996           }
3997 
3998         /* If we are setting extended without extended-more, ensure that any
3999         existing extended-more gets unset. Also, unsetting extended must also
4000         unset extended-more. */
4001 
4002         if ((set & (PCRE2_EXTENDED|PCRE2_EXTENDED_MORE)) == PCRE2_EXTENDED ||
4003             (unset & PCRE2_EXTENDED) != 0)
4004           unset |= PCRE2_EXTENDED_MORE;
4005 
4006         options = (options | set) & (~unset);
4007 
4008         /* If the options ended with ')' this is not the start of a nested
4009         group with option changes, so the options change at this level.
4010         In this case, if the previous level set up a nest block, discard the
4011         one we have just created. Otherwise adjust it for the previous level.
4012         If the options ended with ':' we are starting a non-capturing group,
4013         possibly with an options setting. */
4014 
4015         if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4016         if (*ptr++ == CHAR_RIGHT_PARENTHESIS)
4017           {
4018           nest_depth--;  /* This is not a nested group after all. */
4019           if (top_nest > (nest_save *)(cb->start_workspace) &&
4020               (top_nest-1)->nest_depth == nest_depth) top_nest--;
4021           else top_nest->nest_depth = nest_depth;
4022           }
4023         else *parsed_pattern++ = META_NOCAPTURE;
4024 
4025         /* If nothing changed, no need to record. */
4026 
4027         if (options != oldoptions)
4028           {
4029           *parsed_pattern++ = META_OPTIONS;
4030           *parsed_pattern++ = options;
4031           }
4032         }     /* End options processing */
4033       break;  /* End default case after (? */
4034 
4035 
4036       /* ---- Python syntax support ---- */
4037 
4038       case CHAR_P:
4039       if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4040 
4041       /* (?P<name> is the same as (?<name>, which defines a named group. */
4042 
4043       if (*ptr == CHAR_LESS_THAN_SIGN)
4044         {
4045         terminator = CHAR_GREATER_THAN_SIGN;
4046         goto DEFINE_NAME;
4047         }
4048 
4049       /* (?P>name) is the same as (?&name), which is a recursion or subroutine
4050       call. */
4051 
4052       if (*ptr == CHAR_GREATER_THAN_SIGN) goto RECURSE_BY_NAME;
4053 
4054       /* (?P=name) is the same as \k<name>, a back reference by name. Anything
4055       else after (?P is an error. */
4056 
4057       if (*ptr != CHAR_EQUALS_SIGN)
4058         {
4059         errorcode = ERR41;
4060         goto FAILED;
4061         }
4062       if (!read_name(&ptr, ptrend, utf, CHAR_RIGHT_PARENTHESIS, &offset, &name,
4063           &namelen, &errorcode, cb)) goto FAILED;
4064       *parsed_pattern++ = META_BACKREF_BYNAME;
4065       *parsed_pattern++ = namelen;
4066       PUTOFFSET(offset, parsed_pattern);
4067       okquantifier = TRUE;
4068       break;   /* End of (?P processing */
4069 
4070 
4071       /* ---- Recursion/subroutine calls by number ---- */
4072 
4073       case CHAR_R:
4074       i = 0;         /* (?R) == (?R0) */
4075       ptr++;
4076       if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4077         {
4078         errorcode = ERR58;
4079         goto FAILED;
4080         }
4081       goto SET_RECURSION;
4082 
4083       /* An item starting (?- followed by a digit comes here via the "default"
4084       case because (?- followed by a non-digit is an options setting. */
4085 
4086       case CHAR_PLUS:
4087       if (ptrend - ptr < 2 || !IS_DIGIT(ptr[1]))
4088         {
4089         errorcode = ERR29;   /* Missing number */
4090         goto FAILED;
4091         }
4092       /* Fall through */
4093 
4094       case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
4095       case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
4096       RECURSION_BYNUMBER:
4097       if (!read_number(&ptr, ptrend,
4098           (IS_DIGIT(*ptr))? -1:(int)(cb->bracount), /* + and - are relative */
4099           MAX_GROUP_NUMBER, ERR61,
4100           &i, &errorcode)) goto FAILED;
4101       if (i < 0)  /* NB (?0) is permitted */
4102         {
4103         errorcode = ERR15;   /* Unknown group */
4104         goto FAILED_BACK;
4105         }
4106       if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4107         goto UNCLOSED_PARENTHESIS;
4108 
4109       SET_RECURSION:
4110       *parsed_pattern++ = META_RECURSE | (uint32_t)i;
4111       offset = (PCRE2_SIZE)(ptr - cb->start_pattern);
4112       ptr++;
4113       PUTOFFSET(offset, parsed_pattern);
4114       okquantifier = TRUE;
4115       break;  /* End of recursive call by number handling */
4116 
4117 
4118       /* ---- Recursion/subroutine calls by name ---- */
4119 
4120       case CHAR_AMPERSAND:
4121       RECURSE_BY_NAME:
4122       if (!read_name(&ptr, ptrend, utf, CHAR_RIGHT_PARENTHESIS, &offset, &name,
4123           &namelen, &errorcode, cb)) goto FAILED;
4124       *parsed_pattern++ = META_RECURSE_BYNAME;
4125       *parsed_pattern++ = namelen;
4126       PUTOFFSET(offset, parsed_pattern);
4127       okquantifier = TRUE;
4128       break;
4129 
4130       /* ---- Callout with numerical or string argument ---- */
4131 
4132       case CHAR_C:
4133       if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4134 
4135       /* If the previous item was a condition starting (?(? an assertion,
4136       optionally preceded by a callout, is expected. This is checked later on,
4137       during actual compilation. However we need to identify this kind of
4138       assertion in this pass because it must not be qualified. The value of
4139       expect_cond_assert is set to 2 after (?(? is processed. We decrement it
4140       for a callout - still leaving a positive value that identifies the
4141       assertion. Multiple callouts or any other items will make it zero or
4142       less, which doesn't matter because they will cause an error later. */
4143 
4144       expect_cond_assert = prev_expect_cond_assert - 1;
4145 
4146       /* If previous_callout is not NULL, it means this follows a previous
4147       callout. If it was a manual callout, do nothing; this means its "length
4148       of next pattern item" field will remain zero. If it was an automatic
4149       callout, abolish it. */
4150 
4151       if (previous_callout != NULL && (options & PCRE2_AUTO_CALLOUT) != 0 &&
4152           previous_callout == parsed_pattern - 4 &&
4153           parsed_pattern[-1] == 255)
4154         parsed_pattern = previous_callout;
4155 
4156       /* Save for updating next pattern item length, and skip one item before
4157       completing. */
4158 
4159       previous_callout = parsed_pattern;
4160       after_manual_callout = 1;
4161 
4162       /* Handle a string argument; specific delimiter is required. */
4163 
4164       if (*ptr != CHAR_RIGHT_PARENTHESIS && !IS_DIGIT(*ptr))
4165         {
4166         PCRE2_SIZE calloutlength;
4167         PCRE2_SPTR startptr = ptr;
4168 
4169         delimiter = 0;
4170         for (i = 0; PRIV(callout_start_delims)[i] != 0; i++)
4171           {
4172           if (*ptr == PRIV(callout_start_delims)[i])
4173             {
4174             delimiter = PRIV(callout_end_delims)[i];
4175             break;
4176             }
4177           }
4178         if (delimiter == 0)
4179           {
4180           errorcode = ERR82;
4181           goto FAILED;
4182           }
4183 
4184         *parsed_pattern = META_CALLOUT_STRING;
4185         parsed_pattern += 3;   /* Skip pattern info */
4186 
4187         for (;;)
4188           {
4189           if (++ptr >= ptrend)
4190             {
4191             errorcode = ERR81;
4192             ptr = startptr;   /* To give a more useful message */
4193             goto FAILED;
4194             }
4195           if (*ptr == delimiter && (++ptr >= ptrend || *ptr != delimiter))
4196             break;
4197           }
4198 
4199         calloutlength = (PCRE2_SIZE)(ptr - startptr);
4200         if (calloutlength > UINT32_MAX)
4201           {
4202           errorcode = ERR72;
4203           goto FAILED;
4204           }
4205         *parsed_pattern++ = (uint32_t)calloutlength;
4206         offset = (PCRE2_SIZE)(startptr - cb->start_pattern);
4207         PUTOFFSET(offset, parsed_pattern);
4208         }
4209 
4210       /* Handle a callout with an optional numerical argument, which must be
4211       less than or equal to 255. A missing argument gives 0. */
4212 
4213       else
4214         {
4215         int n = 0;
4216         *parsed_pattern = META_CALLOUT_NUMBER;     /* Numerical callout */
4217         parsed_pattern += 3;                       /* Skip pattern info */
4218         while (ptr < ptrend && IS_DIGIT(*ptr))
4219           {
4220           n = n * 10 + *ptr++ - CHAR_0;
4221           if (n > 255)
4222             {
4223             errorcode = ERR38;
4224             goto FAILED;
4225             }
4226           }
4227         *parsed_pattern++ = n;
4228         }
4229 
4230       /* Both formats must have a closing parenthesis */
4231 
4232       if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4233         {
4234         errorcode = ERR39;
4235         goto FAILED;
4236         }
4237       ptr++;
4238 
4239       /* Remember the offset to the next item in the pattern, and set a default
4240       length. This should get updated after the next item is read. */
4241 
4242       previous_callout[1] = (uint32_t)(ptr - cb->start_pattern);
4243       previous_callout[2] = 0;
4244       break;                  /* End callout */
4245 
4246 
4247       /* ---- Conditional group ---- */
4248 
4249       /* A condition can be an assertion, a number (referring to a numbered
4250       group's having been set), a name (referring to a named group), or 'R',
4251       referring to overall recursion. R<digits> and R&name are also permitted
4252       for recursion state tests. Numbers may be preceded by + or - to specify a
4253       relative group number.
4254 
4255       There are several syntaxes for testing a named group: (?(name)) is used
4256       by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
4257 
4258       There are two unfortunate ambiguities. 'R' can be the recursive thing or
4259       the name 'R' (and similarly for 'R' followed by digits). 'DEFINE' can be
4260       the Perl DEFINE feature or the Python named test. We look for a name
4261       first; if not found, we try the other case.
4262 
4263       For compatibility with auto-callouts, we allow a callout to be specified
4264       before a condition that is an assertion. */
4265 
4266       case CHAR_LEFT_PARENTHESIS:
4267       if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4268       nest_depth++;
4269 
4270       /* If the next character is ? or * there must be an assertion next
4271       (optionally preceded by a callout). We do not check this here, but
4272       instead we set expect_cond_assert to 2. If this is still greater than
4273       zero (callouts decrement it) when the next assertion is read, it will be
4274       marked as a condition that must not be repeated. A value greater than
4275       zero also causes checking that an assertion (possibly with callout)
4276       follows. */
4277 
4278       if (*ptr == CHAR_QUESTION_MARK || *ptr == CHAR_ASTERISK)
4279         {
4280         *parsed_pattern++ = META_COND_ASSERT;
4281         ptr--;   /* Pull pointer back to the opening parenthesis. */
4282         expect_cond_assert = 2;
4283         break;  /* End of conditional */
4284         }
4285 
4286       /* Handle (?([+-]number)... */
4287 
4288       if (read_number(&ptr, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &i,
4289           &errorcode))
4290         {
4291         if (i <= 0)
4292           {
4293           errorcode = ERR15;
4294           goto FAILED;
4295           }
4296         *parsed_pattern++ = META_COND_NUMBER;
4297         offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 2);
4298         PUTOFFSET(offset, parsed_pattern);
4299         *parsed_pattern++ = i;
4300         }
4301       else if (errorcode != 0) goto FAILED;   /* Number too big */
4302 
4303       /* No number found. Handle the special case (?(VERSION[>]=n.m)... */
4304 
4305       else if (ptrend - ptr >= 10 &&
4306                PRIV(strncmp_c8)(ptr, STRING_VERSION, 7) == 0 &&
4307                ptr[7] != CHAR_RIGHT_PARENTHESIS)
4308         {
4309         uint32_t ge = 0;
4310         int major = 0;
4311         int minor = 0;
4312 
4313         ptr += 7;
4314         if (*ptr == CHAR_GREATER_THAN_SIGN)
4315           {
4316           ge = 1;
4317           ptr++;
4318           }
4319 
4320         /* NOTE: cannot write IS_DIGIT(*(++ptr)) here because IS_DIGIT
4321         references its argument twice. */
4322 
4323         if (*ptr != CHAR_EQUALS_SIGN || (ptr++, !IS_DIGIT(*ptr)))
4324           goto BAD_VERSION_CONDITION;
4325 
4326         if (!read_number(&ptr, ptrend, -1, 1000, ERR79, &major, &errorcode))
4327           goto FAILED;
4328 
4329         if (ptr >= ptrend) goto BAD_VERSION_CONDITION;
4330         if (*ptr == CHAR_DOT)
4331           {
4332           if (++ptr >= ptrend || !IS_DIGIT(*ptr)) goto BAD_VERSION_CONDITION;
4333           minor = (*ptr++ - CHAR_0) * 10;
4334           if (ptr >= ptrend) goto BAD_VERSION_CONDITION;
4335           if (IS_DIGIT(*ptr)) minor += *ptr++ - CHAR_0;
4336           if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4337             goto BAD_VERSION_CONDITION;
4338           }
4339 
4340         *parsed_pattern++ = META_COND_VERSION;
4341         *parsed_pattern++ = ge;
4342         *parsed_pattern++ = major;
4343         *parsed_pattern++ = minor;
4344         }
4345 
4346       /* All the remaining cases now require us to read a name. We cannot at
4347       this stage distinguish ambiguous cases such as (?(R12) which might be a
4348       recursion test by number or a name, because the named groups have not yet
4349       all been identified. Those cases are treated as names, but given a
4350       different META code. */
4351 
4352       else
4353         {
4354         BOOL was_r_ampersand = FALSE;
4355 
4356         if (*ptr == CHAR_R && ptrend - ptr > 1 && ptr[1] == CHAR_AMPERSAND)
4357           {
4358           terminator = CHAR_RIGHT_PARENTHESIS;
4359           was_r_ampersand = TRUE;
4360           ptr++;
4361           }
4362         else if (*ptr == CHAR_LESS_THAN_SIGN)
4363           terminator = CHAR_GREATER_THAN_SIGN;
4364         else if (*ptr == CHAR_APOSTROPHE)
4365           terminator = CHAR_APOSTROPHE;
4366         else
4367           {
4368           terminator = CHAR_RIGHT_PARENTHESIS;
4369           ptr--;   /* Point to char before name */
4370           }
4371         if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,
4372             &errorcode, cb)) goto FAILED;
4373 
4374         /* Handle (?(R&name) */
4375 
4376         if (was_r_ampersand)
4377           {
4378           *parsed_pattern = META_COND_RNAME;
4379           ptr--;   /* Back to closing parens */
4380           }
4381 
4382         /* Handle (?(name). If the name is "DEFINE" we identify it with a
4383         special code. Likewise if the name consists of R followed only by
4384         digits. Otherwise, handle it like a quoted name. */
4385 
4386         else if (terminator == CHAR_RIGHT_PARENTHESIS)
4387           {
4388           if (namelen == 6 && PRIV(strncmp_c8)(name, STRING_DEFINE, 6) == 0)
4389             *parsed_pattern = META_COND_DEFINE;
4390           else
4391             {
4392             for (i = 1; i < (int)namelen; i++)
4393               if (!IS_DIGIT(name[i])) break;
4394             *parsed_pattern = (*name == CHAR_R && i >= (int)namelen)?
4395               META_COND_RNUMBER : META_COND_NAME;
4396             }
4397           ptr--;   /* Back to closing parens */
4398           }
4399 
4400         /* Handle (?('name') or (?(<name>) */
4401 
4402         else *parsed_pattern = META_COND_NAME;
4403 
4404         /* All these cases except DEFINE end with the name length and offset;
4405         DEFINE just has an offset (for the "too many branches" error). */
4406 
4407         if (*parsed_pattern++ != META_COND_DEFINE) *parsed_pattern++ = namelen;
4408         PUTOFFSET(offset, parsed_pattern);
4409         }  /* End cases that read a name */
4410 
4411       /* Check the closing parenthesis of the condition */
4412 
4413       if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4414         {
4415         errorcode = ERR24;
4416         goto FAILED;
4417         }
4418       ptr++;
4419       break;  /* End of condition processing */
4420 
4421 
4422       /* ---- Atomic group ---- */
4423 
4424       case CHAR_GREATER_THAN_SIGN:
4425       ATOMIC_GROUP:                          /* Come from (*atomic: */
4426       *parsed_pattern++ = META_ATOMIC;
4427       nest_depth++;
4428       ptr++;
4429       break;
4430 
4431 
4432       /* ---- Lookahead assertions ---- */
4433 
4434       case CHAR_EQUALS_SIGN:
4435       POSITIVE_LOOK_AHEAD:                   /* Come from (*pla: */
4436       *parsed_pattern++ = META_LOOKAHEAD;
4437       ptr++;
4438       goto POST_ASSERTION;
4439 
4440       case CHAR_ASTERISK:
4441       POSITIVE_NONATOMIC_LOOK_AHEAD:         /* Come from (?* */
4442       *parsed_pattern++ = META_LOOKAHEAD_NA;
4443       ptr++;
4444       goto POST_ASSERTION;
4445 
4446       case CHAR_EXCLAMATION_MARK:
4447       NEGATIVE_LOOK_AHEAD:                   /* Come from (*nla: */
4448       *parsed_pattern++ = META_LOOKAHEADNOT;
4449       ptr++;
4450       goto POST_ASSERTION;
4451 
4452 
4453       /* ---- Lookbehind assertions ---- */
4454 
4455       /* (?< followed by = or ! or * is a lookbehind assertion. Otherwise (?<
4456       is the start of the name of a capturing group. */
4457 
4458       case CHAR_LESS_THAN_SIGN:
4459       if (ptrend - ptr <= 1 ||
4460          (ptr[1] != CHAR_EQUALS_SIGN &&
4461           ptr[1] != CHAR_EXCLAMATION_MARK &&
4462           ptr[1] != CHAR_ASTERISK))
4463         {
4464         terminator = CHAR_GREATER_THAN_SIGN;
4465         goto DEFINE_NAME;
4466         }
4467       *parsed_pattern++ = (ptr[1] == CHAR_EQUALS_SIGN)?
4468         META_LOOKBEHIND : (ptr[1] == CHAR_EXCLAMATION_MARK)?
4469         META_LOOKBEHINDNOT : META_LOOKBEHIND_NA;
4470 
4471       POST_LOOKBEHIND:           /* Come from (*plb: (*naplb: and (*nlb: */
4472       *has_lookbehind = TRUE;
4473       offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 2);
4474       PUTOFFSET(offset, parsed_pattern);
4475       ptr += 2;
4476       /* Fall through */
4477 
4478       /* If the previous item was a condition starting (?(? an assertion,
4479       optionally preceded by a callout, is expected. This is checked later on,
4480       during actual compilation. However we need to identify this kind of
4481       assertion in this pass because it must not be qualified. The value of
4482       expect_cond_assert is set to 2 after (?(? is processed. We decrement it
4483       for a callout - still leaving a positive value that identifies the
4484       assertion. Multiple callouts or any other items will make it zero or
4485       less, which doesn't matter because they will cause an error later. */
4486 
4487       POST_ASSERTION:
4488       nest_depth++;
4489       if (prev_expect_cond_assert > 0)
4490         {
4491         if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
4492         else if (++top_nest >= end_nests)
4493           {
4494           errorcode = ERR84;
4495           goto FAILED;
4496           }
4497         top_nest->nest_depth = nest_depth;
4498         top_nest->flags = NSF_CONDASSERT;
4499         top_nest->options = options & PARSE_TRACKED_OPTIONS;
4500         }
4501       break;
4502 
4503 
4504       /* ---- Define a named group ---- */
4505 
4506       /* A named group may be defined as (?'name') or (?<name>). In the latter
4507       case we jump to DEFINE_NAME from the disambiguation of (?< above with the
4508       terminator set to '>'. */
4509 
4510       case CHAR_APOSTROPHE:
4511       terminator = CHAR_APOSTROPHE;    /* Terminator */
4512 
4513       DEFINE_NAME:
4514       if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,
4515           &errorcode, cb)) goto FAILED;
4516 
4517       /* We have a name for this capturing group. It is also assigned a number,
4518       which is its primary means of identification. */
4519 
4520       if (cb->bracount >= MAX_GROUP_NUMBER)
4521         {
4522         errorcode = ERR97;
4523         goto FAILED;
4524         }
4525       cb->bracount++;
4526       *parsed_pattern++ = META_CAPTURE | cb->bracount;
4527       nest_depth++;
4528 
4529       /* Check not too many names */
4530 
4531       if (cb->names_found >= MAX_NAME_COUNT)
4532         {
4533         errorcode = ERR49;
4534         goto FAILED;
4535         }
4536 
4537       /* Adjust the entry size to accommodate the longest name found. */
4538 
4539       if (namelen + IMM2_SIZE + 1 > cb->name_entry_size)
4540         cb->name_entry_size = (uint16_t)(namelen + IMM2_SIZE + 1);
4541 
4542       /* Scan the list to check for duplicates. For duplicate names, if the
4543       number is the same, break the loop, which causes the name to be
4544       discarded; otherwise, if DUPNAMES is not set, give an error.
4545       If it is set, allow the name with a different number, but continue
4546       scanning in case this is a duplicate with the same number. For
4547       non-duplicate names, give an error if the number is duplicated. */
4548 
4549       isdupname = FALSE;
4550       ng = cb->named_groups;
4551       for (i = 0; i < cb->names_found; i++, ng++)
4552         {
4553         if (namelen == ng->length &&
4554             PRIV(strncmp)(name, ng->name, (PCRE2_SIZE)namelen) == 0)
4555           {
4556           if (ng->number == cb->bracount) break;
4557           if ((options & PCRE2_DUPNAMES) == 0)
4558             {
4559             errorcode = ERR43;
4560             goto FAILED;
4561             }
4562           isdupname = ng->isdup = TRUE;     /* Mark as a duplicate */
4563           cb->dupnames = TRUE;              /* Duplicate names exist */
4564           }
4565         else if (ng->number == cb->bracount)
4566           {
4567           errorcode = ERR65;
4568           goto FAILED;
4569           }
4570         }
4571 
4572       if (i < cb->names_found) break;   /* Ignore duplicate with same number */
4573 
4574       /* Increase the list size if necessary */
4575 
4576       if (cb->names_found >= cb->named_group_list_size)
4577         {
4578         uint32_t newsize = cb->named_group_list_size * 2;
4579         named_group *newspace =
4580           cb->cx->memctl.malloc(newsize * sizeof(named_group),
4581           cb->cx->memctl.memory_data);
4582         if (newspace == NULL)
4583           {
4584           errorcode = ERR21;
4585           goto FAILED;
4586           }
4587 
4588         memcpy(newspace, cb->named_groups,
4589           cb->named_group_list_size * sizeof(named_group));
4590         if (cb->named_group_list_size > NAMED_GROUP_LIST_SIZE)
4591           cb->cx->memctl.free((void *)cb->named_groups,
4592           cb->cx->memctl.memory_data);
4593         cb->named_groups = newspace;
4594         cb->named_group_list_size = newsize;
4595         }
4596 
4597       /* Add this name to the list */
4598 
4599       cb->named_groups[cb->names_found].name = name;
4600       cb->named_groups[cb->names_found].length = (uint16_t)namelen;
4601       cb->named_groups[cb->names_found].number = cb->bracount;
4602       cb->named_groups[cb->names_found].isdup = (uint16_t)isdupname;
4603       cb->names_found++;
4604       break;
4605       }        /* End of (? switch */
4606     break;     /* End of ( handling */
4607 
4608 
4609     /* ---- Branch terminators ---- */
4610 
4611     /* Alternation: reset the capture count if we are in a (?| group. */
4612 
4613     case CHAR_VERTICAL_LINE:
4614     if (top_nest != NULL && top_nest->nest_depth == nest_depth &&
4615         (top_nest->flags & NSF_RESET) != 0)
4616       {
4617       if (cb->bracount > top_nest->max_group)
4618         top_nest->max_group = (uint16_t)cb->bracount;
4619       cb->bracount = top_nest->reset_group;
4620       }
4621     *parsed_pattern++ = META_ALT;
4622     break;
4623 
4624     /* End of group; reset the capture count to the maximum if we are in a (?|
4625     group and/or reset the options that are tracked during parsing. Disallow
4626     quantifier for a condition that is an assertion. */
4627 
4628     case CHAR_RIGHT_PARENTHESIS:
4629     okquantifier = TRUE;
4630     if (top_nest != NULL && top_nest->nest_depth == nest_depth)
4631       {
4632       options = (options & ~PARSE_TRACKED_OPTIONS) | top_nest->options;
4633       if ((top_nest->flags & NSF_RESET) != 0 &&
4634           top_nest->max_group > cb->bracount)
4635         cb->bracount = top_nest->max_group;
4636       if ((top_nest->flags & NSF_CONDASSERT) != 0)
4637         okquantifier = FALSE;
4638 
4639       if ((top_nest->flags & NSF_ATOMICSR) != 0)
4640         {
4641         *parsed_pattern++ = META_KET;
4642         }
4643 
4644       if (top_nest == (nest_save *)(cb->start_workspace)) top_nest = NULL;
4645         else top_nest--;
4646       }
4647     if (nest_depth == 0)    /* Unmatched closing parenthesis */
4648       {
4649       errorcode = ERR22;
4650       goto FAILED_BACK;
4651       }
4652     nest_depth--;
4653     *parsed_pattern++ = META_KET;
4654     break;
4655     }  /* End of switch on pattern character */
4656   }    /* End of main character scan loop */
4657 
4658 /* End of pattern reached. Check for missing ) at the end of a verb name. */
4659 
4660 if (inverbname && ptr >= ptrend)
4661   {
4662   errorcode = ERR60;
4663   goto FAILED;
4664   }
4665 
4666 /* Manage callout for the final item */
4667 
4668 PARSED_END:
4669 parsed_pattern = manage_callouts(ptr, &previous_callout, auto_callout,
4670   parsed_pattern, cb);
4671 
4672 /* Insert trailing items for word and line matching (features provided for the
4673 benefit of pcre2grep). */
4674 
4675 if ((extra_options & PCRE2_EXTRA_MATCH_LINE) != 0)
4676   {
4677   *parsed_pattern++ = META_KET;
4678   *parsed_pattern++ = META_DOLLAR;
4679   }
4680 else if ((extra_options & PCRE2_EXTRA_MATCH_WORD) != 0)
4681   {
4682   *parsed_pattern++ = META_KET;
4683   *parsed_pattern++ = META_ESCAPE + ESC_b;
4684   }
4685 
4686 /* Terminate the parsed pattern, then return success if all groups are closed.
4687 Otherwise we have unclosed parentheses. */
4688 
4689 if (parsed_pattern >= parsed_pattern_end)
4690   {
4691   errorcode = ERR63;  /* Internal error (parsed pattern overflow) */
4692   goto FAILED;
4693   }
4694 
4695 *parsed_pattern = META_END;
4696 if (nest_depth == 0) return 0;
4697 
4698 UNCLOSED_PARENTHESIS:
4699 errorcode = ERR14;
4700 
4701 /* Come here for all failures. */
4702 
4703 FAILED:
4704 cb->erroroffset = (PCRE2_SIZE)(ptr - cb->start_pattern);
4705 return errorcode;
4706 
4707 /* Some errors need to indicate the previous character. */
4708 
4709 FAILED_BACK:
4710 ptr--;
4711 goto FAILED;
4712 
4713 /* This failure happens several times. */
4714 
4715 BAD_VERSION_CONDITION:
4716 errorcode = ERR79;
4717 goto FAILED;
4718 }
4719 
4720 
4721 
4722 /*************************************************
4723 *       Find first significant opcode            *
4724 *************************************************/
4725 
4726 /* This is called by several functions that scan a compiled expression looking
4727 for a fixed first character, or an anchoring opcode etc. It skips over things
4728 that do not influence this. For some calls, it makes sense to skip negative
4729 forward and all backward assertions, and also the \b assertion; for others it
4730 does not.
4731 
4732 Arguments:
4733   code         pointer to the start of the group
4734   skipassert   TRUE if certain assertions are to be skipped
4735 
4736 Returns:       pointer to the first significant opcode
4737 */
4738 
4739 static const PCRE2_UCHAR*
first_significant_code(PCRE2_SPTR code,BOOL skipassert)4740 first_significant_code(PCRE2_SPTR code, BOOL skipassert)
4741 {
4742 for (;;)
4743   {
4744   switch ((int)*code)
4745     {
4746     case OP_ASSERT_NOT:
4747     case OP_ASSERTBACK:
4748     case OP_ASSERTBACK_NOT:
4749     case OP_ASSERTBACK_NA:
4750     if (!skipassert) return code;
4751     do code += GET(code, 1); while (*code == OP_ALT);
4752     code += PRIV(OP_lengths)[*code];
4753     break;
4754 
4755     case OP_WORD_BOUNDARY:
4756     case OP_NOT_WORD_BOUNDARY:
4757     if (!skipassert) return code;
4758     /* Fall through */
4759 
4760     case OP_CALLOUT:
4761     case OP_CREF:
4762     case OP_DNCREF:
4763     case OP_RREF:
4764     case OP_DNRREF:
4765     case OP_FALSE:
4766     case OP_TRUE:
4767     code += PRIV(OP_lengths)[*code];
4768     break;
4769 
4770     case OP_CALLOUT_STR:
4771     code += GET(code, 1 + 2*LINK_SIZE);
4772     break;
4773 
4774     case OP_SKIPZERO:
4775     code += 2 + GET(code, 2) + LINK_SIZE;
4776     break;
4777 
4778     case OP_COND:
4779     case OP_SCOND:
4780     if (code[1+LINK_SIZE] != OP_FALSE ||   /* Not DEFINE */
4781         code[GET(code, 1)] != OP_KET)      /* More than one branch */
4782       return code;
4783     code += GET(code, 1) + 1 + LINK_SIZE;
4784     break;
4785 
4786     case OP_MARK:
4787     case OP_COMMIT_ARG:
4788     case OP_PRUNE_ARG:
4789     case OP_SKIP_ARG:
4790     case OP_THEN_ARG:
4791     code += code[1] + PRIV(OP_lengths)[*code];
4792     break;
4793 
4794     default:
4795     return code;
4796     }
4797   }
4798 /* Control never reaches here */
4799 }
4800 
4801 
4802 
#ifdef SUPPORT_UNICODE
/*************************************************
*           Get othercase range                  *
*************************************************/

/* Given the start and end of a class range in UCP mode, this function walks
upwards through the characters looking for the next run whose "other" case
values are consecutive. Each call hands back one such run and moves the start
value on, so the caller can call repeatedly until the range is exhausted. A
character that belongs to a multi-character caseless set is handed back by
itself, signalled by a positive return value.

Arguments:
  cptr        points to starting character value; updated
  d           end value
  ocptr       where to put start of othercase range
  odptr       where to put end of othercase range

Yield:        -1 when no more
               0 when a range is returned
              >0 the CASESET offset for char with multiple other cases
                in this case, ocptr contains the original
*/

static int
get_othercase_range(uint32_t *cptr, uint32_t d, uint32_t *ocptr,
  uint32_t *odptr)
{
uint32_t ch = *cptr;
uint32_t first_other = 0;
uint32_t expected;

/* Skip characters that are their own other case. Stop at the first one that
either belongs to a caseless set (returned immediately, on its own) or has a
single, different other case. */

for (; ch <= d; ch++)
  {
  unsigned int setoffset = UCD_CASESET(ch);

  if (setoffset != 0)
    {
    *ocptr = ch;         /* Character that has the set */
    *cptr = ch + 1;      /* Rest of input range */
    return (int)setoffset;
    }

  first_other = UCD_OTHERCASE(ch);
  if (first_other != ch) break;
  }

if (ch > d) return -1;   /* Ran off the end without finding anything */

/* ch has exactly one other case. Extend the run for as long as each following
character has no caseless set and its other case is the next value in
sequence; the run also ends when the input range does. */

*ocptr = first_other;
expected = first_other + 1;
ch++;

while (ch <= d && UCD_CASESET(ch) == 0 && UCD_OTHERCASE(ch) == expected)
  {
  expected++;
  ch++;
  }

*odptr = expected - 1;   /* End of othercase range */
*cptr = ch;              /* Rest of input range */
return 0;
}
#endif  /* SUPPORT_UNICODE */
4866 
4867 
4868 
4869 /*************************************************
4870 * Add a character or range to a class (internal) *
4871 *************************************************/
4872 
4873 /* This function packages up the logic of adding a character or range of
4874 characters to a class. The character values in the arguments will be within the
4875 valid values for the current mode (8-bit, 16-bit, UTF, etc). This function is
4876 called only from within the "add to class" group of functions, some of which
4877 are recursive and mutually recursive. The external entry point is
4878 add_to_class().
4879 
4880 Arguments:
4881   classbits     the bit map for characters < 256
4882   uchardptr     points to the pointer for extra data
4883   options       the options word
4884   cb            compile data
4885   start         start of range character
4886   end           end of range character
4887 
4888 Returns:        the number of < 256 characters added
4889                 the pointer to extra data is updated
4890 */
4891 
4892 static unsigned int
add_to_class_internal(uint8_t * classbits,PCRE2_UCHAR ** uchardptr,uint32_t options,compile_block * cb,uint32_t start,uint32_t end)4893 add_to_class_internal(uint8_t *classbits, PCRE2_UCHAR **uchardptr,
4894   uint32_t options, compile_block *cb, uint32_t start, uint32_t end)
4895 {
4896 uint32_t c;
4897 uint32_t classbits_end = (end <= 0xff ? end : 0xff);
4898 unsigned int n8 = 0;
4899 
4900 /* If caseless matching is required, scan the range and process alternate
4901 cases. In Unicode, there are 8-bit characters that have alternate cases that
4902 are greater than 255 and vice-versa. Sometimes we can just extend the original
4903 range. */
4904 
4905 if ((options & PCRE2_CASELESS) != 0)
4906   {
4907 #ifdef SUPPORT_UNICODE
4908   if ((options & (PCRE2_UTF|PCRE2_UCP)) != 0)
4909     {
4910     int rc;
4911     uint32_t oc, od;
4912 
4913     options &= ~PCRE2_CASELESS;   /* Remove for recursive calls */
4914     c = start;
4915 
4916     while ((rc = get_othercase_range(&c, end, &oc, &od)) >= 0)
4917       {
4918       /* Handle a single character that has more than one other case. */
4919 
4920       if (rc > 0) n8 += add_list_to_class_internal(classbits, uchardptr, options, cb,
4921         PRIV(ucd_caseless_sets) + rc, oc);
4922 
4923       /* Do nothing if the other case range is within the original range. */
4924 
4925       else if (oc >= cb->class_range_start && od <= cb->class_range_end) continue;
4926 
4927       /* Extend the original range if there is overlap, noting that if oc < c, we
4928       can't have od > end because a subrange is always shorter than the basic
4929       range. Otherwise, use a recursive call to add the additional range. */
4930 
4931       else if (oc < start && od >= start - 1) start = oc; /* Extend downwards */
4932       else if (od > end && oc <= end + 1)
4933         {
4934         end = od;       /* Extend upwards */
4935         if (end > classbits_end) classbits_end = (end <= 0xff ? end : 0xff);
4936         }
4937       else n8 += add_to_class_internal(classbits, uchardptr, options, cb, oc, od);
4938       }
4939     }
4940   else
4941 #endif  /* SUPPORT_UNICODE */
4942 
4943   /* Not UTF mode */
4944 
4945   for (c = start; c <= classbits_end; c++)
4946     {
4947     SETBIT(classbits, cb->fcc[c]);
4948     n8++;
4949     }
4950   }
4951 
4952 /* Now handle the originally supplied range. Adjust the final value according
4953 to the bit length - this means that the same lists of (e.g.) horizontal spaces
4954 can be used in all cases. */
4955 
4956 if ((options & PCRE2_UTF) == 0 && end > MAX_NON_UTF_CHAR)
4957   end = MAX_NON_UTF_CHAR;
4958 
4959 if (start > cb->class_range_start && end < cb->class_range_end) return n8;
4960 
4961 /* Use the bitmap for characters < 256. Otherwise use extra data.*/
4962 
4963 for (c = start; c <= classbits_end; c++)
4964   {
4965   /* Regardless of start, c will always be <= 255. */
4966   SETBIT(classbits, c);
4967   n8++;
4968   }
4969 
4970 #ifdef SUPPORT_WIDE_CHARS
4971 if (start <= 0xff) start = 0xff + 1;
4972 
4973 if (end >= start)
4974   {
4975   PCRE2_UCHAR *uchardata = *uchardptr;
4976 
4977 #ifdef SUPPORT_UNICODE
4978   if ((options & PCRE2_UTF) != 0)
4979     {
4980     if (start < end)
4981       {
4982       *uchardata++ = XCL_RANGE;
4983       uchardata += PRIV(ord2utf)(start, uchardata);
4984       uchardata += PRIV(ord2utf)(end, uchardata);
4985       }
4986     else if (start == end)
4987       {
4988       *uchardata++ = XCL_SINGLE;
4989       uchardata += PRIV(ord2utf)(start, uchardata);
4990       }
4991     }
4992   else
4993 #endif  /* SUPPORT_UNICODE */
4994 
4995   /* Without UTF support, character values are constrained by the bit length,
4996   and can only be > 256 for 16-bit and 32-bit libraries. */
4997 
4998 #if PCRE2_CODE_UNIT_WIDTH == 8
4999     {}
5000 #else
5001   if (start < end)
5002     {
5003     *uchardata++ = XCL_RANGE;
5004     *uchardata++ = start;
5005     *uchardata++ = end;
5006     }
5007   else if (start == end)
5008     {
5009     *uchardata++ = XCL_SINGLE;
5010     *uchardata++ = start;
5011     }
5012 #endif  /* PCRE2_CODE_UNIT_WIDTH == 8 */
5013   *uchardptr = uchardata;   /* Updata extra data pointer */
5014   }
5015 #else  /* SUPPORT_WIDE_CHARS */
5016   (void)uchardptr;          /* Avoid compiler warning */
5017 #endif /* SUPPORT_WIDE_CHARS */
5018 
5019 return n8;    /* Number of 8-bit characters */
5020 }
5021 
5022 
5023 
5024 #ifdef SUPPORT_UNICODE
5025 /*************************************************
5026 * Add a list of characters to a class (internal) *
5027 *************************************************/
5028 
5029 /* This function is used for adding a list of case-equivalent characters to a
5030 class when in UTF mode. This function is called only from within
5031 add_to_class_internal(), with which it is mutually recursive.
5032 
5033 Arguments:
5034   classbits     the bit map for characters < 256
5035   uchardptr     points to the pointer for extra data
5036   options       the options word
5037   cb            contains pointers to tables etc.
5038   p             points to row of 32-bit values, terminated by NOTACHAR
5039   except        character to omit; this is used when adding lists of
5040                   case-equivalent characters to avoid including the one we
5041                   already know about
5042 
5043 Returns:        the number of < 256 characters added
5044                 the pointer to extra data is updated
5045 */
5046 
5047 static unsigned int
add_list_to_class_internal(uint8_t * classbits,PCRE2_UCHAR ** uchardptr,uint32_t options,compile_block * cb,const uint32_t * p,unsigned int except)5048 add_list_to_class_internal(uint8_t *classbits, PCRE2_UCHAR **uchardptr,
5049   uint32_t options, compile_block *cb, const uint32_t *p, unsigned int except)
5050 {
5051 unsigned int n8 = 0;
5052 while (p[0] < NOTACHAR)
5053   {
5054   unsigned int n = 0;
5055   if (p[0] != except)
5056     {
5057     while(p[n+1] == p[0] + n + 1) n++;
5058     n8 += add_to_class_internal(classbits, uchardptr, options, cb, p[0], p[n]);
5059     }
5060   p += n + 1;
5061   }
5062 return n8;
5063 }
5064 #endif
5065 
5066 
5067 
5068 /*************************************************
5069 *   External entry point for add range to class  *
5070 *************************************************/
5071 
5072 /* This function sets the overall range so that the internal functions can try
5073 to avoid duplication when handling case-independence.
5074 
5075 Arguments:
5076   classbits     the bit map for characters < 256
5077   uchardptr     points to the pointer for extra data
5078   options       the options word
5079   cb            compile data
5080   start         start of range character
5081   end           end of range character
5082 
5083 Returns:        the number of < 256 characters added
5084                 the pointer to extra data is updated
5085 */
5086 
5087 static unsigned int
add_to_class(uint8_t * classbits,PCRE2_UCHAR ** uchardptr,uint32_t options,compile_block * cb,uint32_t start,uint32_t end)5088 add_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, uint32_t options,
5089   compile_block *cb, uint32_t start, uint32_t end)
5090 {
5091 cb->class_range_start = start;
5092 cb->class_range_end = end;
5093 return add_to_class_internal(classbits, uchardptr, options, cb, start, end);
5094 }
5095 
5096 
5097 /*************************************************
5098 *   External entry point for add list to class   *
5099 *************************************************/
5100 
5101 /* This function is used for adding a list of horizontal or vertical whitespace
5102 characters to a class. The list must be in order so that ranges of characters
5103 can be detected and handled appropriately. This function sets the overall range
5104 so that the internal functions can try to avoid duplication when handling
5105 case-independence.
5106 
5107 Arguments:
5108   classbits     the bit map for characters < 256
5109   uchardptr     points to the pointer for extra data
5110   options       the options word
5111   cb            contains pointers to tables etc.
5112   p             points to row of 32-bit values, terminated by NOTACHAR
5113   except        character to omit; this is used when adding lists of
5114                   case-equivalent characters to avoid including the one we
5115                   already know about
5116 
5117 Returns:        the number of < 256 characters added
5118                 the pointer to extra data is updated
5119 */
5120 
5121 static unsigned int
add_list_to_class(uint8_t * classbits,PCRE2_UCHAR ** uchardptr,uint32_t options,compile_block * cb,const uint32_t * p,unsigned int except)5122 add_list_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, uint32_t options,
5123   compile_block *cb, const uint32_t *p, unsigned int except)
5124 {
5125 unsigned int n8 = 0;
5126 while (p[0] < NOTACHAR)
5127   {
5128   unsigned int n = 0;
5129   if (p[0] != except)
5130     {
5131     while(p[n+1] == p[0] + n + 1) n++;
5132     cb->class_range_start = p[0];
5133     cb->class_range_end = p[n];
5134     n8 += add_to_class_internal(classbits, uchardptr, options, cb, p[0], p[n]);
5135     }
5136   p += n + 1;
5137   }
5138 return n8;
5139 }
5140 
5141 
5142 
5143 /*************************************************
5144 *    Add characters not in a list to a class     *
5145 *************************************************/
5146 
5147 /* This function is used for adding the complement of a list of horizontal or
5148 vertical whitespace to a class. The list must be in order.
5149 
5150 Arguments:
5151   classbits     the bit map for characters < 256
5152   uchardptr     points to the pointer for extra data
5153   options       the options word
5154   cb            contains pointers to tables etc.
5155   p             points to row of 32-bit values, terminated by NOTACHAR
5156 
5157 Returns:        the number of < 256 characters added
5158                 the pointer to extra data is updated
5159 */
5160 
5161 static unsigned int
add_not_list_to_class(uint8_t * classbits,PCRE2_UCHAR ** uchardptr,uint32_t options,compile_block * cb,const uint32_t * p)5162 add_not_list_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr,
5163   uint32_t options, compile_block *cb, const uint32_t *p)
5164 {
5165 BOOL utf = (options & PCRE2_UTF) != 0;
5166 unsigned int n8 = 0;
5167 if (p[0] > 0)
5168   n8 += add_to_class(classbits, uchardptr, options, cb, 0, p[0] - 1);
5169 while (p[0] < NOTACHAR)
5170   {
5171   while (p[1] == p[0] + 1) p++;
5172   n8 += add_to_class(classbits, uchardptr, options, cb, p[0] + 1,
5173     (p[1] == NOTACHAR) ? (utf ? 0x10ffffu : 0xffffffffu) : p[1] - 1);
5174   p++;
5175   }
5176 return n8;
5177 }
5178 
5179 
5180 
5181 /*************************************************
5182 *    Find details of duplicate group names       *
5183 *************************************************/
5184 
5185 /* This is called from compile_branch() when it needs to know the index and
5186 count of duplicates in the names table when processing named backreferences,
5187 either directly, or as conditions.
5188 
5189 Arguments:
5190   name          points to the name
5191   length        the length of the name
5192   indexptr      where to put the index
5193   countptr      where to put the count of duplicates
5194   errorcodeptr  where to put an error code
5195   cb            the compile block
5196 
5197 Returns:        TRUE if OK, FALSE if not, error code set
5198 */
5199 
5200 static BOOL
find_dupname_details(PCRE2_SPTR name,uint32_t length,int * indexptr,int * countptr,int * errorcodeptr,compile_block * cb)5201 find_dupname_details(PCRE2_SPTR name, uint32_t length, int *indexptr,
5202   int *countptr, int *errorcodeptr, compile_block *cb)
5203 {
5204 uint32_t i, groupnumber;
5205 int count;
5206 PCRE2_UCHAR *slot = cb->name_table;
5207 
5208 /* Find the first entry in the table */
5209 
5210 for (i = 0; i < cb->names_found; i++)
5211   {
5212   if (PRIV(strncmp)(name, slot+IMM2_SIZE, length) == 0 &&
5213       slot[IMM2_SIZE+length] == 0) break;
5214   slot += cb->name_entry_size;
5215   }
5216 
5217 /* This should not occur, because this function is called only when we know we
5218 have duplicate names. Give an internal error. */
5219 
5220 if (i >= cb->names_found)
5221   {
5222   *errorcodeptr = ERR53;
5223   cb->erroroffset = name - cb->start_pattern;
5224   return FALSE;
5225   }
5226 
5227 /* Record the index and then see how many duplicates there are, updating the
5228 backref map and maximum back reference as we do. */
5229 
5230 *indexptr = i;
5231 count = 0;
5232 
5233 for (;;)
5234   {
5235   count++;
5236   groupnumber = GET2(slot,0);
5237   cb->backref_map |= (groupnumber < 32)? (1u << groupnumber) : 1;
5238   if (groupnumber > cb->top_backref) cb->top_backref = groupnumber;
5239   if (++i >= cb->names_found) break;
5240   slot += cb->name_entry_size;
5241   if (PRIV(strncmp)(name, slot+IMM2_SIZE, length) != 0 ||
5242     (slot+IMM2_SIZE)[length] != 0) break;
5243   }
5244 
5245 *countptr = count;
5246 return TRUE;
5247 }
5248 
5249 
5250 
5251 /*************************************************
5252 *           Compile one branch                   *
5253 *************************************************/
5254 
5255 /* Scan the parsed pattern, compiling it into a vector of PCRE2_UCHAR. If
5256 the options are changed during the branch, the pointer is used to change the
5257 external options bits. This function is used during the pre-compile phase when
5258 we are trying to find out the amount of memory needed, as well as during the
5259 real compile phase. The value of lengthptr distinguishes the two phases.
5260 
5261 Arguments:
5262   optionsptr        pointer to the option bits
5263   codeptr           points to the pointer to the current code point
5264   pptrptr           points to the current parsed pattern pointer
5265   errorcodeptr      points to error code variable
5266   firstcuptr        place to put the first required code unit
5267   firstcuflagsptr   place to put the first code unit flags, or a negative number
5268   reqcuptr          place to put the last required code unit
5269   reqcuflagsptr     place to put the last required code unit flags, or a negative number
5270   bcptr             points to current branch chain
5271   cb                contains pointers to tables etc.
5272   lengthptr         NULL during the real compile phase
5273                     points to length accumulator during pre-compile phase
5274 
5275 Returns:            0 There's been an error, *errorcodeptr is non-zero
5276                    +1 Success, this branch must match at least one character
5277                    -1 Success, this branch may match an empty string
5278 */
5279 
5280 static int
compile_branch(uint32_t * optionsptr,PCRE2_UCHAR ** codeptr,uint32_t ** pptrptr,int * errorcodeptr,uint32_t * firstcuptr,int32_t * firstcuflagsptr,uint32_t * reqcuptr,int32_t * reqcuflagsptr,branch_chain * bcptr,compile_block * cb,PCRE2_SIZE * lengthptr)5281 compile_branch(uint32_t *optionsptr, PCRE2_UCHAR **codeptr, uint32_t **pptrptr,
5282   int *errorcodeptr, uint32_t *firstcuptr, int32_t *firstcuflagsptr,
5283   uint32_t *reqcuptr, int32_t *reqcuflagsptr, branch_chain *bcptr,
5284   compile_block *cb, PCRE2_SIZE *lengthptr)
5285 {
5286 int bravalue = 0;
5287 int okreturn = -1;
5288 int group_return = 0;
5289 uint32_t repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
5290 uint32_t greedy_default, greedy_non_default;
5291 uint32_t repeat_type, op_type;
5292 uint32_t options = *optionsptr;               /* May change dynamically */
5293 uint32_t firstcu, reqcu;
5294 uint32_t zeroreqcu, zerofirstcu;
5295 uint32_t escape;
5296 uint32_t *pptr = *pptrptr;
5297 uint32_t meta, meta_arg;
5298 int32_t firstcuflags, reqcuflags;
5299 int32_t zeroreqcuflags, zerofirstcuflags;
5300 int32_t req_caseopt, reqvary, tempreqvary;
5301 PCRE2_SIZE offset = 0;
5302 PCRE2_SIZE length_prevgroup = 0;
5303 PCRE2_UCHAR *code = *codeptr;
5304 PCRE2_UCHAR *last_code = code;
5305 PCRE2_UCHAR *orig_code = code;
5306 PCRE2_UCHAR *tempcode;
5307 PCRE2_UCHAR *previous = NULL;
5308 PCRE2_UCHAR op_previous;
5309 BOOL groupsetfirstcu = FALSE;
5310 BOOL had_accept = FALSE;
5311 BOOL matched_char = FALSE;
5312 BOOL previous_matched_char = FALSE;
5313 BOOL reset_caseful = FALSE;
5314 const uint8_t *cbits = cb->cbits;
5315 uint8_t classbits[32];
5316 
5317 /* We can fish out the UTF setting once and for all into a BOOL, but we must
5318 not do this for other options (e.g. PCRE2_EXTENDED) because they may change
5319 dynamically as we process the pattern. */
5320 
5321 #ifdef SUPPORT_UNICODE
5322 BOOL utf = (options & PCRE2_UTF) != 0;
5323 BOOL ucp = (options & PCRE2_UCP) != 0;
5324 #else  /* No Unicode support */
5325 BOOL utf = FALSE;
5326 #endif
5327 
5328 /* Helper variables for OP_XCLASS opcode (for characters > 255). We define
5329 class_uchardata always so that it can be passed to add_to_class() always,
5330 though it will not be used in non-UTF 8-bit cases. This avoids having to supply
5331 alternative calls for the different cases. */
5332 
5333 PCRE2_UCHAR *class_uchardata;
5334 #ifdef SUPPORT_WIDE_CHARS
5335 BOOL xclass;
5336 PCRE2_UCHAR *class_uchardata_base;
5337 #endif
5338 
5339 /* Set up the default and non-default settings for greediness */
5340 
5341 greedy_default = ((options & PCRE2_UNGREEDY) != 0);
5342 greedy_non_default = greedy_default ^ 1;
5343 
5344 /* Initialize no first unit, no required unit. REQ_UNSET means "no char
5345 matching encountered yet". It gets changed to REQ_NONE if we hit something that
5346 matches a non-fixed first unit; reqcu just remains unset if we never find one.
5347 
5348 When we hit a repeat whose minimum is zero, we may have to adjust these values
5349 to take the zero repeat into account. This is implemented by setting them to
5350 zerofirstcu and zeroreqcu when such a repeat is encountered. The individual
5351 item types that can be repeated set these backoff variables appropriately. */
5352 
5353 firstcu = reqcu = zerofirstcu = zeroreqcu = 0;
5354 firstcuflags = reqcuflags = zerofirstcuflags = zeroreqcuflags = REQ_UNSET;
5355 
5356 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
5357 according to the current setting of the caseless flag. The REQ_CASELESS value
5358 leaves the lower 28 bit empty. It is added into the firstcu or reqcu variables
5359 to record the case status of the value. This is used only for ASCII characters.
5360 */
5361 
5362 req_caseopt = ((options & PCRE2_CASELESS) != 0)? REQ_CASELESS:0;
5363 
5364 /* Switch on next META item until the end of the branch */
5365 
5366 for (;; pptr++)
5367   {
5368 #ifdef SUPPORT_WIDE_CHARS
5369   BOOL xclass_has_prop;
5370 #endif
5371   BOOL negate_class;
5372   BOOL should_flip_negation;
5373   BOOL match_all_or_no_wide_chars;
5374   BOOL possessive_quantifier;
5375   BOOL note_group_empty;
5376   int class_has_8bitchar;
5377   int i;
5378   uint32_t mclength;
5379   uint32_t skipunits;
5380   uint32_t subreqcu, subfirstcu;
5381   uint32_t groupnumber;
5382   uint32_t verbarglen, verbculen;
5383   int32_t subreqcuflags, subfirstcuflags;  /* Must be signed */
5384   open_capitem *oc;
5385   PCRE2_UCHAR mcbuffer[8];
5386 
5387   /* Get next META item in the pattern and its potential argument. */
5388 
5389   meta = META_CODE(*pptr);
5390   meta_arg = META_DATA(*pptr);
5391 
5392   /* If we are in the pre-compile phase, accumulate the length used for the
5393   previous cycle of this loop, unless the next item is a quantifier. */
5394 
5395   if (lengthptr != NULL)
5396     {
5397     if (code > cb->start_workspace + cb->workspace_size -
5398         WORK_SIZE_SAFETY_MARGIN)                       /* Check for overrun */
5399       {
5400       *errorcodeptr = (code >= cb->start_workspace + cb->workspace_size)?
5401         ERR52 : ERR86;
5402       return 0;
5403       }
5404 
5405     /* There is at least one situation where code goes backwards: this is the
5406     case of a zero quantifier after a class (e.g. [ab]{0}). When the quantifier
5407     is processed, the whole class is eliminated. However, it is created first,
5408     so we have to allow memory for it. Therefore, don't ever reduce the length
5409     at this point. */
5410 
5411     if (code < last_code) code = last_code;
5412 
5413     /* If the next thing is not a quantifier, we add the length of the previous
5414     item into the total, and reset the code pointer to the start of the
5415     workspace. Otherwise leave the previous item available to be quantified. */
5416 
5417     if (meta < META_ASTERISK || meta > META_MINMAX_QUERY)
5418       {
5419       if (OFLOW_MAX - *lengthptr < (PCRE2_SIZE)(code - orig_code))
5420         {
5421         *errorcodeptr = ERR20;   /* Integer overflow */
5422         return 0;
5423         }
5424       *lengthptr += (PCRE2_SIZE)(code - orig_code);
5425       if (*lengthptr > MAX_PATTERN_SIZE)
5426         {
5427         *errorcodeptr = ERR20;   /* Pattern is too large */
5428         return 0;
5429         }
5430       code = orig_code;
5431       }
5432 
5433     /* Remember where this code item starts so we can catch the "backwards"
5434     case above next time round. */
5435 
5436     last_code = code;
5437     }
5438 
5439   /* Process the next parsed pattern item. If it is not a quantifier, remember
5440   where it starts so that it can be quantified when a quantifier follows.
5441   Checking for the legality of quantifiers happens in parse_regex(), except for
5442   a quantifier after an assertion that is a condition. */
5443 
5444   if (meta < META_ASTERISK || meta > META_MINMAX_QUERY)
5445     {
5446     previous = code;
5447     if (matched_char && !had_accept) okreturn = 1;
5448     }
5449 
5450   previous_matched_char = matched_char;
5451   matched_char = FALSE;
5452   note_group_empty = FALSE;
5453   skipunits = 0;         /* Default value for most subgroups */
5454 
5455   switch(meta)
5456     {
5457     /* ===================================================================*/
5458     /* The branch terminates at pattern end or | or ) */
5459 
5460     case META_END:
5461     case META_ALT:
5462     case META_KET:
5463     *firstcuptr = firstcu;
5464     *firstcuflagsptr = firstcuflags;
5465     *reqcuptr = reqcu;
5466     *reqcuflagsptr = reqcuflags;
5467     *codeptr = code;
5468     *pptrptr = pptr;
5469     return okreturn;
5470 
5471 
5472     /* ===================================================================*/
5473     /* Handle single-character metacharacters. In multiline mode, ^ disables
5474     the setting of any following char as a first character. */
5475 
5476     case META_CIRCUMFLEX:
5477     if ((options & PCRE2_MULTILINE) != 0)
5478       {
5479       if (firstcuflags == REQ_UNSET)
5480         zerofirstcuflags = firstcuflags = REQ_NONE;
5481       *code++ = OP_CIRCM;
5482       }
5483     else *code++ = OP_CIRC;
5484     break;
5485 
5486     case META_DOLLAR:
5487     *code++ = ((options & PCRE2_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
5488     break;
5489 
5490     /* There can never be a first char if '.' is first, whatever happens about
5491     repeats. The value of reqcu doesn't change either. */
5492 
5493     case META_DOT:
5494     matched_char = TRUE;
5495     if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
5496     zerofirstcu = firstcu;
5497     zerofirstcuflags = firstcuflags;
5498     zeroreqcu = reqcu;
5499     zeroreqcuflags = reqcuflags;
5500     *code++ = ((options & PCRE2_DOTALL) != 0)? OP_ALLANY: OP_ANY;
5501     break;
5502 
5503 
5504     /* ===================================================================*/
5505     /* Empty character classes are allowed if PCRE2_ALLOW_EMPTY_CLASS is set.
5506     Otherwise, an initial ']' is taken as a data character. When empty classes
5507     are allowed, [] must always fail, so generate OP_FAIL, whereas [^] must
5508     match any character, so generate OP_ALLANY. */
5509 
5510     case META_CLASS_EMPTY:
5511     case META_CLASS_EMPTY_NOT:
5512     matched_char = TRUE;
5513     *code++ = (meta == META_CLASS_EMPTY_NOT)? OP_ALLANY : OP_FAIL;
5514     if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
5515     zerofirstcu = firstcu;
5516     zerofirstcuflags = firstcuflags;
5517     break;
5518 
5519 
5520     /* ===================================================================*/
5521     /* Non-empty character class. If the included characters are all < 256, we
5522     build a 32-byte bitmap of the permitted characters, except in the special
5523     case where there is only one such character. For negated classes, we build
5524     the map as usual, then invert it at the end. However, we use a different
5525     opcode so that data characters > 255 can be handled correctly.
5526 
5527     If the class contains characters outside the 0-255 range, a different
5528     opcode is compiled. It may optionally have a bit map for characters < 256,
5529     but those above are explicitly listed afterwards. A flag code unit
5530     tells whether the bitmap is present, and whether this is a negated class or
5531     not. */
5532 
5533     case META_CLASS_NOT:
5534     case META_CLASS:
5535     matched_char = TRUE;
5536     negate_class = meta == META_CLASS_NOT;
5537 
5538     /* We can optimize the case of a single character in a class by generating
5539     OP_CHAR or OP_CHARI if it's positive, or OP_NOT or OP_NOTI if it's
5540     negative. In the negative case there can be no first char if this item is
5541     first, whatever repeat count may follow. In the case of reqcu, save the
5542     previous value for reinstating. */
5543 
5544     /* NOTE: at present this optimization is not effective if the only
5545     character in a class in 32-bit, non-UCP mode has its top bit set. */
5546 
5547     if (pptr[1] < META_END && pptr[2] == META_CLASS_END)
5548       {
5549 #ifdef SUPPORT_UNICODE
5550       uint32_t d;
5551 #endif
5552       uint32_t c = pptr[1];
5553 
5554       pptr += 2;                 /* Move on to class end */
5555       if (meta == META_CLASS)    /* A positive one-char class can be */
5556         {                        /* handled as a normal literal character. */
5557         meta = c;                /* Set up the character */
5558         goto NORMAL_CHAR_SET;
5559         }
5560 
5561       /* Handle a negative one-character class */
5562 
5563       zeroreqcu = reqcu;
5564       zeroreqcuflags = reqcuflags;
5565       if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
5566       zerofirstcu = firstcu;
5567       zerofirstcuflags = firstcuflags;
5568 
5569       /* For caseless UTF or UCP mode, check whether this character has more
5570       than one other case. If so, generate a special OP_NOTPROP item instead of
5571       OP_NOTI. */
5572 
5573 #ifdef SUPPORT_UNICODE
5574       if ((utf||ucp) && (options & PCRE2_CASELESS) != 0 &&
5575           (d = UCD_CASESET(c)) != 0)
5576         {
5577         *code++ = OP_NOTPROP;
5578         *code++ = PT_CLIST;
5579         *code++ = d;
5580         break;   /* We are finished with this class */
5581         }
5582 #endif
5583       /* Char has only one other case, or UCP not available */
5584 
5585       *code++ = ((options & PCRE2_CASELESS) != 0)? OP_NOTI: OP_NOT;
5586       code += PUTCHAR(c, code);
5587       break;   /* We are finished with this class */
5588       }        /* End of 1-char optimization */
5589 
5590     /* Handle character classes that contain more than just one literal
5591     character. If there are exactly two characters in a positive class, see if
5592     they are case partners. This can be optimized to generate a caseless single
5593     character match (which also sets first/required code units if relevant). */
5594 
5595     if (meta == META_CLASS && pptr[1] < META_END && pptr[2] < META_END &&
5596         pptr[3] == META_CLASS_END)
5597       {
5598       uint32_t c = pptr[1];
5599 
5600 #ifdef SUPPORT_UNICODE
5601       if (UCD_CASESET(c) == 0)
5602 #endif
5603         {
5604         uint32_t d;
5605 
5606 #ifdef SUPPORT_UNICODE
5607         if ((utf || ucp) && c > 127) d = UCD_OTHERCASE(c); else
5608 #endif
5609           {
5610 #if PCRE2_CODE_UNIT_WIDTH != 8
5611           if (c > 255) d = c; else
5612 #endif
5613           d = TABLE_GET(c, cb->fcc, c);
5614           }
5615 
5616         if (c != d && pptr[2] == d)
5617           {
5618           pptr += 3;                 /* Move on to class end */
5619           meta = c;
5620           if ((options & PCRE2_CASELESS) == 0)
5621             {
5622             reset_caseful = TRUE;
5623             options |= PCRE2_CASELESS;
5624             req_caseopt = REQ_CASELESS;
5625             }
5626           goto CLASS_CASELESS_CHAR;
5627           }
5628         }
5629       }
5630 
5631     /* If a non-extended class contains a negative special such as \S, we need
5632     to flip the negation flag at the end, so that support for characters > 255
5633     works correctly (they are all included in the class). An extended class may
5634     need to insert specific matching or non-matching code for wide characters.
5635     */
5636 
5637     should_flip_negation = match_all_or_no_wide_chars = FALSE;
5638 
5639     /* Extended class (xclass) will be used when characters > 255
5640     might match. */
5641 
5642 #ifdef SUPPORT_WIDE_CHARS
5643     xclass = FALSE;
5644     class_uchardata = code + LINK_SIZE + 2;   /* For XCLASS items */
5645     class_uchardata_base = class_uchardata;   /* Save the start */
5646 #endif
5647 
5648     /* For optimization purposes, we track some properties of the class:
5649     class_has_8bitchar will be non-zero if the class contains at least one
5650     character with a code point less than 256; xclass_has_prop will be TRUE if
5651     Unicode property checks are present in the class. */
5652 
5653     class_has_8bitchar = 0;
5654 #ifdef SUPPORT_WIDE_CHARS
5655     xclass_has_prop = FALSE;
5656 #endif
5657 
5658     /* Initialize the 256-bit (32-byte) bit map to all zeros. We build the map
5659     in a temporary bit of memory, in case the class contains fewer than two
5660     8-bit characters because in that case the compiled code doesn't use the bit
5661     map. */
5662 
5663     memset(classbits, 0, 32 * sizeof(uint8_t));
5664 
5665     /* Process items until META_CLASS_END is reached. */
5666 
5667     while ((meta = *(++pptr)) != META_CLASS_END)
5668       {
5669       /* Handle POSIX classes such as [:alpha:] etc. */
5670 
5671       if (meta == META_POSIX || meta == META_POSIX_NEG)
5672         {
5673         BOOL local_negate = (meta == META_POSIX_NEG);
5674         int posix_class = *(++pptr);
5675         int taboffset, tabopt;
5676         uint8_t pbits[32];
5677 
5678         should_flip_negation = local_negate;  /* Note negative special */
5679 
5680         /* If matching is caseless, upper and lower are converted to alpha.
5681         This relies on the fact that the class table starts with alpha,
5682         lower, upper as the first 3 entries. */
5683 
5684         if ((options & PCRE2_CASELESS) != 0 && posix_class <= 2)
5685           posix_class = 0;
5686 
5687         /* When PCRE2_UCP is set, some of the POSIX classes are converted to
5688         different escape sequences that use Unicode properties \p or \P.
5689         Others that are not available via \p or \P have to generate
5690         XCL_PROP/XCL_NOTPROP directly, which is done here. */
5691 
5692 #ifdef SUPPORT_UNICODE
5693         if ((options & PCRE2_UCP) != 0) switch(posix_class)
5694           {
5695           case PC_GRAPH:
5696           case PC_PRINT:
5697           case PC_PUNCT:
5698           *class_uchardata++ = local_negate? XCL_NOTPROP : XCL_PROP;
5699           *class_uchardata++ = (PCRE2_UCHAR)
5700             ((posix_class == PC_GRAPH)? PT_PXGRAPH :
5701              (posix_class == PC_PRINT)? PT_PXPRINT : PT_PXPUNCT);
5702           *class_uchardata++ = 0;
5703           xclass_has_prop = TRUE;
5704           goto CONTINUE_CLASS;
5705 
5706           /* For the other POSIX classes (ascii, xdigit) we are going to
5707           fall through to the non-UCP case and build a bit map for
5708           characters with code points less than 256. However, if we are in
5709           a negated POSIX class, characters with code points greater than
5710           255 must either all match or all not match, depending on whether
5711           the whole class is not or is negated. For example, for
5712           [[:^ascii:]... they must all match, whereas for [^[:^xdigit:]...
5713           they must not.
5714 
5715           In the special case where there are no xclass items, this is
5716           automatically handled by the use of OP_CLASS or OP_NCLASS, but an
5717           explicit range is needed for OP_XCLASS. Setting a flag here
5718           causes the range to be generated later when it is known that
5719           OP_XCLASS is required. In the 8-bit library this is relevant only in
5720           utf mode, since no wide characters can exist otherwise. */
5721 
5722           default:
5723 #if PCRE2_CODE_UNIT_WIDTH == 8
5724           if (utf)
5725 #endif
5726           match_all_or_no_wide_chars |= local_negate;
5727           break;
5728           }
5729 #endif  /* SUPPORT_UNICODE */
5730 
5731         /* In the non-UCP case, or when UCP makes no difference, we build the
5732         bit map for the POSIX class in a chunk of local store because we may
5733         be adding and subtracting from it, and we don't want to subtract bits
5734         that may be in the main map already. At the end we or the result into
5735         the bit map that is being built. */
5736 
5737         posix_class *= 3;
5738 
5739         /* Copy in the first table (always present) */
5740 
5741         memcpy(pbits, cbits + posix_class_maps[posix_class],
5742           32 * sizeof(uint8_t));
5743 
5744         /* If there is a second table, add or remove it as required. */
5745 
5746         taboffset = posix_class_maps[posix_class + 1];
5747         tabopt = posix_class_maps[posix_class + 2];
5748 
5749         if (taboffset >= 0)
5750           {
5751           if (tabopt >= 0)
5752             for (i = 0; i < 32; i++) pbits[i] |= cbits[(int)i + taboffset];
5753           else
5754             for (i = 0; i < 32; i++) pbits[i] &= ~cbits[(int)i + taboffset];
5755           }
5756 
5757         /* Now see if we need to remove any special characters. An option
5758         value of 1 removes vertical space and 2 removes underscore. */
5759 
5760         if (tabopt < 0) tabopt = -tabopt;
5761         if (tabopt == 1) pbits[1] &= ~0x3c;
5762           else if (tabopt == 2) pbits[11] &= 0x7f;
5763 
5764         /* Add the POSIX table or its complement into the main table that is
5765         being built and we are done. */
5766 
5767         if (local_negate)
5768           for (i = 0; i < 32; i++) classbits[i] |= ~pbits[i];
5769         else
5770           for (i = 0; i < 32; i++) classbits[i] |= pbits[i];
5771 
5772         /* Every class contains at least one < 256 character. */
5773 
5774         class_has_8bitchar = 1;
5775         goto CONTINUE_CLASS;    /* End of POSIX handling */
5776         }
5777 
5778       /* Other than POSIX classes, the only items we should encounter are
5779       \d-type escapes and literal characters (possibly as ranges). */
5780 
5781       if (meta == META_BIGVALUE)
5782         {
5783         meta = *(++pptr);
5784         goto CLASS_LITERAL;
5785         }
5786 
5787       /* Any other non-literal must be an escape */
5788 
5789       if (meta >= META_END)
5790         {
5791         if (META_CODE(meta) != META_ESCAPE)
5792           {
5793 #ifdef DEBUG_SHOW_PARSED
5794           fprintf(stderr, "** Unrecognized parsed pattern item 0x%.8x "
5795                           "in character class\n", meta);
5796 #endif
5797           *errorcodeptr = ERR89;  /* Internal error - unrecognized. */
5798           return 0;
5799           }
5800         escape = META_DATA(meta);
5801 
5802         /* Every class contains at least one < 256 character. */
5803 
5804         class_has_8bitchar++;
5805 
5806         switch(escape)
5807           {
5808           case ESC_d:
5809           for (i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_digit];
5810           break;
5811 
5812           case ESC_D:
5813           should_flip_negation = TRUE;
5814           for (i = 0; i < 32; i++) classbits[i] |= ~cbits[i+cbit_digit];
5815           break;
5816 
5817           case ESC_w:
5818           for (i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_word];
5819           break;
5820 
5821           case ESC_W:
5822           should_flip_negation = TRUE;
5823           for (i = 0; i < 32; i++) classbits[i] |= ~cbits[i+cbit_word];
5824           break;
5825 
5826           /* Perl 5.004 onwards omitted VT from \s, but restored it at Perl
5827           5.18. Before PCRE 8.34, we had to preserve the VT bit if it was
5828           previously set by something earlier in the character class.
5829           Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so
5830           we could just adjust the appropriate bit. From PCRE 8.34 we no
5831           longer treat \s and \S specially. */
5832 
5833           case ESC_s:
5834           for (i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_space];
5835           break;
5836 
5837           case ESC_S:
5838           should_flip_negation = TRUE;
5839           for (i = 0; i < 32; i++) classbits[i] |= ~cbits[i+cbit_space];
5840           break;
5841 
5842           /* When adding the horizontal or vertical space lists to a class, or
5843           their complements, disable PCRE2_CASELESS, because it just wastes
5844           time, and in the "not-x" UTF cases can create unwanted duplicates in
5845           the XCLASS list (provoked by characters that have more than one other
5846           case and by both cases being in the same "not-x" sublist). */
5847 
5848           case ESC_h:
5849           (void)add_list_to_class(classbits, &class_uchardata,
5850             options & ~PCRE2_CASELESS, cb, PRIV(hspace_list), NOTACHAR);
5851           break;
5852 
5853           case ESC_H:
5854           (void)add_not_list_to_class(classbits, &class_uchardata,
5855             options & ~PCRE2_CASELESS, cb, PRIV(hspace_list));
5856           break;
5857 
5858           case ESC_v:
5859           (void)add_list_to_class(classbits, &class_uchardata,
5860             options & ~PCRE2_CASELESS, cb, PRIV(vspace_list), NOTACHAR);
5861           break;
5862 
5863           case ESC_V:
5864           (void)add_not_list_to_class(classbits, &class_uchardata,
5865             options & ~PCRE2_CASELESS, cb, PRIV(vspace_list));
5866           break;
5867 
5868           /* If Unicode is not supported, \P and \p are not allowed and are
5869           faulted at parse time, so will never appear here. */
5870 
5871 #ifdef SUPPORT_UNICODE
5872           case ESC_p:
5873           case ESC_P:
5874             {
5875             uint32_t ptype = *(++pptr) >> 16;
5876             uint32_t pdata = *pptr & 0xffff;
5877             *class_uchardata++ = (escape == ESC_p)? XCL_PROP : XCL_NOTPROP;
5878             *class_uchardata++ = ptype;
5879             *class_uchardata++ = pdata;
5880             xclass_has_prop = TRUE;
5881             class_has_8bitchar--;                /* Undo! */
5882             }
5883           break;
5884 #endif
5885           }
5886 
5887         goto CONTINUE_CLASS;
5888         }  /* End handling \d-type escapes */
5889 
5890       /* A literal character may be followed by a range meta. At parse time
5891       there are checks for out-of-order characters, for ranges where the two
5892       characters are equal, and for hyphens that cannot indicate a range. At
5893       this point, therefore, no checking is needed. */
5894 
5895       else
5896         {
5897         uint32_t c, d;
5898 
5899         CLASS_LITERAL:
5900         c = d = meta;
5901 
5902         /* Remember if \r or \n were explicitly used */
5903 
5904         if (c == CHAR_CR || c == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF;
5905 
5906         /* Process a character range */
5907 
5908         if (pptr[1] == META_RANGE_LITERAL || pptr[1] == META_RANGE_ESCAPED)
5909           {
5910 #ifdef EBCDIC
5911           BOOL range_is_literal = (pptr[1] == META_RANGE_LITERAL);
5912 #endif
5913           pptr += 2;
5914           d = *pptr;
5915           if (d == META_BIGVALUE) d = *(++pptr);
5916 
5917           /* Remember an explicit \r or \n, and add the range to the class. */
5918 
5919           if (d == CHAR_CR || d == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF;
5920 
5921           /* In an EBCDIC environment, Perl treats alphabetic ranges specially
5922           because there are holes in the encoding, and simply using the range
5923           A-Z (for example) would include the characters in the holes. This
5924           applies only to literal ranges; [\xC1-\xE9] is different to [A-Z]. */
5925 
5926 #ifdef EBCDIC
5927           if (range_is_literal &&
5928                (cb->ctypes[c] & ctype_letter) != 0 &&
5929                (cb->ctypes[d] & ctype_letter) != 0 &&
5930                (c <= CHAR_z) == (d <= CHAR_z))
5931             {
5932             uint32_t uc = (d <= CHAR_z)? 0 : 64;
5933             uint32_t C = c - uc;
5934             uint32_t D = d - uc;
5935 
5936             if (C <= CHAR_i)
5937               {
5938               class_has_8bitchar +=
5939                 add_to_class(classbits, &class_uchardata, options, cb, C + uc,
5940                   ((D < CHAR_i)? D : CHAR_i) + uc);
5941               C = CHAR_j;
5942               }
5943 
5944             if (C <= D && C <= CHAR_r)
5945               {
5946               class_has_8bitchar +=
5947                 add_to_class(classbits, &class_uchardata, options, cb, C + uc,
5948                   ((D < CHAR_r)? D : CHAR_r) + uc);
5949               C = CHAR_s;
5950               }
5951 
5952             if (C <= D)
5953               {
5954               class_has_8bitchar +=
5955                 add_to_class(classbits, &class_uchardata, options, cb, C + uc,
5956                   D + uc);
5957               }
5958             }
5959           else
5960 #endif
5961           /* Not an EBCDIC special range */
5962 
5963           class_has_8bitchar +=
5964             add_to_class(classbits, &class_uchardata, options, cb, c, d);
5965           goto CONTINUE_CLASS;   /* Go get the next char in the class */
5966           }  /* End of range handling */
5967 
5968 
5969         /* Handle a single character. */
5970 
5971         class_has_8bitchar +=
5972           add_to_class(classbits, &class_uchardata, options, cb, meta, meta);
5973         }
5974 
5975       /* Continue to the next item in the class. */
5976 
5977       CONTINUE_CLASS:
5978 
5979 #ifdef SUPPORT_WIDE_CHARS
5980       /* If any wide characters or Unicode properties have been encountered,
5981       set xclass = TRUE. Then, in the pre-compile phase, accumulate the length
5982       of the extra data and reset the pointer. This is so that very large
5983       classes that contain a zillion wide characters or Unicode property tests
5984       do not overwrite the workspace (which is on the stack). */
5985 
5986       if (class_uchardata > class_uchardata_base)
5987         {
5988         xclass = TRUE;
5989         if (lengthptr != NULL)
5990           {
5991           *lengthptr += class_uchardata - class_uchardata_base;
5992           class_uchardata = class_uchardata_base;
5993           }
5994         }
5995 #endif
5996 
5997       continue;  /* Needed to avoid error when not supporting wide chars */
5998       }   /* End of main class-processing loop */
5999 
6000     /* If this class is the first thing in the branch, there can be no first
6001     char setting, whatever the repeat count. Any reqcu setting must remain
6002     unchanged after any kind of repeat. */
6003 
6004     if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6005     zerofirstcu = firstcu;
6006     zerofirstcuflags = firstcuflags;
6007     zeroreqcu = reqcu;
6008     zeroreqcuflags = reqcuflags;
6009 
6010     /* If there are characters with values > 255, or Unicode property settings
6011     (\p or \P), we have to compile an extended class, with its own opcode,
6012     unless there were no property settings and there was a negated special such
6013     as \S in the class, and PCRE2_UCP is not set, because in that case all
6014     characters > 255 are in or not in the class, so any that were explicitly
6015     given as well can be ignored.
6016 
6017     In the UCP case, if certain negated POSIX classes ([:^ascii:] or
6018     [:^xdigit:]) were present in a class, we either have to match or not match
6019     all wide characters (depending on whether the whole class is or is not
6020     negated). This requirement is indicated by match_all_or_no_wide_chars being
6021     true. We do this by including an explicit range, which works in both cases.
6022     This applies only in UTF and 16-bit and 32-bit non-UTF modes, since there
6023     cannot be any wide characters in 8-bit non-UTF mode.
6024 
6025     When there *are* properties in a positive UTF-8 or any 16-bit or 32-bit
6026     class where \S etc is present without PCRE2_UCP, causing an extended class
6027     to be compiled, we make sure that all characters > 255 are included by
6028     forcing match_all_or_no_wide_chars to be true.
6029 
6030     If, when generating an xclass, there are no characters < 256, we can omit
6031     the bitmap in the actual compiled code. */
6032 
6033 #ifdef SUPPORT_WIDE_CHARS  /* Defined for 16/32 bits, or 8-bit with Unicode */
6034     if (xclass && (
6035 #ifdef SUPPORT_UNICODE
6036         (options & PCRE2_UCP) != 0 ||
6037 #endif
6038         xclass_has_prop || !should_flip_negation))
6039       {
6040       if (match_all_or_no_wide_chars || (
6041 #if PCRE2_CODE_UNIT_WIDTH == 8
6042            utf &&
6043 #endif
6044            should_flip_negation && !negate_class && (options & PCRE2_UCP) == 0))
6045         {
6046         *class_uchardata++ = XCL_RANGE;
6047         if (utf)   /* Will always be utf in the 8-bit library */
6048           {
6049           class_uchardata += PRIV(ord2utf)(0x100, class_uchardata);
6050           class_uchardata += PRIV(ord2utf)(MAX_UTF_CODE_POINT, class_uchardata);
6051           }
6052         else       /* Can only happen for the 16-bit & 32-bit libraries */
6053           {
6054 #if PCRE2_CODE_UNIT_WIDTH == 16
6055           *class_uchardata++ = 0x100;
6056           *class_uchardata++ = 0xffffu;
6057 #elif PCRE2_CODE_UNIT_WIDTH == 32
6058           *class_uchardata++ = 0x100;
6059           *class_uchardata++ = 0xffffffffu;
6060 #endif
6061           }
6062         }
6063       *class_uchardata++ = XCL_END;    /* Marks the end of extra data */
6064       *code++ = OP_XCLASS;
6065       code += LINK_SIZE;
6066       *code = negate_class? XCL_NOT:0;
6067       if (xclass_has_prop) *code |= XCL_HASPROP;
6068 
6069       /* If the map is required, move up the extra data to make room for it;
6070       otherwise just move the code pointer to the end of the extra data. */
6071 
6072       if (class_has_8bitchar > 0)
6073         {
6074         *code++ |= XCL_MAP;
6075         (void)memmove(code + (32 / sizeof(PCRE2_UCHAR)), code,
6076           CU2BYTES(class_uchardata - code));
6077         if (negate_class && !xclass_has_prop)
6078           {
6079           /* Using 255 ^ instead of ~ avoids clang sanitize warning. */
6080           for (i = 0; i < 32; i++) classbits[i] = 255 ^ classbits[i];
6081           }
6082         memcpy(code, classbits, 32);
6083         code = class_uchardata + (32 / sizeof(PCRE2_UCHAR));
6084         }
6085       else code = class_uchardata;
6086 
6087       /* Now fill in the complete length of the item */
6088 
6089       PUT(previous, 1, (int)(code - previous));
6090       break;   /* End of class handling */
6091       }
6092 #endif  /* SUPPORT_WIDE_CHARS */
6093 
6094     /* If there are no characters > 255, or they are all to be included or
6095     excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the
6096     whole class was negated and whether there were negative specials such as \S
6097     (non-UCP) in the class. Then copy the 32-byte map into the code vector,
6098     negating it if necessary. */
6099 
6100     *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
6101     if (lengthptr == NULL)    /* Save time in the pre-compile phase */
6102       {
6103       if (negate_class)
6104         {
6105        /* Using 255 ^ instead of ~ avoids clang sanitize warning. */
6106        for (i = 0; i < 32; i++) classbits[i] = 255 ^ classbits[i];
6107        }
6108       memcpy(code, classbits, 32);
6109       }
6110     code += 32 / sizeof(PCRE2_UCHAR);
6111     break;  /* End of class processing */
6112 
6113 
6114     /* ===================================================================*/
6115     /* Deal with (*VERB)s. */
6116 
6117     /* Check for open captures before ACCEPT and close those that are within
6118     the same assertion level, also converting ACCEPT to ASSERT_ACCEPT in an
6119     assertion. In the first pass, just accumulate the length required;
6120     otherwise hitting (*ACCEPT) inside many nested parentheses can cause
6121     workspace overflow. Do not set firstcu after *ACCEPT. */
6122 
6123     case META_ACCEPT:
6124     cb->had_accept = had_accept = TRUE;
6125     for (oc = cb->open_caps;
6126          oc != NULL && oc->assert_depth >= cb->assert_depth;
6127          oc = oc->next)
6128       {
6129       if (lengthptr != NULL)
6130         {
6131         *lengthptr += CU2BYTES(1) + IMM2_SIZE;
6132         }
6133       else
6134         {
6135         *code++ = OP_CLOSE;
6136         PUT2INC(code, 0, oc->number);
6137         }
6138       }
6139     *code++ = (cb->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
6140     if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6141     break;
6142 
6143     case META_PRUNE:
6144     case META_SKIP:
6145     cb->had_pruneorskip = TRUE;
6146     /* Fall through */
6147     case META_COMMIT:
6148     case META_FAIL:
6149     *code++ = verbops[(meta - META_MARK) >> 16];
6150     break;
6151 
6152     case META_THEN:
6153     cb->external_flags |= PCRE2_HASTHEN;
6154     *code++ = OP_THEN;
6155     break;
6156 
6157     /* Handle verbs with arguments. Arguments can be very long, especially in
6158     16- and 32-bit modes, and can overflow the workspace in the first pass.
6159     However, the argument length is constrained to be small enough to fit in
6160     one code unit. This check happens in parse_regex(). In the first pass,
6161     instead of putting the argument into memory, we just update the length
6162     counter and set up an empty argument. */
6163 
6164     case META_THEN_ARG:
6165     cb->external_flags |= PCRE2_HASTHEN;
6166     goto VERB_ARG;
6167 
6168     case META_PRUNE_ARG:
6169     case META_SKIP_ARG:
6170     cb->had_pruneorskip = TRUE;
6171     /* Fall through */
6172     case META_MARK:
6173     case META_COMMIT_ARG:
6174     VERB_ARG:
6175     *code++ = verbops[(meta - META_MARK) >> 16];
6176     /* The length is in characters. */
6177     verbarglen = *(++pptr);
6178     verbculen = 0;
6179     tempcode = code++;
6180     for (i = 0; i < (int)verbarglen; i++)
6181       {
6182       meta = *(++pptr);
6183 #ifdef SUPPORT_UNICODE
6184       if (utf) mclength = PRIV(ord2utf)(meta, mcbuffer); else
6185 #endif
6186         {
6187         mclength = 1;
6188         mcbuffer[0] = meta;
6189         }
6190       if (lengthptr != NULL) *lengthptr += mclength; else
6191         {
6192         memcpy(code, mcbuffer, CU2BYTES(mclength));
6193         code += mclength;
6194         verbculen += mclength;
6195         }
6196       }
6197 
6198     *tempcode = verbculen;   /* Fill in the code unit length */
6199     *code++ = 0;             /* Terminating zero */
6200     break;
6201 
6202 
6203     /* ===================================================================*/
6204     /* Handle options change. The new setting must be passed back for use in
6205     subsequent branches. Reset the greedy defaults and the case value for
6206     firstcu and reqcu. */
6207 
6208     case META_OPTIONS:
6209     *optionsptr = options = *(++pptr);
6210     greedy_default = ((options & PCRE2_UNGREEDY) != 0);
6211     greedy_non_default = greedy_default ^ 1;
6212     req_caseopt = ((options & PCRE2_CASELESS) != 0)? REQ_CASELESS : 0;
6213     break;
6214 
6215 
6216     /* ===================================================================*/
6217     /* Handle conditional subpatterns. The case of (?(Rdigits) is ambiguous
6218     because it could be a numerical check on recursion, or a name check on a
6219     group's being set. The pre-pass sets up META_COND_RNUMBER as a name so that
6220     we can handle it either way. We first try for a name; if not found, process
6221     the number. */
6222 
6223     case META_COND_RNUMBER:   /* (?(Rdigits) */
6224     case META_COND_NAME:      /* (?(name) or (?'name') or ?(<name>) */
6225     case META_COND_RNAME:     /* (?(R&name) - test for recursion */
6226     bravalue = OP_COND;
6227       {
6228       int count, index;
6229       PCRE2_SPTR name;
6230       named_group *ng = cb->named_groups;
6231       uint32_t length = *(++pptr);
6232 
6233       GETPLUSOFFSET(offset, pptr);
6234       name = cb->start_pattern + offset;
6235 
6236       /* In the first pass, the names generated in the pre-pass are available,
6237       but the main name table has not yet been created. Scan the list of names
6238       generated in the pre-pass in order to get a number and whether or not
6239       this name is duplicated. If it is not duplicated, we can handle it as a
6240       numerical group. */
6241 
6242       for (i = 0; i < cb->names_found; i++, ng++)
6243         {
6244         if (length == ng->length &&
6245             PRIV(strncmp)(name, ng->name, length) == 0)
6246           {
6247           if (!ng->isdup)
6248             {
6249             code[1+LINK_SIZE] = (meta == META_COND_RNAME)? OP_RREF : OP_CREF;
6250             PUT2(code, 2+LINK_SIZE, ng->number);
6251             if (ng->number > cb->top_backref) cb->top_backref = ng->number;
6252             skipunits = 1+IMM2_SIZE;
6253             goto GROUP_PROCESS_NOTE_EMPTY;
6254             }
6255           break;  /* Found a duplicated name */
6256           }
6257         }
6258 
6259       /* If the name was not found we have a bad reference, unless we are
6260       dealing with R<digits>, which is treated as a recursion test by number.
6261       */
6262 
6263       if (i >= cb->names_found)
6264         {
6265         groupnumber = 0;
6266         if (meta == META_COND_RNUMBER)
6267           {
6268           for (i = 1; i < (int)length; i++)
6269             {
6270             groupnumber = groupnumber * 10 + name[i] - CHAR_0;
6271             if (groupnumber > MAX_GROUP_NUMBER)
6272               {
6273               *errorcodeptr = ERR61;
6274               cb->erroroffset = offset + i;
6275               return 0;
6276               }
6277             }
6278           }
6279 
6280         if (meta != META_COND_RNUMBER || groupnumber > cb->bracount)
6281           {
6282           *errorcodeptr = ERR15;
6283           cb->erroroffset = offset;
6284           return 0;
6285           }
6286 
6287         /* (?(Rdigits) is treated as a recursion test by number. A value of
6288         zero (which is the result of both (?(R) and (?(R0)) means "any", and is
6289         translated into RREF_ANY (which is 0xffff). */
6290 
6291         if (groupnumber == 0) groupnumber = RREF_ANY;
6292         code[1+LINK_SIZE] = OP_RREF;
6293         PUT2(code, 2+LINK_SIZE, groupnumber);
6294         skipunits = 1+IMM2_SIZE;
6295         goto GROUP_PROCESS_NOTE_EMPTY;
6296         }
6297 
6298       /* A duplicated name was found. Note that if an R<digits> name is found
6299       (META_COND_RNUMBER), it is a reference test, not a recursion test. */
6300 
6301       code[1+LINK_SIZE] = (meta == META_COND_RNAME)? OP_RREF : OP_CREF;
6302 
6303       /* We have a duplicated name. In the compile pass we have to search the
6304       main table in order to get the index and count values. */
6305 
6306       count = 0;  /* Values for first pass (avoids compiler warning) */
6307       index = 0;
6308       if (lengthptr == NULL && !find_dupname_details(name, length, &index,
6309             &count, errorcodeptr, cb)) return 0;
6310 
6311       /* Add one to the opcode to change CREF/RREF into DNCREF/DNRREF and
6312       insert appropriate data values. */
6313 
6314       code[1+LINK_SIZE]++;
6315       skipunits = 1+2*IMM2_SIZE;
6316       PUT2(code, 2+LINK_SIZE, index);
6317       PUT2(code, 2+LINK_SIZE+IMM2_SIZE, count);
6318       }
6319     goto GROUP_PROCESS_NOTE_EMPTY;
6320 
6321     /* The DEFINE condition is always false. Its internal groups may never
6322     be called, so matched_char must remain false, hence the jump to
6323     GROUP_PROCESS rather than GROUP_PROCESS_NOTE_EMPTY. */
6324 
6325     case META_COND_DEFINE:
6326     bravalue = OP_COND;
6327     GETPLUSOFFSET(offset, pptr);
6328     code[1+LINK_SIZE] = OP_DEFINE;
6329     skipunits = 1;
6330     goto GROUP_PROCESS;
6331 
6332     /* Conditional test of a group's being set. */
6333 
6334     case META_COND_NUMBER:
6335     bravalue = OP_COND;
6336     GETPLUSOFFSET(offset, pptr);
6337     groupnumber = *(++pptr);
6338     if (groupnumber > cb->bracount)
6339       {
6340       *errorcodeptr = ERR15;
6341       cb->erroroffset = offset;
6342       return 0;
6343       }
6344     if (groupnumber > cb->top_backref) cb->top_backref = groupnumber;
6345     offset -= 2;   /* Point at initial ( for too many branches error */
6346     code[1+LINK_SIZE] = OP_CREF;
6347     skipunits = 1+IMM2_SIZE;
6348     PUT2(code, 2+LINK_SIZE, groupnumber);
6349     goto GROUP_PROCESS_NOTE_EMPTY;
6350 
6351     /* Test for the PCRE2 version. */
6352 
6353     case META_COND_VERSION:
6354     bravalue = OP_COND;
6355     if (pptr[1] > 0)
6356       code[1+LINK_SIZE] = ((PCRE2_MAJOR > pptr[2]) ||
6357         (PCRE2_MAJOR == pptr[2] && PCRE2_MINOR >= pptr[3]))?
6358           OP_TRUE : OP_FALSE;
6359     else
6360       code[1+LINK_SIZE] = (PCRE2_MAJOR == pptr[2] && PCRE2_MINOR == pptr[3])?
6361         OP_TRUE : OP_FALSE;
6362     skipunits = 1;
6363     pptr += 3;
6364     goto GROUP_PROCESS_NOTE_EMPTY;
6365 
6366     /* The condition is an assertion, possibly preceded by a callout. */
6367 
6368     case META_COND_ASSERT:
6369     bravalue = OP_COND;
6370     goto GROUP_PROCESS_NOTE_EMPTY;
6371 
6372 
6373     /* ===================================================================*/
6374     /* Handle all kinds of nested bracketed groups. The non-capturing,
6375     non-conditional cases are here; others come to GROUP_PROCESS via goto. */
6376 
6377     case META_LOOKAHEAD:
6378     bravalue = OP_ASSERT;
6379     cb->assert_depth += 1;
6380     goto GROUP_PROCESS;
6381 
6382     case META_LOOKAHEAD_NA:
6383     bravalue = OP_ASSERT_NA;
6384     cb->assert_depth += 1;
6385     goto GROUP_PROCESS;
6386 
6387     /* Optimize (?!) to (*FAIL) unless it is quantified - which is a weird
6388     thing to do, but Perl allows all assertions to be quantified, and when
6389     they contain capturing parentheses there may be a potential use for
6390     this feature. Not that that applies to a quantified (?!) but we allow
6391     it for uniformity. */
6392 
6393     case META_LOOKAHEADNOT:
6394     if (pptr[1] == META_KET &&
6395          (pptr[2] < META_ASTERISK || pptr[2] > META_MINMAX_QUERY))
6396       {
6397       *code++ = OP_FAIL;
6398       pptr++;
6399       }
6400     else
6401       {
6402       bravalue = OP_ASSERT_NOT;
6403       cb->assert_depth += 1;
6404       goto GROUP_PROCESS;
6405       }
6406     break;
6407 
6408     case META_LOOKBEHIND:
6409     bravalue = OP_ASSERTBACK;
6410     cb->assert_depth += 1;
6411     goto GROUP_PROCESS;
6412 
6413     case META_LOOKBEHINDNOT:
6414     bravalue = OP_ASSERTBACK_NOT;
6415     cb->assert_depth += 1;
6416     goto GROUP_PROCESS;
6417 
6418     case META_LOOKBEHIND_NA:
6419     bravalue = OP_ASSERTBACK_NA;
6420     cb->assert_depth += 1;
6421     goto GROUP_PROCESS;
6422 
6423     case META_ATOMIC:
6424     bravalue = OP_ONCE;
6425     goto GROUP_PROCESS_NOTE_EMPTY;
6426 
6427     case META_SCRIPT_RUN:
6428     bravalue = OP_SCRIPT_RUN;
6429     goto GROUP_PROCESS_NOTE_EMPTY;
6430 
6431     case META_NOCAPTURE:
6432     bravalue = OP_BRA;
6433     /* Fall through */
6434 
6435     /* Process nested bracketed regex. The nesting depth is maintained for the
6436     benefit of the stackguard function. The test for too deep nesting is now
6437     done in parse_regex(). Assertion and DEFINE groups come to GROUP_PROCESS;
6438     others come to GROUP_PROCESS_NOTE_EMPTY, to indicate that we need to take
6439     note of whether or not they may match an empty string. */
6440 
6441     GROUP_PROCESS_NOTE_EMPTY:
6442     note_group_empty = TRUE;
6443 
6444     GROUP_PROCESS:
6445     cb->parens_depth += 1;
6446     *code = bravalue;
6447     pptr++;
6448     tempcode = code;
6449     tempreqvary = cb->req_varyopt;        /* Save value before group */
6450     length_prevgroup = 0;                 /* Initialize for pre-compile phase */
6451 
6452     if ((group_return =
6453          compile_regex(
6454          options,                         /* The option state */
6455          &tempcode,                       /* Where to put code (updated) */
6456          &pptr,                           /* Input pointer (updated) */
6457          errorcodeptr,                    /* Where to put an error message */
6458          skipunits,                       /* Skip over bracket number */
6459          &subfirstcu,                     /* For possible first char */
6460          &subfirstcuflags,
6461          &subreqcu,                       /* For possible last char */
6462          &subreqcuflags,
6463          bcptr,                           /* Current branch chain */
6464          cb,                              /* Compile data block */
6465          (lengthptr == NULL)? NULL :      /* Actual compile phase */
6466            &length_prevgroup              /* Pre-compile phase */
6467          )) == 0)
6468       return 0;  /* Error */
6469 
6470     cb->parens_depth -= 1;
6471 
6472     /* If that was a non-conditional significant group (not an assertion, not a
6473     DEFINE) that matches at least one character, then the current item matches
6474     a character. Conditionals are handled below. */
6475 
6476     if (note_group_empty && bravalue != OP_COND && group_return > 0)
6477       matched_char = TRUE;
6478 
6479     /* If we've just compiled an assertion, pop the assert depth. */
6480 
6481     if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NA)
6482       cb->assert_depth -= 1;
6483 
6484     /* At the end of compiling, code is still pointing to the start of the
6485     group, while tempcode has been updated to point past the end of the group.
6486     The parsed pattern pointer (pptr) is on the closing META_KET.
6487 
6488     If this is a conditional bracket, check that there are no more than
6489     two branches in the group, or just one if it's a DEFINE group. We do this
6490     in the real compile phase, not in the pre-pass, where the whole group may
6491     not be available. */
6492 
6493     if (bravalue == OP_COND && lengthptr == NULL)
6494       {
6495       PCRE2_UCHAR *tc = code;
6496       int condcount = 0;
6497 
6498       do {
6499          condcount++;
6500          tc += GET(tc,1);
6501          }
6502       while (*tc != OP_KET);
6503 
6504       /* A DEFINE group is never obeyed inline (the "condition" is always
6505       false). It must have only one branch. Having checked this, change the
6506       opcode to OP_FALSE. */
6507 
6508       if (code[LINK_SIZE+1] == OP_DEFINE)
6509         {
6510         if (condcount > 1)
6511           {
6512           cb->erroroffset = offset;
6513           *errorcodeptr = ERR54;
6514           return 0;
6515           }
6516         code[LINK_SIZE+1] = OP_FALSE;
6517         bravalue = OP_DEFINE;   /* A flag to suppress char handling below */
6518         }
6519 
6520       /* A "normal" conditional group. If there is just one branch, we must not
6521       make use of its firstcu or reqcu, because this is equivalent to an
6522       empty second branch. Also, it may match an empty string. If there are two
6523       branches, this item must match a character if the group must. */
6524 
6525       else
6526         {
6527         if (condcount > 2)
6528           {
6529           cb->erroroffset = offset;
6530           *errorcodeptr = ERR27;
6531           return 0;
6532           }
6533         if (condcount == 1) subfirstcuflags = subreqcuflags = REQ_NONE;
6534           else if (group_return > 0) matched_char = TRUE;
6535         }
6536       }
6537 
6538     /* In the pre-compile phase, update the length by the length of the group,
6539     less the brackets at either end. Then reduce the compiled code to just a
6540     set of non-capturing brackets so that it doesn't use much memory if it is
6541     duplicated by a quantifier. */
6542 
6543     if (lengthptr != NULL)
6544       {
6545       if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
6546         {
6547         *errorcodeptr = ERR20;
6548         return 0;
6549         }
6550       *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
6551       code++;   /* This already contains bravalue */
6552       PUTINC(code, 0, 1 + LINK_SIZE);
6553       *code++ = OP_KET;
6554       PUTINC(code, 0, 1 + LINK_SIZE);
6555       break;    /* No need to waste time with special character handling */
6556       }
6557 
6558     /* Otherwise update the main code pointer to the end of the group. */
6559 
6560     code = tempcode;
6561 
6562     /* For a DEFINE group, required and first character settings are not
6563     relevant. */
6564 
6565     if (bravalue == OP_DEFINE) break;
6566 
6567     /* Handle updating of the required and first code units for other types of
6568     group. Update for normal brackets of all kinds, and conditions with two
6569     branches (see code above). If the bracket is followed by a quantifier with
6570     zero repeat, we have to back off. Hence the definition of zeroreqcu and
6571     zerofirstcu outside the main loop so that they can be accessed for the back
6572     off. */
6573 
6574     zeroreqcu = reqcu;
6575     zeroreqcuflags = reqcuflags;
6576     zerofirstcu = firstcu;
6577     zerofirstcuflags = firstcuflags;
6578     groupsetfirstcu = FALSE;
6579 
6580     if (bravalue >= OP_ONCE)  /* Not an assertion */
6581       {
6582       /* If we have not yet set a firstcu in this branch, take it from the
6583       subpattern, remembering that it was set here so that a repeat of more
6584       than one can replicate it as reqcu if necessary. If the subpattern has
6585       no firstcu, set "none" for the whole branch. In both cases, a zero
6586       repeat forces firstcu to "none". */
6587 
6588       if (firstcuflags == REQ_UNSET && subfirstcuflags != REQ_UNSET)
6589         {
6590         if (subfirstcuflags >= 0)
6591           {
6592           firstcu = subfirstcu;
6593           firstcuflags = subfirstcuflags;
6594           groupsetfirstcu = TRUE;
6595           }
6596         else firstcuflags = REQ_NONE;
6597         zerofirstcuflags = REQ_NONE;
6598         }
6599 
6600       /* If firstcu was previously set, convert the subpattern's firstcu
6601       into reqcu if there wasn't one, using the vary flag that was in
6602       existence beforehand. */
6603 
6604       else if (subfirstcuflags >= 0 && subreqcuflags < 0)
6605         {
6606         subreqcu = subfirstcu;
6607         subreqcuflags = subfirstcuflags | tempreqvary;
6608         }
6609 
6610       /* If the subpattern set a required code unit (or set a first code unit
6611       that isn't really the first code unit - see above), set it. */
6612 
6613       if (subreqcuflags >= 0)
6614         {
6615         reqcu = subreqcu;
6616         reqcuflags = subreqcuflags;
6617         }
6618       }
6619 
6620     /* For a forward assertion, we take the reqcu, if set, provided that the
6621     group has also set a firstcu. This can be helpful if the pattern that
6622     follows the assertion doesn't set a different char. For example, it's
6623     useful for /(?=abcde).+/. We can't set firstcu for an assertion, however
6624     because it leads to incorrect effect for patterns such as /(?=a)a.+/ when
6625     the "real" "a" would then become a reqcu instead of a firstcu. This is
6626     overcome by a scan at the end if there's no firstcu, looking for an
6627     asserted first char. A similar effect for patterns like /(?=.*X)X$/ means
6628     we must only take the reqcu when the group also set a firstcu. Otherwise,
6629     in that example, 'X' ends up set for both. */
6630 
6631     else if ((bravalue == OP_ASSERT || bravalue == OP_ASSERT_NA) &&
6632              subreqcuflags >= 0 && subfirstcuflags >= 0)
6633       {
6634       reqcu = subreqcu;
6635       reqcuflags = subreqcuflags;
6636       }
6637 
6638     break;  /* End of nested group handling */
6639 
6640 
6641     /* ===================================================================*/
6642     /* Handle named backreferences and recursions. */
6643 
6644     case META_BACKREF_BYNAME:
6645     case META_RECURSE_BYNAME:
6646       {
6647       int count, index;
6648       PCRE2_SPTR name;
6649       BOOL is_dupname = FALSE;
6650       named_group *ng = cb->named_groups;
6651       uint32_t length = *(++pptr);
6652 
6653       GETPLUSOFFSET(offset, pptr);
6654       name = cb->start_pattern + offset;
6655 
6656       /* In the first pass, the names generated in the pre-pass are available,
6657       but the main name table has not yet been created. Scan the list of names
6658       generated in the pre-pass in order to get a number and whether or not
6659       this name is duplicated. */
6660 
6661       groupnumber = 0;
6662       for (i = 0; i < cb->names_found; i++, ng++)
6663         {
6664         if (length == ng->length &&
6665             PRIV(strncmp)(name, ng->name, length) == 0)
6666           {
6667           is_dupname = ng->isdup;
6668           groupnumber = ng->number;
6669 
6670           /* For a recursion, that's all that is needed. We can now go to
6671           the code that handles numerical recursion, applying it to the first
6672           group with the given name. */
6673 
6674           if (meta == META_RECURSE_BYNAME)
6675             {
6676             meta_arg = groupnumber;
6677             goto HANDLE_NUMERICAL_RECURSION;
6678             }
6679 
6680           /* For a back reference, update the back reference map and the
6681           maximum back reference. */
6682 
6683           cb->backref_map |= (groupnumber < 32)? (1u << groupnumber) : 1;
6684           if (groupnumber > cb->top_backref)
6685             cb->top_backref = groupnumber;
6686           }
6687         }
6688 
6689       /* If the name was not found we have a bad reference. */
6690 
6691       if (groupnumber == 0)
6692         {
6693         *errorcodeptr = ERR15;
6694         cb->erroroffset = offset;
6695         return 0;
6696         }
6697 
6698       /* If a back reference name is not duplicated, we can handle it as
6699       a numerical reference. */
6700 
6701       if (!is_dupname)
6702         {
6703         meta_arg = groupnumber;
6704         goto HANDLE_SINGLE_REFERENCE;
6705         }
6706 
6707       /* If a back reference name is duplicated, we generate a different
6708       opcode to a numerical back reference. In the second pass we must
6709       search for the index and count in the final name table. */
6710 
6711       count = 0;  /* Values for first pass (avoids compiler warning) */
6712       index = 0;
6713       if (lengthptr == NULL && !find_dupname_details(name, length, &index,
6714             &count, errorcodeptr, cb)) return 0;
6715 
6716       if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6717       *code++ = ((options & PCRE2_CASELESS) != 0)? OP_DNREFI : OP_DNREF;
6718       PUT2INC(code, 0, index);
6719       PUT2INC(code, 0, count);
6720       }
6721     break;
6722 
6723 
6724     /* ===================================================================*/
6725     /* Handle a numerical callout. */
6726 
6727     case META_CALLOUT_NUMBER:
6728     code[0] = OP_CALLOUT;
6729     PUT(code, 1, pptr[1]);               /* Offset to next pattern item */
6730     PUT(code, 1 + LINK_SIZE, pptr[2]);   /* Length of next pattern item */
6731     code[1 + 2*LINK_SIZE] = pptr[3];
6732     pptr += 3;
6733     code += PRIV(OP_lengths)[OP_CALLOUT];
6734     break;
6735 
6736 
6737     /* ===================================================================*/
6738     /* Handle a callout with a string argument. In the pre-pass we just compute
6739     the length without generating anything. The length in pptr[3] includes both
6740     delimiters; in the actual compile only the first one is copied, but a
6741     terminating zero is added. Any doubled delimiters within the string make
6742     this an overestimate, but it is not worth bothering about. */
6743 
6744     case META_CALLOUT_STRING:
6745     if (lengthptr != NULL)
6746       {
6747       *lengthptr += pptr[3] + (1 + 4*LINK_SIZE);
6748       pptr += 3;
6749       SKIPOFFSET(pptr);
6750       }
6751 
6752     /* In the real compile we can copy the string. The starting delimiter is
6753      included so that the client can discover it if they want. We also pass the
6754      start offset to help a script language give better error messages. */
6755 
6756     else
6757       {
6758       PCRE2_SPTR pp;
6759       uint32_t delimiter;
6760       uint32_t length = pptr[3];
6761       PCRE2_UCHAR *callout_string = code + (1 + 4*LINK_SIZE);
6762 
6763       code[0] = OP_CALLOUT_STR;
6764       PUT(code, 1, pptr[1]);               /* Offset to next pattern item */
6765       PUT(code, 1 + LINK_SIZE, pptr[2]);   /* Length of next pattern item */
6766 
6767       pptr += 3;
6768       GETPLUSOFFSET(offset, pptr);         /* Offset to string in pattern */
6769       pp = cb->start_pattern + offset;
6770       delimiter = *callout_string++ = *pp++;
6771       if (delimiter == CHAR_LEFT_CURLY_BRACKET)
6772         delimiter = CHAR_RIGHT_CURLY_BRACKET;
6773       PUT(code, 1 + 3*LINK_SIZE, (int)(offset + 1));  /* One after delimiter */
6774 
6775       /* The syntax of the pattern was checked in the parsing scan. The length
6776       includes both delimiters, but we have passed the opening one just above,
6777       so we reduce length before testing it. The test is for > 1 because we do
6778       not want to copy the final delimiter. This also ensures that pp[1] is
6779       accessible. */
6780 
6781       while (--length > 1)
6782         {
6783         if (*pp == delimiter && pp[1] == delimiter)
6784           {
6785           *callout_string++ = delimiter;
6786           pp += 2;
6787           length--;
6788           }
6789         else *callout_string++ = *pp++;
6790         }
6791       *callout_string++ = CHAR_NUL;
6792 
6793       /* Set the length of the entire item, then advance to its end. */
6794 
6795       PUT(code, 1 + 2*LINK_SIZE, (int)(callout_string - code));
6796       code = callout_string;
6797       }
6798     break;
6799 
6800 
6801     /* ===================================================================*/
6802     /* Handle repetition. The different types are all sorted out in the parsing
6803     pass. */
6804 
6805     case META_MINMAX_PLUS:
6806     case META_MINMAX_QUERY:
6807     case META_MINMAX:
6808     repeat_min = *(++pptr);
6809     repeat_max = *(++pptr);
6810     goto REPEAT;
6811 
6812     case META_ASTERISK:
6813     case META_ASTERISK_PLUS:
6814     case META_ASTERISK_QUERY:
6815     repeat_min = 0;
6816     repeat_max = REPEAT_UNLIMITED;
6817     goto REPEAT;
6818 
6819     case META_PLUS:
6820     case META_PLUS_PLUS:
6821     case META_PLUS_QUERY:
6822     repeat_min = 1;
6823     repeat_max = REPEAT_UNLIMITED;
6824     goto REPEAT;
6825 
6826     case META_QUERY:
6827     case META_QUERY_PLUS:
6828     case META_QUERY_QUERY:
6829     repeat_min = 0;
6830     repeat_max = 1;
6831 
6832     REPEAT:
6833     if (previous_matched_char && repeat_min > 0) matched_char = TRUE;
6834 
6835     /* Remember whether this is a variable length repeat, and default to
6836     single-char opcodes. */
6837 
6838     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
6839     op_type = 0;
6840 
6841     /* Adjust first and required code units for a zero repeat. */
6842 
6843     if (repeat_min == 0)
6844       {
6845       firstcu = zerofirstcu;
6846       firstcuflags = zerofirstcuflags;
6847       reqcu = zeroreqcu;
6848       reqcuflags = zeroreqcuflags;
6849       }
6850 
6851     /* Note the greediness and possessiveness. */
6852 
6853     switch (meta)
6854       {
6855       case META_MINMAX_PLUS:
6856       case META_ASTERISK_PLUS:
6857       case META_PLUS_PLUS:
6858       case META_QUERY_PLUS:
6859       repeat_type = 0;                  /* Force greedy */
6860       possessive_quantifier = TRUE;
6861       break;
6862 
6863       case META_MINMAX_QUERY:
6864       case META_ASTERISK_QUERY:
6865       case META_PLUS_QUERY:
6866       case META_QUERY_QUERY:
6867       repeat_type = greedy_non_default;
6868       possessive_quantifier = FALSE;
6869       break;
6870 
6871       default:
6872       repeat_type = greedy_default;
6873       possessive_quantifier = FALSE;
6874       break;
6875       }
6876 
6877     /* Save start of previous item, in case we have to move it up in order to
6878     insert something before it, and remember what it was. */
6879 
6880     tempcode = previous;
6881     op_previous = *previous;
6882 
6883     /* Now handle repetition for the different types of item. If the repeat
6884     minimum and the repeat maximum are both 1, we can ignore the quantifier for
6885     non-parenthesized items, as they have only one alternative. For anything in
6886     parentheses, we must not ignore if {1} is possessive. */
6887 
6888     switch (op_previous)
6889       {
6890       /* If previous was a character or negated character match, abolish the
6891       item and generate a repeat item instead. If a char item has a minimum of
6892       more than one, ensure that it is set in reqcu - it might not be if a
6893       sequence such as x{3} is the first thing in a branch because the x will
6894       have gone into firstcu instead.  */
6895 
6896       case OP_CHAR:
6897       case OP_CHARI:
6898       case OP_NOT:
6899       case OP_NOTI:
6900       if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;
6901       op_type = chartypeoffset[op_previous - OP_CHAR];
6902 
6903       /* Deal with UTF characters that take up more than one code unit. */
6904 
6905 #ifdef MAYBE_UTF_MULTI
6906       if (utf && NOT_FIRSTCU(code[-1]))
6907         {
6908         PCRE2_UCHAR *lastchar = code - 1;
6909         BACKCHAR(lastchar);
6910         mclength = (uint32_t)(code - lastchar);   /* Length of UTF character */
6911         memcpy(mcbuffer, lastchar, CU2BYTES(mclength));  /* Save the char */
6912         }
6913       else
6914 #endif  /* MAYBE_UTF_MULTI */
6915 
6916       /* Handle the case of a single code unit - either with no UTF support, or
6917       with UTF disabled, or for a single-code-unit UTF character. */
6918         {
6919         mcbuffer[0] = code[-1];
6920         mclength = 1;
6921         if (op_previous <= OP_CHARI && repeat_min > 1)
6922           {
6923           reqcu = mcbuffer[0];
6924           reqcuflags = req_caseopt | cb->req_varyopt;
6925           }
6926         }
6927       goto OUTPUT_SINGLE_REPEAT;  /* Code shared with single character types */
6928 
6929       /* If previous was a character class or a back reference, we put the
6930       repeat stuff after it, but just skip the item if the repeat was {0,0}. */
6931 
6932 #ifdef SUPPORT_WIDE_CHARS
6933       case OP_XCLASS:
6934 #endif
6935       case OP_CLASS:
6936       case OP_NCLASS:
6937       case OP_REF:
6938       case OP_REFI:
6939       case OP_DNREF:
6940       case OP_DNREFI:
6941 
6942       if (repeat_max == 0)
6943         {
6944         code = previous;
6945         goto END_REPEAT;
6946         }
6947       if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;
6948 
6949       if (repeat_min == 0 && repeat_max == REPEAT_UNLIMITED)
6950         *code++ = OP_CRSTAR + repeat_type;
6951       else if (repeat_min == 1 && repeat_max == REPEAT_UNLIMITED)
6952         *code++ = OP_CRPLUS + repeat_type;
6953       else if (repeat_min == 0 && repeat_max == 1)
6954         *code++ = OP_CRQUERY + repeat_type;
6955       else
6956         {
6957         *code++ = OP_CRRANGE + repeat_type;
6958         PUT2INC(code, 0, repeat_min);
6959         if (repeat_max == REPEAT_UNLIMITED) repeat_max = 0;  /* 2-byte encoding for max */
6960         PUT2INC(code, 0, repeat_max);
6961         }
6962       break;
6963 
6964       /* If previous is OP_FAIL, it was generated by an empty class []
6965       (PCRE2_ALLOW_EMPTY_CLASS is set). The other ways in which OP_FAIL can be
6966       generated, that is by (*FAIL) or (?!), disallow a quantifier at parse
6967       time. We can just ignore this repeat. */
6968 
6969       case OP_FAIL:
6970       goto END_REPEAT;
6971 
6972       /* Prior to 10.30, repeated recursions were wrapped in OP_ONCE brackets
6973       because pcre2_match() could not handle backtracking into recursively
6974       called groups. Now that this backtracking is available, we no longer need
6975       to do this. However, we still need to replicate recursions as we do for
6976       groups so as to have independent backtracking points. We can replicate
6977       for the minimum number of repeats directly. For optional repeats we now
6978       wrap the recursion in OP_BRA brackets and make use of the bracket
6979       repetition. */
6980 
6981       case OP_RECURSE:
6982       if (repeat_max == 1 && repeat_min == 1 && !possessive_quantifier)
6983         goto END_REPEAT;
6984 
6985       /* Generate unwrapped repeats for a non-zero minimum, except when the
6986       minimum is 1 and the maximum unlimited, because that can be handled with
6987       OP_BRA terminated by OP_KETRMAX/MIN. When the maximum is equal to the
6988       minimum, we just need to generate the appropriate additional copies.
6989       Otherwise we need to generate one more, to simulate the situation when
6990       the minimum is zero. */
6991 
6992       if (repeat_min > 0 && (repeat_min != 1 || repeat_max != REPEAT_UNLIMITED))
6993         {
6994         int replicate = repeat_min;
6995         if (repeat_min == repeat_max) replicate--;
6996 
6997         /* In the pre-compile phase, we don't actually do the replication. We
6998         just adjust the length as if we had. Do some paranoid checks for
6999         potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit
7000         integer type when available, otherwise double. */
7001 
7002         if (lengthptr != NULL)
7003           {
7004           PCRE2_SIZE delta = replicate*(1 + LINK_SIZE);
7005           if ((INT64_OR_DOUBLE)replicate*
7006                 (INT64_OR_DOUBLE)(1 + LINK_SIZE) >
7007                   (INT64_OR_DOUBLE)INT_MAX ||
7008               OFLOW_MAX - *lengthptr < delta)
7009             {
7010             *errorcodeptr = ERR20;
7011             return 0;
7012             }
7013           *lengthptr += delta;
7014           }
7015 
7016         else for (i = 0; i < replicate; i++)
7017           {
7018           memcpy(code, previous, CU2BYTES(1 + LINK_SIZE));
7019           previous = code;
7020           code += 1 + LINK_SIZE;
7021           }
7022 
7023         /* If the number of repeats is fixed, we are done. Otherwise, adjust
7024         the counts and fall through. */
7025 
7026         if (repeat_min == repeat_max) break;
7027         if (repeat_max != REPEAT_UNLIMITED) repeat_max -= repeat_min;
7028         repeat_min = 0;
7029         }
7030 
7031       /* Wrap the recursion call in OP_BRA brackets. */
7032 
7033       (void)memmove(previous + 1 + LINK_SIZE, previous, CU2BYTES(1 + LINK_SIZE));
7034       op_previous = *previous = OP_BRA;
7035       PUT(previous, 1, 2 + 2*LINK_SIZE);
7036       previous[2 + 2*LINK_SIZE] = OP_KET;
7037       PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE);
7038       code += 2 + 2 * LINK_SIZE;
7039       length_prevgroup = 3 + 3*LINK_SIZE;
7040       group_return = -1;  /* Set "may match empty string" */
7041 
7042       /* Now treat as a repeated OP_BRA. */
7043       /* Fall through */
7044 
7045       /* If previous was a bracket group, we may have to replicate it in
7046       certain cases. Note that at this point we can encounter only the "basic"
7047       bracket opcodes such as BRA and CBRA, as this is the place where they get
7048       converted into the more special varieties such as BRAPOS and SBRA.
7049       Originally, PCRE did not allow repetition of assertions, but now it does,
7050       for Perl compatibility. */
7051 
7052       case OP_ASSERT:
7053       case OP_ASSERT_NOT:
7054       case OP_ASSERT_NA:
7055       case OP_ASSERTBACK:
7056       case OP_ASSERTBACK_NOT:
7057       case OP_ASSERTBACK_NA:
7058       case OP_ONCE:
7059       case OP_SCRIPT_RUN:
7060       case OP_BRA:
7061       case OP_CBRA:
7062       case OP_COND:
7063         {
7064         int len = (int)(code - previous);
7065         PCRE2_UCHAR *bralink = NULL;
7066         PCRE2_UCHAR *brazeroptr = NULL;
7067 
7068         if (repeat_max == 1 && repeat_min == 1 && !possessive_quantifier)
7069           goto END_REPEAT;
7070 
7071         /* Repeating a DEFINE group (or any group where the condition is always
7072         FALSE and there is only one branch) is pointless, but Perl allows the
7073         syntax, so we just ignore the repeat. */
7074 
7075         if (op_previous == OP_COND && previous[LINK_SIZE+1] == OP_FALSE &&
7076             previous[GET(previous, 1)] != OP_ALT)
7077           goto END_REPEAT;
7078 
7079         /* Perl allows all assertions to be quantified, and when they contain
7080         capturing parentheses and/or are optional there are potential uses for
7081         this feature. PCRE2 used to force the maximum quantifier to 1 on the
7082         invalid grounds that further repetition was never useful. This was
7083         always a bit pointless, since an assertion could be wrapped with a
7084         repeated group to achieve the effect. General repetition is now
7085         permitted, but if the maximum is unlimited it is set to one more than
7086         the minimum. */
7087 
7088         if (op_previous < OP_ONCE)    /* Assertion */
7089           {
7090           if (repeat_max == REPEAT_UNLIMITED) repeat_max = repeat_min + 1;
7091           }
7092 
7093         /* The case of a zero minimum is special because of the need to stick
7094         OP_BRAZERO in front of it, and because the group appears once in the
7095         data, whereas in other cases it appears the minimum number of times. For
7096         this reason, it is simplest to treat this case separately, as otherwise
7097         the code gets far too messy. There are several special subcases when the
7098         minimum is zero. */
7099 
7100         if (repeat_min == 0)
7101           {
7102           /* If the maximum is also zero, we used to just omit the group from
7103           the output altogether, like this:
7104 
7105           ** if (repeat_max == 0)
7106           **   {
7107           **   code = previous;
7108           **   goto END_REPEAT;
7109           **   }
7110 
7111           However, that fails when a group or a subgroup within it is
7112           referenced as a subroutine from elsewhere in the pattern, so now we
7113           stick in OP_SKIPZERO in front of it so that it is skipped on
7114           execution. As we don't have a list of which groups are referenced, we
7115           cannot do this selectively.
7116 
7117           If the maximum is 1 or unlimited, we just have to stick in the
7118           BRAZERO and do no more at this point. */
7119 
7120           if (repeat_max <= 1 || repeat_max == REPEAT_UNLIMITED)
7121             {
7122             (void)memmove(previous + 1, previous, CU2BYTES(len));
7123             code++;
7124             if (repeat_max == 0)
7125               {
7126               *previous++ = OP_SKIPZERO;
7127               goto END_REPEAT;
7128               }
7129             brazeroptr = previous;    /* Save for possessive optimizing */
7130             *previous++ = OP_BRAZERO + repeat_type;
7131             }
7132 
7133           /* If the maximum is greater than 1 and limited, we have to replicate
7134           in a nested fashion, sticking OP_BRAZERO before each set of brackets.
7135           The first one has to be handled carefully because it's the original
7136           copy, which has to be moved up. The remainder can be handled by code
7137           that is common with the non-zero minimum case below. We have to
7138           adjust the value of repeat_max, since one less copy is required. */
7139 
7140           else
7141             {
7142             int linkoffset;
7143             (void)memmove(previous + 2 + LINK_SIZE, previous, CU2BYTES(len));
7144             code += 2 + LINK_SIZE;
7145             *previous++ = OP_BRAZERO + repeat_type;
7146             *previous++ = OP_BRA;
7147 
7148             /* We chain together the bracket link offset fields that have to be
7149             filled in later when the ends of the brackets are reached. */
7150 
7151             linkoffset = (bralink == NULL)? 0 : (int)(previous - bralink);
7152             bralink = previous;
7153             PUTINC(previous, 0, linkoffset);
7154             }
7155 
7156           if (repeat_max != REPEAT_UNLIMITED) repeat_max--;
7157           }
7158 
7159         /* If the minimum is greater than zero, replicate the group as many
7160         times as necessary, and adjust the maximum to the number of subsequent
7161         copies that we need. */
7162 
7163         else
7164           {
7165           if (repeat_min > 1)
7166             {
7167             /* In the pre-compile phase, we don't actually do the replication.
7168             We just adjust the length as if we had. Do some paranoid checks for
7169             potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit
7170             integer type when available, otherwise double. */
7171 
7172             if (lengthptr != NULL)
7173               {
7174               PCRE2_SIZE delta = (repeat_min - 1)*length_prevgroup;
7175               if ((INT64_OR_DOUBLE)(repeat_min - 1)*
7176                     (INT64_OR_DOUBLE)length_prevgroup >
7177                       (INT64_OR_DOUBLE)INT_MAX ||
7178                   OFLOW_MAX - *lengthptr < delta)
7179                 {
7180                 *errorcodeptr = ERR20;
7181                 return 0;
7182                 }
7183               *lengthptr += delta;
7184               }
7185 
7186             /* This is compiling for real. If there is a set first code unit
7187             for the group, and we have not yet set a "required code unit", set
7188             it. */
7189 
7190             else
7191               {
7192               if (groupsetfirstcu && reqcuflags < 0)
7193                 {
7194                 reqcu = firstcu;
7195                 reqcuflags = firstcuflags;
7196                 }
7197               for (i = 1; (uint32_t)i < repeat_min; i++)
7198                 {
7199                 memcpy(code, previous, CU2BYTES(len));
7200                 code += len;
7201                 }
7202               }
7203             }
7204 
7205           if (repeat_max != REPEAT_UNLIMITED) repeat_max -= repeat_min;
7206           }
7207 
7208         /* This code is common to both the zero and non-zero minimum cases. If
7209         the maximum is limited, it replicates the group in a nested fashion,
7210         remembering the bracket starts on a stack. In the case of a zero
7211         minimum, the first one was set up above. In all cases the repeat_max
7212         now specifies the number of additional copies needed. Again, we must
7213         remember to replicate entries on the forward reference list. */
7214 
7215         if (repeat_max != REPEAT_UNLIMITED)
7216           {
7217           /* In the pre-compile phase, we don't actually do the replication. We
7218           just adjust the length as if we had. For each repetition we must add
7219           1 to the length for BRAZERO and for all but the last repetition we
7220           must add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
7221           paranoid checks to avoid integer overflow. The INT64_OR_DOUBLE type
7222           is a 64-bit integer type when available, otherwise double. */
7223 
7224           if (lengthptr != NULL && repeat_max > 0)
7225             {
7226             PCRE2_SIZE delta = repeat_max*(length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
7227                         2 - 2*LINK_SIZE;   /* Last one doesn't nest */
7228             if ((INT64_OR_DOUBLE)repeat_max *
7229                   (INT64_OR_DOUBLE)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
7230                     > (INT64_OR_DOUBLE)INT_MAX ||
7231                 OFLOW_MAX - *lengthptr < delta)
7232               {
7233               *errorcodeptr = ERR20;
7234               return 0;
7235               }
7236             *lengthptr += delta;
7237             }
7238 
7239           /* This is compiling for real */
7240 
7241           else for (i = repeat_max - 1; i >= 0; i--)
7242             {
7243             *code++ = OP_BRAZERO + repeat_type;
7244 
7245             /* All but the final copy start a new nesting, maintaining the
7246             chain of brackets outstanding. */
7247 
7248             if (i != 0)
7249               {
7250               int linkoffset;
7251               *code++ = OP_BRA;
7252               linkoffset = (bralink == NULL)? 0 : (int)(code - bralink);
7253               bralink = code;
7254               PUTINC(code, 0, linkoffset);
7255               }
7256 
7257             memcpy(code, previous, CU2BYTES(len));
7258             code += len;
7259             }
7260 
7261           /* Now chain through the pending brackets, and fill in their length
7262           fields (which are holding the chain links pro tem). */
7263 
7264           while (bralink != NULL)
7265             {
7266             int oldlinkoffset;
7267             int linkoffset = (int)(code - bralink + 1);
7268             PCRE2_UCHAR *bra = code - linkoffset;
7269             oldlinkoffset = GET(bra, 1);
7270             bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
7271             *code++ = OP_KET;
7272             PUTINC(code, 0, linkoffset);
7273             PUT(bra, 1, linkoffset);
7274             }
7275           }
7276 
7277         /* If the maximum is unlimited, set a repeater in the final copy. For
7278         SCRIPT_RUN and ONCE brackets, that's all we need to do. However,
7279         possessively repeated ONCE brackets can be converted into non-capturing
7280         brackets, as the behaviour of (?:xx)++ is the same as (?>xx)++ and this
7281         saves having to deal with possessive ONCEs specially.
7282 
7283         Otherwise, when we are doing the actual compile phase, check to see
7284         whether this group is one that could match an empty string. If so,
7285         convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
7286         that runtime checking can be done. [This check is also applied to ONCE
7287         and SCRIPT_RUN groups at runtime, but in a different way.]
7288 
7289         Then, if the quantifier was possessive and the bracket is not a
7290         conditional, we convert the BRA code to the POS form, and the KET code
7291         to KETRPOS. (It turns out to be convenient at runtime to detect this
7292         kind of subpattern at both the start and at the end.) The use of
7293         special opcodes makes it possible to reduce greatly the stack usage in
7294         pcre2_match(). If the group is preceded by OP_BRAZERO, convert this to
7295         OP_BRAPOSZERO.
7296 
7297         Then, if the minimum number of matches is 1 or 0, cancel the possessive
7298         flag so that the default action below, of wrapping everything inside
7299         atomic brackets, does not happen. When the minimum is greater than 1,
7300         there will be earlier copies of the group, and so we still have to wrap
7301         the whole thing. */
7302 
7303         else
7304           {
7305           PCRE2_UCHAR *ketcode = code - 1 - LINK_SIZE;
7306           PCRE2_UCHAR *bracode = ketcode - GET(ketcode, 1);
7307 
7308           /* Convert possessive ONCE brackets to non-capturing */
7309 
7310           if (*bracode == OP_ONCE && possessive_quantifier) *bracode = OP_BRA;
7311 
7312           /* For non-possessive ONCE and for SCRIPT_RUN brackets, all we need
7313           to do is to set the KET. */
7314 
7315           if (*bracode == OP_ONCE || *bracode == OP_SCRIPT_RUN)
7316             *ketcode = OP_KETRMAX + repeat_type;
7317 
7318           /* Handle non-SCRIPT_RUN and non-ONCE brackets and possessive ONCEs
7319           (which have been converted to non-capturing above). */
7320 
7321           else
7322             {
7323             /* In the compile phase, adjust the opcode if the group can match
7324             an empty string. For a conditional group with only one branch, the
7325             value of group_return will not show "could be empty", so we must
7326             check that separately. */
7327 
7328             if (lengthptr == NULL)
7329               {
7330               if (group_return < 0) *bracode += OP_SBRA - OP_BRA;
7331               if (*bracode == OP_COND && bracode[GET(bracode,1)] != OP_ALT)
7332                 *bracode = OP_SCOND;
7333               }
7334 
7335             /* Handle possessive quantifiers. */
7336 
7337             if (possessive_quantifier)
7338               {
7339               /* For COND brackets, we wrap the whole thing in a possessively
7340               repeated non-capturing bracket, because we have not invented POS
7341               versions of the COND opcodes. */
7342 
7343               if (*bracode == OP_COND || *bracode == OP_SCOND)
7344                 {
7345                 int nlen = (int)(code - bracode);
7346                 (void)memmove(bracode + 1 + LINK_SIZE, bracode, CU2BYTES(nlen));
7347                 code += 1 + LINK_SIZE;
7348                 nlen += 1 + LINK_SIZE;
7349                 *bracode = (*bracode == OP_COND)? OP_BRAPOS : OP_SBRAPOS;
7350                 *code++ = OP_KETRPOS;
7351                 PUTINC(code, 0, nlen);
7352                 PUT(bracode, 1, nlen);
7353                 }
7354 
7355               /* For non-COND brackets, we modify the BRA code and use KETRPOS. */
7356 
7357               else
7358                 {
7359                 *bracode += 1;              /* Switch to xxxPOS opcodes */
7360                 *ketcode = OP_KETRPOS;
7361                 }
7362 
7363               /* If the minimum is zero, mark it as possessive, then unset the
7364               possessive flag when the minimum is 0 or 1. */
7365 
7366               if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;
7367               if (repeat_min < 2) possessive_quantifier = FALSE;
7368               }
7369 
7370             /* Non-possessive quantifier */
7371 
7372             else *ketcode = OP_KETRMAX + repeat_type;
7373             }
7374           }
7375         }
7376       break;
7377 
7378       /* If previous was a character type match (\d or similar), abolish it and
7379       create a suitable repeat item. The code is shared with single-character
7380       repeats by setting op_type to add a suitable offset into repeat_type.
7381       Note that the Unicode property types will be present only when
7382       SUPPORT_UNICODE is defined, but we don't wrap the little bits of code
7383       here because it just makes it horribly messy. */
7384 
7385       default:
7386       if (op_previous >= OP_EODN)   /* Not a character type - internal error */
7387         {
7388         *errorcodeptr = ERR10;
7389         return 0;
7390         }
7391       else
7392         {
7393         int prop_type, prop_value;
7394         PCRE2_UCHAR *oldcode;
7395 
7396         if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;
7397 
7398         op_type = OP_TYPESTAR - OP_STAR;      /* Use type opcodes */
7399         mclength = 0;                         /* Not a character */
7400 
7401         if (op_previous == OP_PROP || op_previous == OP_NOTPROP)
7402           {
7403           prop_type = previous[1];
7404           prop_value = previous[2];
7405           }
7406         else
7407           {
7408           /* Come here from just above with a character in mcbuffer/mclength. */
7409           OUTPUT_SINGLE_REPEAT:
7410           prop_type = prop_value = -1;
7411           }
7412 
7413         /* At this point, if prop_type == prop_value == -1 we either have a
7414         character in mcbuffer when mclength is greater than zero, or we have
7415         mclength zero, in which case there is a non-property character type in
7416         op_previous. If prop_type/value are not negative, we have a property
7417         character type in op_previous. */
7418 
7419         oldcode = code;                   /* Save where we were */
7420         code = previous;                  /* Usually overwrite previous item */
7421 
7422         /* If the maximum is zero then the minimum must also be zero; Perl allows
7423         this case, so we do too - by simply omitting the item altogether. */
7424 
7425         if (repeat_max == 0) goto END_REPEAT;
7426 
7427         /* Combine the op_type with the repeat_type */
7428 
7429         repeat_type += op_type;
7430 
7431         /* A minimum of zero is handled either as the special case * or ?, or as
7432         an UPTO, with the maximum given. */
7433 
7434         if (repeat_min == 0)
7435           {
7436           if (repeat_max == REPEAT_UNLIMITED) *code++ = OP_STAR + repeat_type;
7437             else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
7438           else
7439             {
7440             *code++ = OP_UPTO + repeat_type;
7441             PUT2INC(code, 0, repeat_max);
7442             }
7443           }
7444 
7445         /* A repeat minimum of 1 is optimized into some special cases. If the
7446         maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
7447         left in place and, if the maximum is greater than 1, we use OP_UPTO with
7448         one less than the maximum. */
7449 
7450         else if (repeat_min == 1)
7451           {
7452           if (repeat_max == REPEAT_UNLIMITED)
7453             *code++ = OP_PLUS + repeat_type;
7454           else
7455             {
7456             code = oldcode;  /* Leave previous item in place */
7457             if (repeat_max == 1) goto END_REPEAT;
7458             *code++ = OP_UPTO + repeat_type;
7459             PUT2INC(code, 0, repeat_max - 1);
7460             }
7461           }
7462 
7463         /* The case {n,n} is just an EXACT, while the general case {n,m} is
7464         handled as an EXACT followed by an UPTO or STAR or QUERY. */
7465 
7466         else
7467           {
7468           *code++ = OP_EXACT + op_type;  /* NB EXACT doesn't have repeat_type */
7469           PUT2INC(code, 0, repeat_min);
7470 
7471           /* Unless repeat_max equals repeat_min, fill in the data for EXACT,
7472           and then generate the second opcode. For a repeated Unicode property
7473           match, there are two extra values that define the required property,
7474           and mclength is set zero to indicate this. */
7475 
7476           if (repeat_max != repeat_min)
7477             {
7478             if (mclength > 0)
7479               {
7480               memcpy(code, mcbuffer, CU2BYTES(mclength));
7481               code += mclength;
7482               }
7483             else
7484               {
7485               *code++ = op_previous;
7486               if (prop_type >= 0)
7487                 {
7488                 *code++ = prop_type;
7489                 *code++ = prop_value;
7490                 }
7491               }
7492 
7493             /* Now set up the following opcode */
7494 
7495             if (repeat_max == REPEAT_UNLIMITED)
7496               *code++ = OP_STAR + repeat_type;
7497             else
7498               {
7499               repeat_max -= repeat_min;
7500               if (repeat_max == 1)
7501                 {
7502                 *code++ = OP_QUERY + repeat_type;
7503                 }
7504               else
7505                 {
7506                 *code++ = OP_UPTO + repeat_type;
7507                 PUT2INC(code, 0, repeat_max);
7508                 }
7509               }
7510             }
7511           }
7512 
7513         /* Fill in the character or character type for the final opcode. */
7514 
7515         if (mclength > 0)
7516           {
7517           memcpy(code, mcbuffer, CU2BYTES(mclength));
7518           code += mclength;
7519           }
7520         else
7521           {
7522           *code++ = op_previous;
7523           if (prop_type >= 0)
7524             {
7525             *code++ = prop_type;
7526             *code++ = prop_value;
7527             }
7528           }
7529         }
7530       break;
7531       }  /* End of switch on different op_previous values */
7532 
7533 
7534     /* If the character following a repeat is '+', possessive_quantifier is
7535     TRUE. For some opcodes, there are special alternative opcodes for this
7536     case. For anything else, we wrap the entire repeated item inside OP_ONCE
7537     brackets. Logically, the '+' notation is just syntactic sugar, taken from
7538     Sun's Java package, but the special opcodes can optimize it.
7539 
7540     Some (but not all) possessively repeated subpatterns have already been
7541     completely handled in the code just above. For them, possessive_quantifier
7542     is always FALSE at this stage. Note that the repeated item starts at
7543     tempcode, not at previous, which might be the first part of a string whose
7544     (former) last char we repeated. */
7545 
7546     if (possessive_quantifier)
7547       {
7548       int len;
7549 
7550       /* Possessifying an EXACT quantifier has no effect, so we can ignore it.
7551       However, QUERY, STAR, or UPTO may follow (for quantifiers such as {5,6},
7552       {5,}, or {5,10}). We skip over an EXACT item; if the length of what
7553       remains is greater than zero, there's a further opcode that can be
7554       handled. If not, do nothing, leaving the EXACT alone. */
7555 
7556       switch(*tempcode)
7557         {
7558         case OP_TYPEEXACT:
7559         tempcode += PRIV(OP_lengths)[*tempcode] +
7560           ((tempcode[1 + IMM2_SIZE] == OP_PROP
7561           || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
7562         break;
7563 
7564         /* CHAR opcodes are used for exacts whose count is 1. */
7565 
7566         case OP_CHAR:
7567         case OP_CHARI:
7568         case OP_NOT:
7569         case OP_NOTI:
7570         case OP_EXACT:
7571         case OP_EXACTI:
7572         case OP_NOTEXACT:
7573         case OP_NOTEXACTI:
7574         tempcode += PRIV(OP_lengths)[*tempcode];
7575 #ifdef SUPPORT_UNICODE
7576         if (utf && HAS_EXTRALEN(tempcode[-1]))
7577           tempcode += GET_EXTRALEN(tempcode[-1]);
7578 #endif
7579         break;
7580 
7581         /* For the class opcodes, the repeat operator appears at the end;
7582         adjust tempcode to point to it. */
7583 
7584         case OP_CLASS:
7585         case OP_NCLASS:
7586         tempcode += 1 + 32/sizeof(PCRE2_UCHAR);
7587         break;
7588 
7589 #ifdef SUPPORT_WIDE_CHARS
7590         case OP_XCLASS:
7591         tempcode += GET(tempcode, 1);
7592         break;
7593 #endif
7594         }
7595 
7596       /* If tempcode is equal to code (which points to the end of the repeated
7597       item), it means we have skipped an EXACT item but there is no following
7598       QUERY, STAR, or UPTO; the value of len will be 0, and we do nothing. In
7599       all other cases, tempcode will be pointing to the repeat opcode, and will
7600       be less than code, so the value of len will be greater than 0. */
7601 
7602       len = (int)(code - tempcode);
7603       if (len > 0)
7604         {
7605         unsigned int repcode = *tempcode;
7606 
7607         /* There is a table for possessifying opcodes, all of which are less
7608         than OP_CALLOUT. A zero entry means there is no possessified version.
7609         */
7610 
7611         if (repcode < OP_CALLOUT && opcode_possessify[repcode] > 0)
7612           *tempcode = opcode_possessify[repcode];
7613 
7614         /* For opcodes without a special possessified version, wrap the item in
7615         ONCE brackets. */
7616 
7617         else
7618           {
7619           (void)memmove(tempcode + 1 + LINK_SIZE, tempcode, CU2BYTES(len));
7620           code += 1 + LINK_SIZE;
7621           len += 1 + LINK_SIZE;
7622           tempcode[0] = OP_ONCE;
7623           *code++ = OP_KET;
7624           PUTINC(code, 0, len);
7625           PUT(tempcode, 1, len);
7626           }
7627         }
7628       }
7629 
7630     /* We set the "follows varying string" flag for subsequently encountered
7631     reqcus if it isn't already set and we have just passed a varying length
7632     item. */
7633 
7634     END_REPEAT:
7635     cb->req_varyopt |= reqvary;
7636     break;
7637 
7638 
7639     /* ===================================================================*/
7640     /* Handle a 32-bit data character with a value greater than META_END. */
7641 
7642     case META_BIGVALUE:
7643     pptr++;
7644     goto NORMAL_CHAR;
7645 
7646 
7647     /* ===============================================================*/
7648     /* Handle a back reference by number, which is the meta argument. The
7649     pattern offsets for back references to group numbers less than 10 are held
7650     in a special vector, to avoid using more than two parsed pattern elements
7651     in 64-bit environments. We only need the offset to the first occurrence,
7652     because if that doesn't fail, subsequent ones will also be OK. */
7653 
7654     case META_BACKREF:
7655     if (meta_arg < 10) offset = cb->small_ref_offset[meta_arg];
7656       else GETPLUSOFFSET(offset, pptr);
7657 
7658     if (meta_arg > cb->bracount)
7659       {
7660       cb->erroroffset = offset;
7661       *errorcodeptr = ERR15;  /* Non-existent subpattern */
7662       return 0;
7663       }
7664 
7665     /* Come here from named backref handling when the reference is to a
7666     single group (that is, not to a duplicated name). The back reference
7667     data will have already been updated. We must disable firstcu if not
7668     set, to cope with cases like (?=(\w+))\1: which would otherwise set ':'
7669     later. */
7670 
7671     HANDLE_SINGLE_REFERENCE:
7672     if (firstcuflags == REQ_UNSET) zerofirstcuflags = firstcuflags = REQ_NONE;
7673     *code++ = ((options & PCRE2_CASELESS) != 0)? OP_REFI : OP_REF;
7674     PUT2INC(code, 0, meta_arg);
7675 
7676     /* Update the map of back references, and keep the highest one. We
7677     could do this in parse_regex() for numerical back references, but not
7678     for named back references, because we don't know the numbers to which
7679     named back references refer. So we do it all in this function. */
7680 
7681     cb->backref_map |= (meta_arg < 32)? (1u << meta_arg) : 1;
7682     if (meta_arg > cb->top_backref) cb->top_backref = meta_arg;
7683     break;
7684 
7685 
7686     /* ===============================================================*/
7687     /* Handle recursion by inserting the number of the called group (which is
7688     the meta argument) after OP_RECURSE. At the end of compiling the pattern is
7689     scanned and these numbers are replaced by offsets within the pattern. It is
7690     done like this to avoid problems with forward references and adjusting
7691     offsets when groups are duplicated and moved (as discovered in previous
7692     implementations). Note that a recursion does not have a set first
7693     character. */
7694 
7695     case META_RECURSE:
7696     GETPLUSOFFSET(offset, pptr);
7697     if (meta_arg > cb->bracount)
7698       {
7699       cb->erroroffset = offset;
7700       *errorcodeptr = ERR15;  /* Non-existent subpattern */
7701       return 0;
7702       }
7703     HANDLE_NUMERICAL_RECURSION:
7704     *code = OP_RECURSE;
7705     PUT(code, 1, meta_arg);
7706     code += 1 + LINK_SIZE;
7707     groupsetfirstcu = FALSE;
7708     cb->had_recurse = TRUE;
7709     if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
7710     zerofirstcu = firstcu;
7711     zerofirstcuflags = firstcuflags;
7712     break;
7713 
7714 
7715     /* ===============================================================*/
7716     /* Handle capturing parentheses; the number is the meta argument. */
7717 
7718     case META_CAPTURE:
7719     bravalue = OP_CBRA;
7720     skipunits = IMM2_SIZE;
7721     PUT2(code, 1+LINK_SIZE, meta_arg);
7722     cb->lastcapture = meta_arg;
7723     goto GROUP_PROCESS_NOTE_EMPTY;
7724 
7725 
7726     /* ===============================================================*/
7727     /* Handle escape sequence items. For ones like \d, the ESC_values are
7728     arranged to be the same as the corresponding OP_values in the default case
7729     when PCRE2_UCP is not set (which is the only case in which they will appear
7730     here).
7731 
7732     Note: \Q and \E are never seen here, as they were dealt with in
7733     parse_regex(). Neither are numerical back references or recursions, which
7734     were turned into META_BACKREF or META_RECURSE items, respectively. \k and
7735     \g, when followed by names, are turned into META_BACKREF_BYNAME or
7736     META_RECURSE_BYNAME. */
7737 
7738     case META_ESCAPE:
7739 
7740     /* We can test for escape sequences that consume a character because their
7741     values lie between ESC_b and ESC_Z; this may have to change if any new ones
7742     are ever created. For these sequences, we disable the setting of a first
7743     character if it hasn't already been set. */
7744 
7745     if (meta_arg > ESC_b && meta_arg < ESC_Z)
7746       {
7747       matched_char = TRUE;
7748       if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
7749       }
7750 
7751     /* Set values to reset to if this is followed by a zero repeat. */
7752 
7753     zerofirstcu = firstcu;
7754     zerofirstcuflags = firstcuflags;
7755     zeroreqcu = reqcu;
7756     zeroreqcuflags = reqcuflags;
7757 
7758     /* If Unicode is not supported, \P and \p are not allowed and are
7759     faulted at parse time, so will never appear here. */
7760 
7761 #ifdef SUPPORT_UNICODE
7762     if (meta_arg == ESC_P || meta_arg == ESC_p)
7763       {
7764       uint32_t ptype = *(++pptr) >> 16;
7765       uint32_t pdata = *pptr & 0xffff;
7766 
7767       /* The special case of \p{Any} is compiled to OP_ALLANY so as to benefit
7768       from the auto-anchoring code. */
7769 
7770       if (meta_arg == ESC_p && ptype == PT_ANY)
7771         {
7772         *code++ = OP_ALLANY;
7773         }
7774       else
7775         {
7776         *code++ = (meta_arg == ESC_p)? OP_PROP : OP_NOTPROP;
7777         *code++ = ptype;
7778         *code++ = pdata;
7779         }
7780       break;  /* End META_ESCAPE */
7781       }
7782 #endif
7783 
7784     /* For the rest (including \X when Unicode is supported - if not it's
7785     faulted at parse time), the OP value is the escape value when PCRE2_UCP is
7786     not set; if it is set, these escapes do not show up here because they are
7787     converted into Unicode property tests in parse_regex(). Note that \b and \B
7788     do a one-character lookbehind, and \A also behaves as if it does. */
7789 
7790     if (meta_arg == ESC_C) cb->external_flags |= PCRE2_HASBKC; /* Record */
7791     if ((meta_arg == ESC_b || meta_arg == ESC_B || meta_arg == ESC_A) &&
7792          cb->max_lookbehind == 0)
7793       cb->max_lookbehind = 1;
7794 
7795     /* In non-UTF mode, and for both 32-bit modes, we turn \C into OP_ALLANY
7796     instead of OP_ANYBYTE so that it works in DFA mode and in lookbehinds. */
7797 
7798 #if PCRE2_CODE_UNIT_WIDTH == 32
7799     *code++ = (meta_arg == ESC_C)? OP_ALLANY : meta_arg;
7800 #else
7801     *code++ = (!utf && meta_arg == ESC_C)? OP_ALLANY : meta_arg;
7802 #endif
7803     break;  /* End META_ESCAPE */
7804 
7805 
7806     /* ===================================================================*/
7807     /* Handle an unrecognized meta value. A parsed pattern value less than
7808     META_END is a literal. Otherwise we have a problem. */
7809 
7810     default:
7811     if (meta >= META_END)
7812       {
7813 #ifdef DEBUG_SHOW_PARSED
7814       fprintf(stderr, "** Unrecognized parsed pattern item 0x%.8x\n", *pptr);
7815 #endif
7816       *errorcodeptr = ERR89;  /* Internal error - unrecognized. */
7817       return 0;
7818       }
7819 
7820     /* Handle a literal character. We come here by goto in the case of a
7821     32-bit, non-UTF character whose value is greater than META_END. */
7822 
7823     NORMAL_CHAR:
7824     meta = *pptr;     /* Get the full 32 bits */
7825     NORMAL_CHAR_SET:  /* Character is already in meta */
7826     matched_char = TRUE;
7827 
7828     /* For caseless UTF or UCP mode, check whether this character has more than
7829     one other case. If so, generate a special OP_PROP item instead of OP_CHARI.
7830     */
7831 
7832 #ifdef SUPPORT_UNICODE
7833     if ((utf||ucp) && (options & PCRE2_CASELESS) != 0)
7834       {
7835       uint32_t caseset = UCD_CASESET(meta);
7836       if (caseset != 0)
7837         {
7838         *code++ = OP_PROP;
7839         *code++ = PT_CLIST;
7840         *code++ = caseset;
7841         if (firstcuflags == REQ_UNSET)
7842           firstcuflags = zerofirstcuflags = REQ_NONE;
7843         break;  /* End handling this meta item */
7844         }
7845       }
7846 #endif
7847 
7848     /* Caseful matches, or caseless and not one of the multicase characters. We
7849     come here by goto in the case of a positive class that contains only
7850     case-partners of a character with just two cases; matched_char has already
7851     been set TRUE and options fudged if necessary. */
7852 
7853     CLASS_CASELESS_CHAR:
7854 
7855     /* Get the character's code units into mcbuffer, with the length in
7856     mclength. When not in UTF mode, the length is always 1. */
7857 
7858 #ifdef SUPPORT_UNICODE
7859     if (utf) mclength = PRIV(ord2utf)(meta, mcbuffer); else
7860 #endif
7861       {
7862       mclength = 1;
7863       mcbuffer[0] = meta;
7864       }
7865 
7866     /* Generate the appropriate code */
7867 
7868     *code++ = ((options & PCRE2_CASELESS) != 0)? OP_CHARI : OP_CHAR;
7869     memcpy(code, mcbuffer, CU2BYTES(mclength));
7870     code += mclength;
7871 
7872     /* Remember if \r or \n were seen */
7873 
7874     if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL)
7875       cb->external_flags |= PCRE2_HASCRORLF;
7876 
7877     /* Set the first and required code units appropriately. If no previous
7878     first code unit, set it from this character, but revert to none on a zero
7879     repeat. Otherwise, leave the firstcu value alone, and don't change it on
7880     a zero repeat. */
7881 
7882     if (firstcuflags == REQ_UNSET)
7883       {
7884       zerofirstcuflags = REQ_NONE;
7885       zeroreqcu = reqcu;
7886       zeroreqcuflags = reqcuflags;
7887 
7888       /* If the character is more than one code unit long, we can set a single
7889       firstcu only if it is not to be matched caselessly. Multiple possible
7890       starting code units may be picked up later in the studying code. */
7891 
7892       if (mclength == 1 || req_caseopt == 0)
7893         {
7894         firstcu = mcbuffer[0];
7895         firstcuflags = req_caseopt;
7896         if (mclength != 1)
7897           {
7898           reqcu = code[-1];
7899           reqcuflags = cb->req_varyopt;
7900           }
7901         }
7902       else firstcuflags = reqcuflags = REQ_NONE;
7903       }
7904 
7905     /* firstcu was previously set; we can set reqcu only if the length is
7906     1 or the matching is caseful. */
7907 
7908     else
7909       {
7910       zerofirstcu = firstcu;
7911       zerofirstcuflags = firstcuflags;
7912       zeroreqcu = reqcu;
7913       zeroreqcuflags = reqcuflags;
7914       if (mclength == 1 || req_caseopt == 0)
7915         {
7916         reqcu = code[-1];
7917         reqcuflags = req_caseopt | cb->req_varyopt;
7918         }
7919       }
7920 
7921     /* If caselessness was temporarily instated, reset it. */
7922 
7923     if (reset_caseful)
7924       {
7925       options &= ~PCRE2_CASELESS;
7926       req_caseopt = 0;
7927       reset_caseful = FALSE;
7928       }
7929 
7930     break;    /* End literal character handling */
7931     }         /* End of big switch */
7932   }           /* End of big loop */
7933 
7934 /* Control never reaches here. */
7935 }
7936 
7937 
7938 
7939 /*************************************************
7940 *   Compile regex: a sequence of alternatives    *
7941 *************************************************/
7942 
7943 /* On entry, pptr is pointing past the bracket meta, but on return it points to
7944 the closing bracket or META_END. The code variable is pointing at the code unit
7945 into which the BRA operator has been stored. This function is used during the
7946 pre-compile phase when we are trying to find out the amount of memory needed,
7947 as well as during the real compile phase. The value of lengthptr distinguishes
7948 the two phases.
7949 
7950 Arguments:
7951   options           option bits, including any changes for this subpattern
7952   codeptr           -> the address of the current code pointer
7953   pptrptr           -> the address of the current parsed pattern pointer
7954   errorcodeptr      -> pointer to error code variable
7955   skipunits         skip this many code units at start (for brackets and OP_COND)
7956   firstcuptr        place to put the first required code unit
7957   firstcuflagsptr   place to put the first code unit flags, or a negative number
7958   reqcuptr          place to put the last required code unit
7959   reqcuflagsptr     place to put the last required code unit flags, or a negative number
7960   bcptr             pointer to the chain of currently open branches
7961   cb                points to the data block with tables pointers etc.
7962   lengthptr         NULL during the real compile phase
7963                     points to length accumulator during pre-compile phase
7964 
7965 Returns:            0 There has been an error
7966                    +1 Success, this group must match at least one character
7967                    -1 Success, this group may match an empty string
7968 */
7969 
7970 static int
compile_regex(uint32_t options,PCRE2_UCHAR ** codeptr,uint32_t ** pptrptr,int * errorcodeptr,uint32_t skipunits,uint32_t * firstcuptr,int32_t * firstcuflagsptr,uint32_t * reqcuptr,int32_t * reqcuflagsptr,branch_chain * bcptr,compile_block * cb,PCRE2_SIZE * lengthptr)7971 compile_regex(uint32_t options, PCRE2_UCHAR **codeptr, uint32_t **pptrptr,
7972   int *errorcodeptr, uint32_t skipunits, uint32_t *firstcuptr,
7973   int32_t *firstcuflagsptr, uint32_t *reqcuptr,int32_t *reqcuflagsptr,
7974   branch_chain *bcptr, compile_block *cb, PCRE2_SIZE *lengthptr)
7975 {
7976 PCRE2_UCHAR *code = *codeptr;
7977 PCRE2_UCHAR *last_branch = code;
7978 PCRE2_UCHAR *start_bracket = code;
7979 BOOL lookbehind;
7980 open_capitem capitem;
7981 int capnumber = 0;
7982 int okreturn = 1;
7983 uint32_t *pptr = *pptrptr;
7984 uint32_t firstcu, reqcu;
7985 uint32_t lookbehindlength;
7986 int32_t firstcuflags, reqcuflags;
7987 uint32_t branchfirstcu, branchreqcu;
7988 int32_t branchfirstcuflags, branchreqcuflags;
7989 PCRE2_SIZE length;
7990 branch_chain bc;
7991 
7992 /* If set, call the external function that checks for stack availability. */
7993 
7994 if (cb->cx->stack_guard != NULL &&
7995     cb->cx->stack_guard(cb->parens_depth, cb->cx->stack_guard_data))
7996   {
7997   *errorcodeptr= ERR33;
7998   return 0;
7999   }
8000 
8001 /* Miscellaneous initialization */
8002 
8003 bc.outer = bcptr;
8004 bc.current_branch = code;
8005 
8006 firstcu = reqcu = 0;
8007 firstcuflags = reqcuflags = REQ_UNSET;
8008 
8009 /* Accumulate the length for use in the pre-compile phase. Start with the
8010 length of the BRA and KET and any extra code units that are required at the
8011 beginning. We accumulate in a local variable to save frequent testing of
8012 lengthptr for NULL. We cannot do this by looking at the value of 'code' at the
8013 start and end of each alternative, because compiled items are discarded during
8014 the pre-compile phase so that the workspace is not exceeded. */
8015 
8016 length = 2 + 2*LINK_SIZE + skipunits;
8017 
8018 /* Remember if this is a lookbehind assertion, and if it is, save its length
8019 and skip over the pattern offset. */
8020 
8021 lookbehind = *code == OP_ASSERTBACK ||
8022              *code == OP_ASSERTBACK_NOT ||
8023              *code == OP_ASSERTBACK_NA;
8024 
8025 if (lookbehind)
8026   {
8027   lookbehindlength = META_DATA(pptr[-1]);
8028   pptr += SIZEOFFSET;
8029   }
8030 else lookbehindlength = 0;
8031 
8032 /* If this is a capturing subpattern, add to the chain of open capturing items
8033 so that we can detect them if (*ACCEPT) is encountered. Note that only OP_CBRA
8034 need be tested here; changing this opcode to one of its variants, e.g.
8035 OP_SCBRAPOS, happens later, after the group has been compiled. */
8036 
8037 if (*code == OP_CBRA)
8038   {
8039   capnumber = GET2(code, 1 + LINK_SIZE);
8040   capitem.number = capnumber;
8041   capitem.next = cb->open_caps;
8042   capitem.assert_depth = cb->assert_depth;
8043   cb->open_caps = &capitem;
8044   }
8045 
8046 /* Offset is set zero to mark that this bracket is still open */
8047 
8048 PUT(code, 1, 0);
8049 code += 1 + LINK_SIZE + skipunits;
8050 
8051 /* Loop for each alternative branch */
8052 
8053 for (;;)
8054   {
8055   int branch_return;
8056 
8057   /* Insert OP_REVERSE if this is a lookbehind assertion. */
8058 
8059   if (lookbehind && lookbehindlength > 0)
8060     {
8061     *code++ = OP_REVERSE;
8062     PUTINC(code, 0, lookbehindlength);
8063     length += 1 + LINK_SIZE;
8064     }
8065 
8066   /* Now compile the branch; in the pre-compile phase its length gets added
8067   into the length. */
8068 
8069   if ((branch_return =
8070         compile_branch(&options, &code, &pptr, errorcodeptr, &branchfirstcu,
8071           &branchfirstcuflags, &branchreqcu, &branchreqcuflags, &bc,
8072           cb, (lengthptr == NULL)? NULL : &length)) == 0)
8073     return 0;
8074 
8075   /* If a branch can match an empty string, so can the whole group. */
8076 
8077   if (branch_return < 0) okreturn = -1;
8078 
8079   /* In the real compile phase, there is some post-processing to be done. */
8080 
8081   if (lengthptr == NULL)
8082     {
8083     /* If this is the first branch, the firstcu and reqcu values for the
8084     branch become the values for the regex. */
8085 
8086     if (*last_branch != OP_ALT)
8087       {
8088       firstcu = branchfirstcu;
8089       firstcuflags = branchfirstcuflags;
8090       reqcu = branchreqcu;
8091       reqcuflags = branchreqcuflags;
8092       }
8093 
8094     /* If this is not the first branch, the first char and reqcu have to
8095     match the values from all the previous branches, except that if the
8096     previous value for reqcu didn't have REQ_VARY set, it can still match,
8097     and we set REQ_VARY for the group from this branch's value. */
8098 
8099     else
8100       {
8101       /* If we previously had a firstcu, but it doesn't match the new branch,
8102       we have to abandon the firstcu for the regex, but if there was
8103       previously no reqcu, it takes on the value of the old firstcu. */
8104 
8105       if (firstcuflags != branchfirstcuflags || firstcu != branchfirstcu)
8106         {
8107         if (firstcuflags >= 0)
8108           {
8109           if (reqcuflags < 0)
8110             {
8111             reqcu = firstcu;
8112             reqcuflags = firstcuflags;
8113             }
8114           }
8115         firstcuflags = REQ_NONE;
8116         }
8117 
8118       /* If we (now or from before) have no firstcu, a firstcu from the
8119       branch becomes a reqcu if there isn't a branch reqcu. */
8120 
8121       if (firstcuflags < 0 && branchfirstcuflags >= 0 &&
8122           branchreqcuflags < 0)
8123         {
8124         branchreqcu = branchfirstcu;
8125         branchreqcuflags = branchfirstcuflags;
8126         }
8127 
8128       /* Now ensure that the reqcus match */
8129 
8130       if (((reqcuflags & ~REQ_VARY) != (branchreqcuflags & ~REQ_VARY)) ||
8131           reqcu != branchreqcu)
8132         reqcuflags = REQ_NONE;
8133       else
8134         {
8135         reqcu = branchreqcu;
8136         reqcuflags |= branchreqcuflags; /* To "or" REQ_VARY if present */
8137         }
8138       }
8139     }
8140 
8141   /* Handle reaching the end of the expression, either ')' or end of pattern.
8142   In the real compile phase, go back through the alternative branches and
8143   reverse the chain of offsets, with the field in the BRA item now becoming an
8144   offset to the first alternative. If there are no alternatives, it points to
8145   the end of the group. The length in the terminating ket is always the length
8146   of the whole bracketed item. Return leaving the pointer at the terminating
8147   char. */
8148 
8149   if (META_CODE(*pptr) != META_ALT)
8150     {
8151     if (lengthptr == NULL)
8152       {
8153       PCRE2_SIZE branch_length = code - last_branch;
8154       do
8155         {
8156         PCRE2_SIZE prev_length = GET(last_branch, 1);
8157         PUT(last_branch, 1, branch_length);
8158         branch_length = prev_length;
8159         last_branch -= branch_length;
8160         }
8161       while (branch_length > 0);
8162       }
8163 
8164     /* Fill in the ket */
8165 
8166     *code = OP_KET;
8167     PUT(code, 1, (int)(code - start_bracket));
8168     code += 1 + LINK_SIZE;
8169 
8170     /* If it was a capturing subpattern, remove the block from the chain. */
8171 
8172     if (capnumber > 0) cb->open_caps = cb->open_caps->next;
8173 
8174     /* Set values to pass back */
8175 
8176     *codeptr = code;
8177     *pptrptr = pptr;
8178     *firstcuptr = firstcu;
8179     *firstcuflagsptr = firstcuflags;
8180     *reqcuptr = reqcu;
8181     *reqcuflagsptr = reqcuflags;
8182     if (lengthptr != NULL)
8183       {
8184       if (OFLOW_MAX - *lengthptr < length)
8185         {
8186         *errorcodeptr = ERR20;
8187         return 0;
8188         }
8189       *lengthptr += length;
8190       }
8191     return okreturn;
8192     }
8193 
8194   /* Another branch follows. In the pre-compile phase, we can move the code
8195   pointer back to where it was for the start of the first branch. (That is,
8196   pretend that each branch is the only one.)
8197 
8198   In the real compile phase, insert an ALT node. Its length field points back
8199   to the previous branch while the bracket remains open. At the end the chain
8200   is reversed. It's done like this so that the start of the bracket has a
8201   zero offset until it is closed, making it possible to detect recursion. */
8202 
8203   if (lengthptr != NULL)
8204     {
8205     code = *codeptr + 1 + LINK_SIZE + skipunits;
8206     length += 1 + LINK_SIZE;
8207     }
8208   else
8209     {
8210     *code = OP_ALT;
8211     PUT(code, 1, (int)(code - last_branch));
8212     bc.current_branch = last_branch = code;
8213     code += 1 + LINK_SIZE;
8214     }
8215 
8216   /* Set the lookbehind length (if not in a lookbehind the value will be zero)
8217   and then advance past the vertical bar. */
8218 
8219   lookbehindlength = META_DATA(*pptr);
8220   pptr++;
8221   }
8222 /* Control never reaches here */
8223 }
8224 
8225 
8226 
8227 /*************************************************
8228 *          Check for anchored pattern            *
8229 *************************************************/
8230 
8231 /* Try to find out if this is an anchored regular expression. Consider each
8232 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
8233 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
8234 it's anchored. However, if this is a multiline pattern, then only OP_SOD will
8235 be found, because ^ generates OP_CIRCM in that mode.
8236 
8237 We can also consider a regex to be anchored if OP_SOM starts all its branches.
8238 This is the code for \G, which means "match at start of match position, taking
8239 into account the match offset".
8240 
8241 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
8242 because that will try the rest of the pattern at all possible matching points,
8243 so there is no point trying again.... er ....
8244 
8245 .... except when the .* appears inside capturing parentheses, and there is a
8246 subsequent back reference to those parentheses. We haven't enough information
8247 to catch that case precisely.
8248 
8249 At first, the best we could do was to detect when .* was in capturing brackets
8250 and the highest back reference was greater than or equal to that level.
8251 However, by keeping a bitmap of the first 31 back references, we can catch some
8252 of the more common cases more precisely.
8253 
8254 ... A second exception is when the .* appears inside an atomic group, because
8255 this prevents the number of characters it matches from being adjusted.
8256 
8257 Arguments:
8258   code           points to start of the compiled pattern
8259   bracket_map    a bitmap of which brackets we are inside while testing; this
8260                    handles up to substring 31; after that we just have to take
8261                    the less precise approach
8262   cb             points to the compile data block
8263   atomcount      atomic group level
8264   inassert       TRUE if in an assertion
8265 
8266 Returns:     TRUE or FALSE
8267 */
8268 
8269 static BOOL
is_anchored(PCRE2_SPTR code,unsigned int bracket_map,compile_block * cb,int atomcount,BOOL inassert)8270 is_anchored(PCRE2_SPTR code, unsigned int bracket_map, compile_block *cb,
8271   int atomcount, BOOL inassert)
8272 {
8273 do {
8274    PCRE2_SPTR scode = first_significant_code(
8275      code + PRIV(OP_lengths)[*code], FALSE);
8276    int op = *scode;
8277 
8278    /* Non-capturing brackets */
8279 
8280    if (op == OP_BRA  || op == OP_BRAPOS ||
8281        op == OP_SBRA || op == OP_SBRAPOS)
8282      {
8283      if (!is_anchored(scode, bracket_map, cb, atomcount, inassert))
8284        return FALSE;
8285      }
8286 
8287    /* Capturing brackets */
8288 
8289    else if (op == OP_CBRA  || op == OP_CBRAPOS ||
8290             op == OP_SCBRA || op == OP_SCBRAPOS)
8291      {
8292      int n = GET2(scode, 1+LINK_SIZE);
8293      int new_map = bracket_map | ((n < 32)? (1u << n) : 1);
8294      if (!is_anchored(scode, new_map, cb, atomcount, inassert)) return FALSE;
8295      }
8296 
8297    /* Positive forward assertion */
8298 
8299    else if (op == OP_ASSERT || op == OP_ASSERT_NA)
8300      {
8301      if (!is_anchored(scode, bracket_map, cb, atomcount, TRUE)) return FALSE;
8302      }
8303 
8304    /* Condition. If there is no second branch, it can't be anchored. */
8305 
8306    else if (op == OP_COND || op == OP_SCOND)
8307      {
8308      if (scode[GET(scode,1)] != OP_ALT) return FALSE;
8309      if (!is_anchored(scode, bracket_map, cb, atomcount, inassert))
8310        return FALSE;
8311      }
8312 
8313    /* Atomic groups */
8314 
8315    else if (op == OP_ONCE)
8316      {
8317      if (!is_anchored(scode, bracket_map, cb, atomcount + 1, inassert))
8318        return FALSE;
8319      }
8320 
8321    /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
8322    it isn't in brackets that are or may be referenced or inside an atomic
8323    group or an assertion. Also the pattern must not contain *PRUNE or *SKIP,
8324    because these break the feature. Consider, for example, /(?s).*?(*PRUNE)b/
8325    with the subject "aab", which matches "b", i.e. not at the start of a line.
8326    There is also an option that disables auto-anchoring. */
8327 
8328    else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
8329              op == OP_TYPEPOSSTAR))
8330      {
8331      if (scode[1] != OP_ALLANY || (bracket_map & cb->backref_map) != 0 ||
8332          atomcount > 0 || cb->had_pruneorskip || inassert ||
8333          (cb->external_options & PCRE2_NO_DOTSTAR_ANCHOR) != 0)
8334        return FALSE;
8335      }
8336 
8337    /* Check for explicit anchoring */
8338 
8339    else if (op != OP_SOD && op != OP_SOM && op != OP_CIRC) return FALSE;
8340 
8341    code += GET(code, 1);
8342    }
8343 while (*code == OP_ALT);   /* Loop for each alternative */
8344 return TRUE;
8345 }
8346 
8347 
8348 
8349 /*************************************************
8350 *         Check for starting with ^ or .*        *
8351 *************************************************/
8352 
8353 /* This is called to find out if every branch starts with ^ or .* so that
8354 "first char" processing can be done to speed things up in multiline
8355 matching and for non-DOTALL patterns that start with .* (which must start at
8356 the beginning or after \n). As in the case of is_anchored() (see above), we
8357 have to take account of back references to capturing brackets that contain .*
8358 because in that case we can't make the assumption. Also, the appearance of .*
8359 inside atomic brackets or in an assertion, or in a pattern that contains *PRUNE
8360 or *SKIP does not count, because once again the assumption no longer holds.
8361 
8362 Arguments:
8363   code           points to start of the compiled pattern or a group
8364   bracket_map    a bitmap of which brackets we are inside while testing; this
8365                    handles up to substring 31; after that we just have to take
8366                    the less precise approach
8367   cb             points to the compile data
8368   atomcount      atomic group level
8369   inassert       TRUE if in an assertion
8370 
8371 Returns:         TRUE or FALSE
8372 */
8373 
8374 static BOOL
is_startline(PCRE2_SPTR code,unsigned int bracket_map,compile_block * cb,int atomcount,BOOL inassert)8375 is_startline(PCRE2_SPTR code, unsigned int bracket_map, compile_block *cb,
8376   int atomcount, BOOL inassert)
8377 {
8378 do {
8379    PCRE2_SPTR scode = first_significant_code(
8380      code + PRIV(OP_lengths)[*code], FALSE);
8381    int op = *scode;
8382 
8383    /* If we are at the start of a conditional assertion group, *both* the
8384    conditional assertion *and* what follows the condition must satisfy the test
8385    for start of line. Other kinds of condition fail. Note that there may be an
8386    auto-callout at the start of a condition. */
8387 
8388    if (op == OP_COND)
8389      {
8390      scode += 1 + LINK_SIZE;
8391 
8392      if (*scode == OP_CALLOUT) scode += PRIV(OP_lengths)[OP_CALLOUT];
8393        else if (*scode == OP_CALLOUT_STR) scode += GET(scode, 1 + 2*LINK_SIZE);
8394 
8395      switch (*scode)
8396        {
8397        case OP_CREF:
8398        case OP_DNCREF:
8399        case OP_RREF:
8400        case OP_DNRREF:
8401        case OP_FAIL:
8402        case OP_FALSE:
8403        case OP_TRUE:
8404        return FALSE;
8405 
8406        default:     /* Assertion */
8407        if (!is_startline(scode, bracket_map, cb, atomcount, TRUE)) return FALSE;
8408        do scode += GET(scode, 1); while (*scode == OP_ALT);
8409        scode += 1 + LINK_SIZE;
8410        break;
8411        }
8412      scode = first_significant_code(scode, FALSE);
8413      op = *scode;
8414      }
8415 
8416    /* Non-capturing brackets */
8417 
8418    if (op == OP_BRA  || op == OP_BRAPOS ||
8419        op == OP_SBRA || op == OP_SBRAPOS)
8420      {
8421      if (!is_startline(scode, bracket_map, cb, atomcount, inassert))
8422        return FALSE;
8423      }
8424 
8425    /* Capturing brackets */
8426 
8427    else if (op == OP_CBRA  || op == OP_CBRAPOS ||
8428             op == OP_SCBRA || op == OP_SCBRAPOS)
8429      {
8430      int n = GET2(scode, 1+LINK_SIZE);
8431      int new_map = bracket_map | ((n < 32)? (1u << n) : 1);
8432      if (!is_startline(scode, new_map, cb, atomcount, inassert)) return FALSE;
8433      }
8434 
8435    /* Positive forward assertions */
8436 
8437    else if (op == OP_ASSERT || op == OP_ASSERT_NA)
8438      {
8439      if (!is_startline(scode, bracket_map, cb, atomcount, TRUE))
8440        return FALSE;
8441      }
8442 
8443    /* Atomic brackets */
8444 
8445    else if (op == OP_ONCE)
8446      {
8447      if (!is_startline(scode, bracket_map, cb, atomcount + 1, inassert))
8448        return FALSE;
8449      }
8450 
8451    /* .* means "start at start or after \n" if it isn't in atomic brackets or
8452    brackets that may be referenced or an assertion, and as long as the pattern
8453    does not contain *PRUNE or *SKIP, because these break the feature. Consider,
8454    for example, /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab",
8455    i.e. not at the start of a line. There is also an option that disables this
8456    optimization. */
8457 
8458    else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
8459      {
8460      if (scode[1] != OP_ANY || (bracket_map & cb->backref_map) != 0 ||
8461          atomcount > 0 || cb->had_pruneorskip || inassert ||
8462          (cb->external_options & PCRE2_NO_DOTSTAR_ANCHOR) != 0)
8463        return FALSE;
8464      }
8465 
8466    /* Check for explicit circumflex; anything else gives a FALSE result. Note
8467    in particular that this includes atomic brackets OP_ONCE because the number
8468    of characters matched by .* cannot be adjusted inside them. */
8469 
8470    else if (op != OP_CIRC && op != OP_CIRCM) return FALSE;
8471 
8472    /* Move on to the next alternative */
8473 
8474    code += GET(code, 1);
8475    }
8476 while (*code == OP_ALT);  /* Loop for each alternative */
8477 return TRUE;
8478 }
8479 
8480 
8481 
8482 /*************************************************
8483 *   Scan compiled regex for recursion reference  *
8484 *************************************************/
8485 
8486 /* This function scans through a compiled pattern until it finds an instance of
8487 OP_RECURSE.
8488 
8489 Arguments:
8490   code        points to start of expression
8491   utf         TRUE in UTF mode
8492 
8493 Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found
8494 */
8495 
8496 static PCRE2_SPTR
find_recurse(PCRE2_SPTR code,BOOL utf)8497 find_recurse(PCRE2_SPTR code, BOOL utf)
8498 {
8499 for (;;)
8500   {
8501   PCRE2_UCHAR c = *code;
8502   if (c == OP_END) return NULL;
8503   if (c == OP_RECURSE) return code;
8504 
8505   /* XCLASS is used for classes that cannot be represented just by a bit map.
8506   This includes negated single high-valued characters. CALLOUT_STR is used for
8507   callouts with string arguments. In both cases the length in the table is
8508   zero; the actual length is stored in the compiled code. */
8509 
8510   if (c == OP_XCLASS) code += GET(code, 1);
8511     else if (c == OP_CALLOUT_STR) code += GET(code, 1 + 2*LINK_SIZE);
8512 
8513   /* Otherwise, we can get the item's length from the table, except that for
8514   repeated character types, we have to test for \p and \P, which have an extra
8515   two code units of parameters, and for MARK/PRUNE/SKIP/THEN with an argument,
8516   we must add in its length. */
8517 
8518   else
8519     {
8520     switch(c)
8521       {
8522       case OP_TYPESTAR:
8523       case OP_TYPEMINSTAR:
8524       case OP_TYPEPLUS:
8525       case OP_TYPEMINPLUS:
8526       case OP_TYPEQUERY:
8527       case OP_TYPEMINQUERY:
8528       case OP_TYPEPOSSTAR:
8529       case OP_TYPEPOSPLUS:
8530       case OP_TYPEPOSQUERY:
8531       if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
8532       break;
8533 
8534       case OP_TYPEPOSUPTO:
8535       case OP_TYPEUPTO:
8536       case OP_TYPEMINUPTO:
8537       case OP_TYPEEXACT:
8538       if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
8539         code += 2;
8540       break;
8541 
8542       case OP_MARK:
8543       case OP_COMMIT_ARG:
8544       case OP_PRUNE_ARG:
8545       case OP_SKIP_ARG:
8546       case OP_THEN_ARG:
8547       code += code[1];
8548       break;
8549       }
8550 
8551     /* Add in the fixed length from the table */
8552 
8553     code += PRIV(OP_lengths)[c];
8554 
8555     /* In UTF-8 and UTF-16 modes, opcodes that are followed by a character may
8556     be followed by a multi-unit character. The length in the table is a
8557     minimum, so we have to arrange to skip the extra units. */
8558 
8559 #ifdef MAYBE_UTF_MULTI
8560     if (utf) switch(c)
8561       {
8562       case OP_CHAR:
8563       case OP_CHARI:
8564       case OP_NOT:
8565       case OP_NOTI:
8566       case OP_EXACT:
8567       case OP_EXACTI:
8568       case OP_NOTEXACT:
8569       case OP_NOTEXACTI:
8570       case OP_UPTO:
8571       case OP_UPTOI:
8572       case OP_NOTUPTO:
8573       case OP_NOTUPTOI:
8574       case OP_MINUPTO:
8575       case OP_MINUPTOI:
8576       case OP_NOTMINUPTO:
8577       case OP_NOTMINUPTOI:
8578       case OP_POSUPTO:
8579       case OP_POSUPTOI:
8580       case OP_NOTPOSUPTO:
8581       case OP_NOTPOSUPTOI:
8582       case OP_STAR:
8583       case OP_STARI:
8584       case OP_NOTSTAR:
8585       case OP_NOTSTARI:
8586       case OP_MINSTAR:
8587       case OP_MINSTARI:
8588       case OP_NOTMINSTAR:
8589       case OP_NOTMINSTARI:
8590       case OP_POSSTAR:
8591       case OP_POSSTARI:
8592       case OP_NOTPOSSTAR:
8593       case OP_NOTPOSSTARI:
8594       case OP_PLUS:
8595       case OP_PLUSI:
8596       case OP_NOTPLUS:
8597       case OP_NOTPLUSI:
8598       case OP_MINPLUS:
8599       case OP_MINPLUSI:
8600       case OP_NOTMINPLUS:
8601       case OP_NOTMINPLUSI:
8602       case OP_POSPLUS:
8603       case OP_POSPLUSI:
8604       case OP_NOTPOSPLUS:
8605       case OP_NOTPOSPLUSI:
8606       case OP_QUERY:
8607       case OP_QUERYI:
8608       case OP_NOTQUERY:
8609       case OP_NOTQUERYI:
8610       case OP_MINQUERY:
8611       case OP_MINQUERYI:
8612       case OP_NOTMINQUERY:
8613       case OP_NOTMINQUERYI:
8614       case OP_POSQUERY:
8615       case OP_POSQUERYI:
8616       case OP_NOTPOSQUERY:
8617       case OP_NOTPOSQUERYI:
8618       if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
8619       break;
8620       }
8621 #else
8622     (void)(utf);  /* Keep compiler happy by referencing function argument */
8623 #endif  /* MAYBE_UTF_MULTI */
8624     }
8625   }
8626 }
8627 
8628 
8629 
8630 /*************************************************
8631 *    Check for asserted fixed first code unit    *
8632 *************************************************/
8633 
8634 /* During compilation, the "first code unit" settings from forward assertions
8635 are discarded, because they can cause conflicts with actual literals that
8636 follow. However, if we end up without a first code unit setting for an
8637 unanchored pattern, it is worth scanning the regex to see if there is an
8638 initial asserted first code unit. If all branches start with the same asserted
8639 code unit, or with a non-conditional bracket all of whose alternatives start
8640 with the same asserted code unit (recurse ad lib), then we return that code
8641 unit, with the flags set to zero or REQ_CASELESS; otherwise return zero with
8642 REQ_NONE in the flags.
8643 
8644 Arguments:
8645   code       points to start of compiled pattern
8646   flags      points to the first code unit flags
8647   inassert   non-zero if in an assertion
8648 
8649 Returns:     the fixed first code unit, or 0 with REQ_NONE in flags
8650 */
8651 
8652 static uint32_t
find_firstassertedcu(PCRE2_SPTR code,int32_t * flags,uint32_t inassert)8653 find_firstassertedcu(PCRE2_SPTR code, int32_t *flags, uint32_t inassert)
8654 {
8655 uint32_t c = 0;
8656 int cflags = REQ_NONE;
8657 
8658 *flags = REQ_NONE;
8659 do {
8660    uint32_t d;
8661    int dflags;
8662    int xl = (*code == OP_CBRA || *code == OP_SCBRA ||
8663              *code == OP_CBRAPOS || *code == OP_SCBRAPOS)? IMM2_SIZE:0;
8664    PCRE2_SPTR scode = first_significant_code(code + 1+LINK_SIZE + xl, TRUE);
8665    PCRE2_UCHAR op = *scode;
8666 
8667    switch(op)
8668      {
8669      default:
8670      return 0;
8671 
8672      case OP_BRA:
8673      case OP_BRAPOS:
8674      case OP_CBRA:
8675      case OP_SCBRA:
8676      case OP_CBRAPOS:
8677      case OP_SCBRAPOS:
8678      case OP_ASSERT:
8679      case OP_ASSERT_NA:
8680      case OP_ONCE:
8681      case OP_SCRIPT_RUN:
8682      d = find_firstassertedcu(scode, &dflags, inassert +
8683        ((op == OP_ASSERT || op == OP_ASSERT_NA)?1:0));
8684      if (dflags < 0)
8685        return 0;
8686      if (cflags < 0) { c = d; cflags = dflags; }
8687        else if (c != d || cflags != dflags) return 0;
8688      break;
8689 
8690      case OP_EXACT:
8691      scode += IMM2_SIZE;
8692      /* Fall through */
8693 
8694      case OP_CHAR:
8695      case OP_PLUS:
8696      case OP_MINPLUS:
8697      case OP_POSPLUS:
8698      if (inassert == 0) return 0;
8699      if (cflags < 0) { c = scode[1]; cflags = 0; }
8700        else if (c != scode[1]) return 0;
8701      break;
8702 
8703      case OP_EXACTI:
8704      scode += IMM2_SIZE;
8705      /* Fall through */
8706 
8707      case OP_CHARI:
8708      case OP_PLUSI:
8709      case OP_MINPLUSI:
8710      case OP_POSPLUSI:
8711      if (inassert == 0) return 0;
8712 
8713      /* If the character is more than one code unit long, we cannot set its
8714      first code unit when matching caselessly. Later scanning may pick up
8715      multiple code units. */
8716 
8717 #ifdef SUPPORT_UNICODE
8718 #if PCRE2_CODE_UNIT_WIDTH == 8
8719      if (scode[1] >= 0x80) return 0;
8720 #elif PCRE2_CODE_UNIT_WIDTH == 16
8721      if (scode[1] >= 0xd800 && scode[1] <= 0xdfff) return 0;
8722 #endif
8723 #endif
8724 
8725      if (cflags < 0) { c = scode[1]; cflags = REQ_CASELESS; }
8726        else if (c != scode[1]) return 0;
8727      break;
8728      }
8729 
8730    code += GET(code, 1);
8731    }
8732 while (*code == OP_ALT);
8733 
8734 *flags = cflags;
8735 return c;
8736 }
8737 
8738 
8739 
8740 /*************************************************
8741 *     Add an entry to the name/number table      *
8742 *************************************************/
8743 
8744 /* This function is called between compiling passes to add an entry to the
8745 name/number table, maintaining alphabetical order. Checking for permitted
8746 and forbidden duplicates has already been done.
8747 
8748 Arguments:
8749   cb           the compile data block
8750   name         the name to add
8751   length       the length of the name
8752   groupno      the group number
8753   tablecount   the count of names in the table so far
8754 
8755 Returns:       nothing
8756 */
8757 
8758 static void
add_name_to_table(compile_block * cb,PCRE2_SPTR name,int length,unsigned int groupno,uint32_t tablecount)8759 add_name_to_table(compile_block *cb, PCRE2_SPTR name, int length,
8760   unsigned int groupno, uint32_t tablecount)
8761 {
8762 uint32_t i;
8763 PCRE2_UCHAR *slot = cb->name_table;
8764 
8765 for (i = 0; i < tablecount; i++)
8766   {
8767   int crc = memcmp(name, slot+IMM2_SIZE, CU2BYTES(length));
8768   if (crc == 0 && slot[IMM2_SIZE+length] != 0)
8769     crc = -1; /* Current name is a substring */
8770 
8771   /* Make space in the table and break the loop for an earlier name. For a
8772   duplicate or later name, carry on. We do this for duplicates so that in the
8773   simple case (when ?(| is not used) they are in order of their numbers. In all
8774   cases they are in the order in which they appear in the pattern. */
8775 
8776   if (crc < 0)
8777     {
8778     (void)memmove(slot + cb->name_entry_size, slot,
8779       CU2BYTES((tablecount - i) * cb->name_entry_size));
8780     break;
8781     }
8782 
8783   /* Continue the loop for a later or duplicate name */
8784 
8785   slot += cb->name_entry_size;
8786   }
8787 
8788 PUT2(slot, 0, groupno);
8789 memcpy(slot + IMM2_SIZE, name, CU2BYTES(length));
8790 
8791 /* Add a terminating zero and fill the rest of the slot with zeroes so that
8792 the memory is all initialized. Otherwise valgrind moans about uninitialized
8793 memory when saving serialized compiled patterns. */
8794 
8795 memset(slot + IMM2_SIZE + length, 0,
8796   CU2BYTES(cb->name_entry_size - length - IMM2_SIZE));
8797 }
8798 
8799 
8800 
8801 /*************************************************
8802 *             Skip in parsed pattern             *
8803 *************************************************/
8804 
8805 /* This function is called to skip parts of the parsed pattern when finding the
8806 length of a lookbehind branch. It is called after (*ACCEPT) and (*FAIL) to find
8807 the end of the branch, it is called to skip over an internal lookaround or
8808 (DEFINE) group, and it is also called to skip to the end of a class, during
8809 which it will never encounter nested groups (but there's no need to have
8810 special code for that).
8811 
8812 When called to find the end of a branch or group, pptr must point to the first
8813 meta code inside the branch, not the branch-starting code. In other cases it
8814 can point to the item that causes the function to be called.
8815 
8816 Arguments:
8817   pptr       current pointer to skip from
8818   skiptype   PSKIP_CLASS when skipping to end of class
8819              PSKIP_ALT when META_ALT ends the skip
8820              PSKIP_KET when only META_KET ends the skip
8821 
8822 Returns:     new value of pptr
8823              NULL if META_END is reached - should never occur
8824                or for an unknown meta value - likewise
8825 */
8826 
8827 static uint32_t *
parsed_skip(uint32_t * pptr,uint32_t skiptype)8828 parsed_skip(uint32_t *pptr, uint32_t skiptype)
8829 {
8830 uint32_t nestlevel = 0;
8831 
8832 for (;; pptr++)
8833   {
8834   uint32_t meta = META_CODE(*pptr);
8835 
8836   switch(meta)
8837     {
8838     default:  /* Just skip over most items */
8839     if (meta < META_END) continue;  /* Literal */
8840     break;
8841 
8842     /* This should never occur. */
8843 
8844     case META_END:
8845     return NULL;
8846 
8847     /* The data for these items is variable in length. */
8848 
8849     case META_BACKREF:  /* Offset is present only if group >= 10 */
8850     if (META_DATA(*pptr) >= 10) pptr += SIZEOFFSET;
8851     break;
8852 
8853     case META_ESCAPE:   /* A few escapes are followed by data items. */
8854     switch (META_DATA(*pptr))
8855       {
8856       case ESC_P:
8857       case ESC_p:
8858       pptr += 1;
8859       break;
8860 
8861       case ESC_g:
8862       case ESC_k:
8863       pptr += 1 + SIZEOFFSET;
8864       break;
8865       }
8866     break;
8867 
8868     case META_MARK:     /* Add the length of the name. */
8869     case META_COMMIT_ARG:
8870     case META_PRUNE_ARG:
8871     case META_SKIP_ARG:
8872     case META_THEN_ARG:
8873     pptr += pptr[1];
8874     break;
8875 
8876     /* These are the "active" items in this loop. */
8877 
8878     case META_CLASS_END:
8879     if (skiptype == PSKIP_CLASS) return pptr;
8880     break;
8881 
8882     case META_ATOMIC:
8883     case META_CAPTURE:
8884     case META_COND_ASSERT:
8885     case META_COND_DEFINE:
8886     case META_COND_NAME:
8887     case META_COND_NUMBER:
8888     case META_COND_RNAME:
8889     case META_COND_RNUMBER:
8890     case META_COND_VERSION:
8891     case META_LOOKAHEAD:
8892     case META_LOOKAHEADNOT:
8893     case META_LOOKAHEAD_NA:
8894     case META_LOOKBEHIND:
8895     case META_LOOKBEHINDNOT:
8896     case META_LOOKBEHIND_NA:
8897     case META_NOCAPTURE:
8898     case META_SCRIPT_RUN:
8899     nestlevel++;
8900     break;
8901 
8902     case META_ALT:
8903     if (nestlevel == 0 && skiptype == PSKIP_ALT) return pptr;
8904     break;
8905 
8906     case META_KET:
8907     if (nestlevel == 0) return pptr;
8908     nestlevel--;
8909     break;
8910     }
8911 
8912   /* The extra data item length for each meta is in a table. */
8913 
8914   meta = (meta >> 16) & 0x7fff;
8915   if (meta >= sizeof(meta_extra_lengths)) return NULL;
8916   pptr += meta_extra_lengths[meta];
8917   }
8918 /* Control never reaches here */
8919 return pptr;
8920 }
8921 
8922 
8923 
8924 /*************************************************
8925 *       Find length of a parsed group            *
8926 *************************************************/
8927 
8928 /* This is called for nested groups within a branch of a lookbehind whose
8929 length is being computed. If all the branches in the nested group have the same
8930 length, that is OK. On entry, the pointer must be at the first element after
8931 the group initializing code. On exit it points to OP_KET. Caching is used to
8932 improve processing speed when the same capturing group occurs many times.
8933 
8934 Arguments:
8935   pptrptr     pointer to pointer in the parsed pattern
8936   isinline    FALSE if a reference or recursion; TRUE for inline group
8937   errcodeptr  pointer to the errorcode
8938   lcptr       pointer to the loop counter
8939   group       number of captured group or -1 for a non-capturing group
8940   recurses    chain of recurse_check to catch mutual recursion
8941   cb          pointer to the compile data
8942 
8943 Returns:      the group length or a negative number
8944 */
8945 
8946 static int
get_grouplength(uint32_t ** pptrptr,BOOL isinline,int * errcodeptr,int * lcptr,int group,parsed_recurse_check * recurses,compile_block * cb)8947 get_grouplength(uint32_t **pptrptr, BOOL isinline, int *errcodeptr, int *lcptr,
8948    int group, parsed_recurse_check *recurses, compile_block *cb)
8949 {
8950 int branchlength;
8951 int grouplength = -1;
8952 
8953 /* The cache can be used only if there is no possibility of there being two
8954 groups with the same number. We do not need to set the end pointer for a group
8955 that is being processed as a back reference or recursion, but we must do so for
8956 an inline group. */
8957 
8958 if (group > 0 && (cb->external_flags & PCRE2_DUPCAPUSED) == 0)
8959   {
8960   uint32_t groupinfo = cb->groupinfo[group];
8961   if ((groupinfo & GI_NOT_FIXED_LENGTH) != 0) return -1;
8962   if ((groupinfo & GI_SET_FIXED_LENGTH) != 0)
8963     {
8964     if (isinline) *pptrptr = parsed_skip(*pptrptr, PSKIP_KET);
8965     return groupinfo & GI_FIXED_LENGTH_MASK;
8966     }
8967   }
8968 
8969 /* Scan the group. In this case we find the end pointer of necessity. */
8970 
8971 for(;;)
8972   {
8973   branchlength = get_branchlength(pptrptr, errcodeptr, lcptr, recurses, cb);
8974   if (branchlength < 0) goto ISNOTFIXED;
8975   if (grouplength == -1) grouplength = branchlength;
8976     else if (grouplength != branchlength) goto ISNOTFIXED;
8977   if (**pptrptr == META_KET) break;
8978   *pptrptr += 1;   /* Skip META_ALT */
8979   }
8980 
8981 if (group > 0)
8982   cb->groupinfo[group] |= (uint32_t)(GI_SET_FIXED_LENGTH | grouplength);
8983 return grouplength;
8984 
8985 ISNOTFIXED:
8986 if (group > 0) cb->groupinfo[group] |= GI_NOT_FIXED_LENGTH;
8987 return -1;
8988 }
8989 
8990 
8991 
8992 /*************************************************
8993 *        Find length of a parsed branch          *
8994 *************************************************/
8995 
8996 /* Return a fixed length for a branch in a lookbehind, giving an error if the
8997 length is not fixed. On entry, *pptrptr points to the first element inside the
8998 branch. On exit it is set to point to the ALT or KET.
8999 
9000 Arguments:
9001   pptrptr     pointer to pointer in the parsed pattern
9002   errcodeptr  pointer to error code
9003   lcptr       pointer to loop counter
9004   recurses    chain of recurse_check to catch mutual recursion
9005   cb          pointer to compile block
9006 
9007 Returns:      the length, or a negative value on error
9008 */
9009 
9010 static int
get_branchlength(uint32_t ** pptrptr,int * errcodeptr,int * lcptr,parsed_recurse_check * recurses,compile_block * cb)9011 get_branchlength(uint32_t **pptrptr, int *errcodeptr, int *lcptr,
9012   parsed_recurse_check *recurses, compile_block *cb)
9013 {
9014 int branchlength = 0;
9015 int grouplength;
9016 uint32_t lastitemlength = 0;
9017 uint32_t *pptr = *pptrptr;
9018 PCRE2_SIZE offset;
9019 parsed_recurse_check this_recurse;
9020 
9021 /* A large and/or complex regex can take too long to process. This can happen
9022 more often when (?| groups are present in the pattern because their length
9023 cannot be cached. */
9024 
9025 if ((*lcptr)++ > 2000)
9026   {
9027   *errcodeptr = ERR35;  /* Lookbehind is too complicated */
9028   return -1;
9029   }
9030 
9031 /* Scan the branch, accumulating the length. */
9032 
9033 for (;; pptr++)
9034   {
9035   parsed_recurse_check *r;
9036   uint32_t *gptr, *gptrend;
9037   uint32_t escape;
9038   uint32_t group = 0;
9039   uint32_t itemlength = 0;
9040 
9041   if (*pptr < META_END)
9042     {
9043     itemlength = 1;
9044     }
9045 
9046   else switch (META_CODE(*pptr))
9047     {
9048     case META_KET:
9049     case META_ALT:
9050     goto EXIT;
9051 
9052     /* (*ACCEPT) and (*FAIL) terminate the branch, but we must skip to the
9053     actual termination. */
9054 
9055     case META_ACCEPT:
9056     case META_FAIL:
9057     pptr = parsed_skip(pptr, PSKIP_ALT);
9058     if (pptr == NULL) goto PARSED_SKIP_FAILED;
9059     goto EXIT;
9060 
9061     case META_MARK:
9062     case META_COMMIT_ARG:
9063     case META_PRUNE_ARG:
9064     case META_SKIP_ARG:
9065     case META_THEN_ARG:
9066     pptr += pptr[1] + 1;
9067     break;
9068 
9069     case META_CIRCUMFLEX:
9070     case META_COMMIT:
9071     case META_DOLLAR:
9072     case META_PRUNE:
9073     case META_SKIP:
9074     case META_THEN:
9075     break;
9076 
9077     case META_OPTIONS:
9078     pptr += 1;
9079     break;
9080 
9081     case META_BIGVALUE:
9082     itemlength = 1;
9083     pptr += 1;
9084     break;
9085 
9086     case META_CLASS:
9087     case META_CLASS_NOT:
9088     itemlength = 1;
9089     pptr = parsed_skip(pptr, PSKIP_CLASS);
9090     if (pptr == NULL) goto PARSED_SKIP_FAILED;
9091     break;
9092 
9093     case META_CLASS_EMPTY_NOT:
9094     case META_DOT:
9095     itemlength = 1;
9096     break;
9097 
9098     case META_CALLOUT_NUMBER:
9099     pptr += 3;
9100     break;
9101 
9102     case META_CALLOUT_STRING:
9103     pptr += 3 + SIZEOFFSET;
9104     break;
9105 
9106     /* Only some escapes consume a character. Of those, \R and \X are never
9107     allowed because they might match more than character. \C is allowed only in
9108     32-bit and non-UTF 8/16-bit modes. */
9109 
9110     case META_ESCAPE:
9111     escape = META_DATA(*pptr);
9112     if (escape == ESC_R || escape == ESC_X) return -1;
9113     if (escape > ESC_b && escape < ESC_Z)
9114       {
9115 #if PCRE2_CODE_UNIT_WIDTH != 32
9116       if ((cb->external_options & PCRE2_UTF) != 0 && escape == ESC_C)
9117         {
9118         *errcodeptr = ERR36;
9119         return -1;
9120         }
9121 #endif
9122       itemlength = 1;
9123       if (escape == ESC_p || escape == ESC_P) pptr++;  /* Skip prop data */
9124       }
9125     break;
9126 
9127     /* Lookaheads do not contribute to the length of this branch, but they may
9128     contain lookbehinds within them whose lengths need to be set. */
9129 
9130     case META_LOOKAHEAD:
9131     case META_LOOKAHEADNOT:
9132     case META_LOOKAHEAD_NA:
9133     *errcodeptr = check_lookbehinds(pptr + 1, &pptr, recurses, cb);
9134     if (*errcodeptr != 0) return -1;
9135 
9136     /* Ignore any qualifiers that follow a lookahead assertion. */
9137 
9138     switch (pptr[1])
9139       {
9140       case META_ASTERISK:
9141       case META_ASTERISK_PLUS:
9142       case META_ASTERISK_QUERY:
9143       case META_PLUS:
9144       case META_PLUS_PLUS:
9145       case META_PLUS_QUERY:
9146       case META_QUERY:
9147       case META_QUERY_PLUS:
9148       case META_QUERY_QUERY:
9149       pptr++;
9150       break;
9151 
9152       case META_MINMAX:
9153       case META_MINMAX_PLUS:
9154       case META_MINMAX_QUERY:
9155       pptr += 3;
9156       break;
9157 
9158       default:
9159       break;
9160       }
9161     break;
9162 
9163     /* A nested lookbehind does not contribute any length to this lookbehind,
9164     but must itself be checked and have its lengths set. */
9165 
9166     case META_LOOKBEHIND:
9167     case META_LOOKBEHINDNOT:
9168     case META_LOOKBEHIND_NA:
9169     if (!set_lookbehind_lengths(&pptr, errcodeptr, lcptr, recurses, cb))
9170       return -1;
9171     break;
9172 
9173     /* Back references and recursions are handled by very similar code. At this
9174     stage, the names generated in the parsing pass are available, but the main
9175     name table has not yet been created. So for the named varieties, scan the
9176     list of names in order to get the number of the first one in the pattern,
9177     and whether or not this name is duplicated. */
9178 
9179     case META_BACKREF_BYNAME:
9180     if ((cb->external_options & PCRE2_MATCH_UNSET_BACKREF) != 0)
9181       goto ISNOTFIXED;
9182     /* Fall through */
9183 
9184     case META_RECURSE_BYNAME:
9185       {
9186       int i;
9187       PCRE2_SPTR name;
9188       BOOL is_dupname = FALSE;
9189       named_group *ng = cb->named_groups;
9190       uint32_t meta_code = META_CODE(*pptr);
9191       uint32_t length = *(++pptr);
9192 
9193       GETPLUSOFFSET(offset, pptr);
9194       name = cb->start_pattern + offset;
9195       for (i = 0; i < cb->names_found; i++, ng++)
9196         {
9197         if (length == ng->length && PRIV(strncmp)(name, ng->name, length) == 0)
9198           {
9199           group = ng->number;
9200           is_dupname = ng->isdup;
9201           break;
9202           }
9203         }
9204 
9205       if (group == 0)
9206         {
9207         *errcodeptr = ERR15;  /* Non-existent subpattern */
9208         cb->erroroffset = offset;
9209         return -1;
9210         }
9211 
9212       /* A numerical back reference can be fixed length if duplicate capturing
9213       groups are not being used. A non-duplicate named back reference can also
9214       be handled. */
9215 
9216       if (meta_code == META_RECURSE_BYNAME ||
9217           (!is_dupname && (cb->external_flags & PCRE2_DUPCAPUSED) == 0))
9218         goto RECURSE_OR_BACKREF_LENGTH;  /* Handle as a numbered version. */
9219       }
9220     goto ISNOTFIXED;                     /* Duplicate name or number */
9221 
9222     /* The offset values for back references < 10 are in a separate vector
9223     because otherwise they would use more than two parsed pattern elements on
9224     64-bit systems. */
9225 
9226     case META_BACKREF:
9227     if ((cb->external_options & PCRE2_MATCH_UNSET_BACKREF) != 0 ||
9228         (cb->external_flags & PCRE2_DUPCAPUSED) != 0)
9229       goto ISNOTFIXED;
9230     group = META_DATA(*pptr);
9231     if (group < 10)
9232       {
9233       offset = cb->small_ref_offset[group];
9234       goto RECURSE_OR_BACKREF_LENGTH;
9235       }
9236 
9237     /* Fall through */
9238     /* For groups >= 10 - picking up group twice does no harm. */
9239 
9240     /* A true recursion implies not fixed length, but a subroutine call may
9241     be OK. Back reference "recursions" are also failed. */
9242 
9243     case META_RECURSE:
9244     group = META_DATA(*pptr);
9245     GETPLUSOFFSET(offset, pptr);
9246 
9247     RECURSE_OR_BACKREF_LENGTH:
9248     if (group > cb->bracount)
9249       {
9250       cb->erroroffset = offset;
9251       *errcodeptr = ERR15;  /* Non-existent subpattern */
9252       return -1;
9253       }
9254     if (group == 0) goto ISNOTFIXED;  /* Local recursion */
9255     for (gptr = cb->parsed_pattern; *gptr != META_END; gptr++)
9256       {
9257       if (META_CODE(*gptr) == META_BIGVALUE) gptr++;
9258         else if (*gptr == (META_CAPTURE | group)) break;
9259       }
9260 
9261     /* We must start the search for the end of the group at the first meta code
9262     inside the group. Otherwise it will be treated as an enclosed group. */
9263 
9264     gptrend = parsed_skip(gptr + 1, PSKIP_KET);
9265     if (gptrend == NULL) goto PARSED_SKIP_FAILED;
9266     if (pptr > gptr && pptr < gptrend) goto ISNOTFIXED;  /* Local recursion */
9267     for (r = recurses; r != NULL; r = r->prev) if (r->groupptr == gptr) break;
9268     if (r != NULL) goto ISNOTFIXED;   /* Mutual recursion */
9269     this_recurse.prev = recurses;
9270     this_recurse.groupptr = gptr;
9271 
9272     /* We do not need to know the position of the end of the group, that is,
9273     gptr is not used after the call to get_grouplength(). Setting the second
9274     argument FALSE stops it scanning for the end when the length can be found
9275     in the cache. */
9276 
9277     gptr++;
9278     grouplength = get_grouplength(&gptr, FALSE, errcodeptr, lcptr, group,
9279       &this_recurse, cb);
9280     if (grouplength < 0)
9281       {
9282       if (*errcodeptr == 0) goto ISNOTFIXED;
9283       return -1;  /* Error already set */
9284       }
9285     itemlength = grouplength;
9286     break;
9287 
9288     /* A (DEFINE) group is never obeyed inline and so it does not contribute to
9289     the length of this branch. Skip from the following item to the next
9290     unpaired ket. */
9291 
9292     case META_COND_DEFINE:
9293     pptr = parsed_skip(pptr + 1, PSKIP_KET);
9294     break;
9295 
9296     /* Check other nested groups - advance past the initial data for each type
9297     and then seek a fixed length with get_grouplength(). */
9298 
9299     case META_COND_NAME:
9300     case META_COND_NUMBER:
9301     case META_COND_RNAME:
9302     case META_COND_RNUMBER:
9303     pptr += 2 + SIZEOFFSET;
9304     goto CHECK_GROUP;
9305 
9306     case META_COND_ASSERT:
9307     pptr += 1;
9308     goto CHECK_GROUP;
9309 
9310     case META_COND_VERSION:
9311     pptr += 4;
9312     goto CHECK_GROUP;
9313 
9314     case META_CAPTURE:
9315     group = META_DATA(*pptr);
9316     /* Fall through */
9317 
9318     case META_ATOMIC:
9319     case META_NOCAPTURE:
9320     case META_SCRIPT_RUN:
9321     pptr++;
9322     CHECK_GROUP:
9323     grouplength = get_grouplength(&pptr, TRUE, errcodeptr, lcptr, group,
9324       recurses, cb);
9325     if (grouplength < 0) return -1;
9326     itemlength = grouplength;
9327     break;
9328 
9329     /* Exact repetition is OK; variable repetition is not. A repetition of zero
9330     must subtract the length that has already been added. */
9331 
9332     case META_MINMAX:
9333     case META_MINMAX_PLUS:
9334     case META_MINMAX_QUERY:
9335     if (pptr[1] == pptr[2])
9336       {
9337       switch(pptr[1])
9338         {
9339         case 0:
9340         branchlength -= lastitemlength;
9341         break;
9342 
9343         case 1:
9344         itemlength = 0;
9345         break;
9346 
9347         default:  /* Check for integer overflow */
9348         if (lastitemlength != 0 &&  /* Should not occur, but just in case */
9349             INT_MAX/lastitemlength < pptr[1] - 1)
9350           {
9351           *errcodeptr = ERR87;  /* Integer overflow; lookbehind too big */
9352           return -1;
9353           }
9354         itemlength = (pptr[1] - 1) * lastitemlength;
9355         break;
9356         }
9357       pptr += 2;
9358       break;
9359       }
9360     /* Fall through */
9361 
9362     /* Any other item means this branch does not have a fixed length. */
9363 
9364     default:
9365     ISNOTFIXED:
9366     *errcodeptr = ERR25;   /* Not fixed length */
9367     return -1;
9368     }
9369 
9370   /* Add the item length to the branchlength, checking for integer overflow and
9371   for the branch length exceeding the limit. */
9372 
9373   if (INT_MAX - branchlength < (int)itemlength ||
9374       (branchlength += itemlength) > LOOKBEHIND_MAX)
9375     {
9376     *errcodeptr = ERR87;
9377     return -1;
9378     }
9379 
9380   /* Save this item length for use if the next item is a quantifier. */
9381 
9382   lastitemlength = itemlength;
9383   }
9384 
9385 EXIT:
9386 *pptrptr = pptr;
9387 return branchlength;
9388 
9389 PARSED_SKIP_FAILED:
9390 *errcodeptr = ERR90;
9391 return -1;
9392 }
9393 
9394 
9395 
9396 /*************************************************
9397 *        Set lengths in a lookbehind             *
9398 *************************************************/
9399 
9400 /* This function is called for each lookbehind, to set the lengths in its
9401 branches. An error occurs if any branch does not have a fixed length that is
9402 less than the maximum (65535). On exit, the pointer must be left on the final
9403 ket.
9404 
9405 The function also maintains the max_lookbehind value. Any lookbehind branch
9406 that contains a nested lookbehind may actually look further back than the
9407 length of the branch. The additional amount is passed back from
9408 get_branchlength() as an "extra" value.
9409 
9410 Arguments:
9411   pptrptr     pointer to pointer in the parsed pattern
9412   errcodeptr  pointer to error code
9413   lcptr       pointer to loop counter
9414   recurses    chain of recurse_check to catch mutual recursion
9415   cb          pointer to compile block
9416 
9417 Returns:      TRUE if all is well
9418               FALSE otherwise, with error code and offset set
9419 */
9420 
9421 static BOOL
set_lookbehind_lengths(uint32_t ** pptrptr,int * errcodeptr,int * lcptr,parsed_recurse_check * recurses,compile_block * cb)9422 set_lookbehind_lengths(uint32_t **pptrptr, int *errcodeptr, int *lcptr,
9423   parsed_recurse_check *recurses, compile_block *cb)
9424 {
9425 PCRE2_SIZE offset;
9426 int branchlength;
9427 uint32_t *bptr = *pptrptr;
9428 
9429 READPLUSOFFSET(offset, bptr);  /* Offset for error messages */
9430 *pptrptr += SIZEOFFSET;
9431 
9432 do
9433   {
9434   *pptrptr += 1;
9435   branchlength = get_branchlength(pptrptr, errcodeptr, lcptr, recurses, cb);
9436   if (branchlength < 0)
9437     {
9438     /* The errorcode and offset may already be set from a nested lookbehind. */
9439     if (*errcodeptr == 0) *errcodeptr = ERR25;
9440     if (cb->erroroffset == PCRE2_UNSET) cb->erroroffset = offset;
9441     return FALSE;
9442     }
9443   if (branchlength > cb->max_lookbehind) cb->max_lookbehind = branchlength;
9444   *bptr |= branchlength;  /* branchlength never more than 65535 */
9445   bptr = *pptrptr;
9446   }
9447 while (*bptr == META_ALT);
9448 
9449 return TRUE;
9450 }
9451 
9452 
9453 
9454 /*************************************************
9455 *         Check parsed pattern lookbehinds       *
9456 *************************************************/
9457 
9458 /* This function is called at the end of parsing a pattern if any lookbehinds
9459 were encountered. It scans the parsed pattern for them, calling
9460 set_lookbehind_lengths() for each one. At the start, the errorcode is zero and
9461 the error offset is marked unset. The enables the functions above not to
9462 override settings from deeper nestings.
9463 
9464 This function is called recursively from get_branchlength() for lookaheads in
9465 order to process any lookbehinds that they may contain. It stops when it hits a
9466 non-nested closing parenthesis in this case, returning a pointer to it.
9467 
9468 Arguments
9469   pptr      points to where to start (start of pattern or start of lookahead)
9470   retptr    if not NULL, return the ket pointer here
9471   recurses  chain of recurse_check to catch mutual recursion
9472   cb        points to the compile block
9473 
9474 Returns:    0 on success, or an errorcode (cb->erroroffset will be set)
9475 */
9476 
9477 static int
check_lookbehinds(uint32_t * pptr,uint32_t ** retptr,parsed_recurse_check * recurses,compile_block * cb)9478 check_lookbehinds(uint32_t *pptr, uint32_t **retptr,
9479   parsed_recurse_check *recurses, compile_block *cb)
9480 {
9481 int errorcode = 0;
9482 int loopcount = 0;
9483 int nestlevel = 0;
9484 
9485 cb->erroroffset = PCRE2_UNSET;
9486 
9487 for (; *pptr != META_END; pptr++)
9488   {
9489   if (*pptr < META_END) continue;  /* Literal */
9490 
9491   switch (META_CODE(*pptr))
9492     {
9493     default:
9494     return ERR70;  /* Unrecognized meta code */
9495 
9496     case META_ESCAPE:
9497     if (*pptr - META_ESCAPE == ESC_P || *pptr - META_ESCAPE == ESC_p)
9498       pptr += 1;
9499     break;
9500 
9501     case META_KET:
9502     if (--nestlevel < 0)
9503       {
9504       if (retptr != NULL) *retptr = pptr;
9505       return 0;
9506       }
9507     break;
9508 
9509     case META_ATOMIC:
9510     case META_CAPTURE:
9511     case META_COND_ASSERT:
9512     case META_LOOKAHEAD:
9513     case META_LOOKAHEADNOT:
9514     case META_LOOKAHEAD_NA:
9515     case META_NOCAPTURE:
9516     case META_SCRIPT_RUN:
9517     nestlevel++;
9518     break;
9519 
9520     case META_ACCEPT:
9521     case META_ALT:
9522     case META_ASTERISK:
9523     case META_ASTERISK_PLUS:
9524     case META_ASTERISK_QUERY:
9525     case META_BACKREF:
9526     case META_CIRCUMFLEX:
9527     case META_CLASS:
9528     case META_CLASS_EMPTY:
9529     case META_CLASS_EMPTY_NOT:
9530     case META_CLASS_END:
9531     case META_CLASS_NOT:
9532     case META_COMMIT:
9533     case META_DOLLAR:
9534     case META_DOT:
9535     case META_FAIL:
9536     case META_PLUS:
9537     case META_PLUS_PLUS:
9538     case META_PLUS_QUERY:
9539     case META_PRUNE:
9540     case META_QUERY:
9541     case META_QUERY_PLUS:
9542     case META_QUERY_QUERY:
9543     case META_RANGE_ESCAPED:
9544     case META_RANGE_LITERAL:
9545     case META_SKIP:
9546     case META_THEN:
9547     break;
9548 
9549     case META_RECURSE:
9550     pptr += SIZEOFFSET;
9551     break;
9552 
9553     case META_BACKREF_BYNAME:
9554     case META_RECURSE_BYNAME:
9555     pptr += 1 + SIZEOFFSET;
9556     break;
9557 
9558     case META_COND_DEFINE:
9559     pptr += SIZEOFFSET;
9560     nestlevel++;
9561     break;
9562 
9563     case META_COND_NAME:
9564     case META_COND_NUMBER:
9565     case META_COND_RNAME:
9566     case META_COND_RNUMBER:
9567     pptr += 1 + SIZEOFFSET;
9568     nestlevel++;
9569     break;
9570 
9571     case META_COND_VERSION:
9572     pptr += 3;
9573     nestlevel++;
9574     break;
9575 
9576     case META_CALLOUT_STRING:
9577     pptr += 3 + SIZEOFFSET;
9578     break;
9579 
9580     case META_BIGVALUE:
9581     case META_OPTIONS:
9582     case META_POSIX:
9583     case META_POSIX_NEG:
9584     pptr += 1;
9585     break;
9586 
9587     case META_MINMAX:
9588     case META_MINMAX_QUERY:
9589     case META_MINMAX_PLUS:
9590     pptr += 2;
9591     break;
9592 
9593     case META_CALLOUT_NUMBER:
9594     pptr += 3;
9595     break;
9596 
9597     case META_MARK:
9598     case META_COMMIT_ARG:
9599     case META_PRUNE_ARG:
9600     case META_SKIP_ARG:
9601     case META_THEN_ARG:
9602     pptr += 1 + pptr[1];
9603     break;
9604 
9605     case META_LOOKBEHIND:
9606     case META_LOOKBEHINDNOT:
9607     case META_LOOKBEHIND_NA:
9608     if (!set_lookbehind_lengths(&pptr, &errorcode, &loopcount, recurses, cb))
9609       return errorcode;
9610     break;
9611     }
9612   }
9613 
9614 return 0;
9615 }
9616 
9617 
9618 
9619 /*************************************************
9620 *     External function to compile a pattern     *
9621 *************************************************/
9622 
9623 /* This function reads a regular expression in the form of a string and returns
9624 a pointer to a block of store holding a compiled version of the expression.
9625 
9626 Arguments:
9627   pattern       the regular expression
9628   patlen        the length of the pattern, or PCRE2_ZERO_TERMINATED
9629   options       option bits
9630   errorptr      pointer to errorcode
9631   erroroffset   pointer to error offset
9632   ccontext      points to a compile context or is NULL
9633 
9634 Returns:        pointer to compiled data block, or NULL on error,
9635                 with errorcode and erroroffset set
9636 */
9637 
9638 PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
pcre2_compile(PCRE2_SPTR pattern,PCRE2_SIZE patlen,uint32_t options,int * errorptr,PCRE2_SIZE * erroroffset,pcre2_compile_context * ccontext)9639 pcre2_compile(PCRE2_SPTR pattern, PCRE2_SIZE patlen, uint32_t options,
9640    int *errorptr, PCRE2_SIZE *erroroffset, pcre2_compile_context *ccontext)
9641 {
9642 BOOL utf;                             /* Set TRUE for UTF mode */
9643 BOOL ucp;                             /* Set TRUE for UCP mode */
9644 BOOL has_lookbehind = FALSE;          /* Set TRUE if a lookbehind is found */
9645 BOOL zero_terminated;                 /* Set TRUE for zero-terminated pattern */
9646 pcre2_real_code *re = NULL;           /* What we will return */
9647 compile_block cb;                     /* "Static" compile-time data */
9648 const uint8_t *tables;                /* Char tables base pointer */
9649 
9650 PCRE2_UCHAR *code;                    /* Current pointer in compiled code */
9651 PCRE2_SPTR codestart;                 /* Start of compiled code */
9652 PCRE2_SPTR ptr;                       /* Current pointer in pattern */
9653 uint32_t *pptr;                       /* Current pointer in parsed pattern */
9654 
9655 PCRE2_SIZE length = 1;                /* Allow for final END opcode */
9656 PCRE2_SIZE usedlength;                /* Actual length used */
9657 PCRE2_SIZE re_blocksize;              /* Size of memory block */
9658 PCRE2_SIZE big32count = 0;            /* 32-bit literals >= 0x80000000 */
9659 PCRE2_SIZE parsed_size_needed;        /* Needed for parsed pattern */
9660 
9661 int32_t firstcuflags, reqcuflags;     /* Type of first/req code unit */
9662 uint32_t firstcu, reqcu;              /* Value of first/req code unit */
9663 uint32_t setflags = 0;                /* NL and BSR set flags */
9664 
9665 uint32_t skipatstart;                 /* When checking (*UTF) etc */
9666 uint32_t limit_heap  = UINT32_MAX;
9667 uint32_t limit_match = UINT32_MAX;    /* Unset match limits */
9668 uint32_t limit_depth = UINT32_MAX;
9669 
9670 int newline = 0;                      /* Unset; can be set by the pattern */
9671 int bsr = 0;                          /* Unset; can be set by the pattern */
9672 int errorcode = 0;                    /* Initialize to avoid compiler warn */
9673 int regexrc;                          /* Return from compile */
9674 
9675 uint32_t i;                           /* Local loop counter */
9676 
9677 /* Comments at the head of this file explain about these variables. */
9678 
9679 uint32_t stack_groupinfo[GROUPINFO_DEFAULT_SIZE];
9680 uint32_t stack_parsed_pattern[PARSED_PATTERN_DEFAULT_SIZE];
9681 named_group named_groups[NAMED_GROUP_LIST_SIZE];
9682 
9683 /* The workspace is used in different ways in the different compiling phases.
9684 It needs to be 16-bit aligned for the preliminary parsing scan. */
9685 
9686 uint32_t c16workspace[C16_WORK_SIZE];
9687 PCRE2_UCHAR *cworkspace = (PCRE2_UCHAR *)c16workspace;
9688 
9689 
9690 /* -------------- Check arguments and set up the pattern ----------------- */
9691 
9692 /* There must be error code and offset pointers. */
9693 
9694 if (errorptr == NULL || erroroffset == NULL) return NULL;
9695 *errorptr = ERR0;
9696 *erroroffset = 0;
9697 
9698 /* There must be a pattern! */
9699 
9700 if (pattern == NULL)
9701   {
9702   *errorptr = ERR16;
9703   return NULL;
9704   }
9705 
9706 /* A NULL compile context means "use a default context" */
9707 
9708 if (ccontext == NULL)
9709   ccontext = (pcre2_compile_context *)(&PRIV(default_compile_context));
9710 
9711 /* PCRE2_MATCH_INVALID_UTF implies UTF */
9712 
9713 if ((options & PCRE2_MATCH_INVALID_UTF) != 0) options |= PCRE2_UTF;
9714 
9715 /* Check that all undefined public option bits are zero. */
9716 
9717 if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0 ||
9718     (ccontext->extra_options & ~PUBLIC_COMPILE_EXTRA_OPTIONS) != 0)
9719   {
9720   *errorptr = ERR17;
9721   return NULL;
9722   }
9723 
9724 if ((options & PCRE2_LITERAL) != 0 &&
9725     ((options & ~PUBLIC_LITERAL_COMPILE_OPTIONS) != 0 ||
9726      (ccontext->extra_options & ~PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS) != 0))
9727   {
9728   *errorptr = ERR92;
9729   return NULL;
9730   }
9731 
9732 /* A zero-terminated pattern is indicated by the special length value
9733 PCRE2_ZERO_TERMINATED. Check for an overlong pattern. */
9734 
9735 if ((zero_terminated = (patlen == PCRE2_ZERO_TERMINATED)))
9736   patlen = PRIV(strlen)(pattern);
9737 
9738 if (patlen > ccontext->max_pattern_length)
9739   {
9740   *errorptr = ERR88;
9741   return NULL;
9742   }
9743 
9744 /* From here on, all returns from this function should end up going via the
9745 EXIT label. */
9746 
9747 
9748 /* ------------ Initialize the "static" compile data -------------- */
9749 
9750 tables = (ccontext->tables != NULL)? ccontext->tables : PRIV(default_tables);
9751 
9752 cb.lcc = tables + lcc_offset;          /* Individual */
9753 cb.fcc = tables + fcc_offset;          /*   character */
9754 cb.cbits = tables + cbits_offset;      /*      tables */
9755 cb.ctypes = tables + ctypes_offset;
9756 
9757 cb.assert_depth = 0;
9758 cb.bracount = 0;
9759 cb.cx = ccontext;
9760 cb.dupnames = FALSE;
9761 cb.end_pattern = pattern + patlen;
9762 cb.erroroffset = 0;
9763 cb.external_flags = 0;
9764 cb.external_options = options;
9765 cb.groupinfo = stack_groupinfo;
9766 cb.had_recurse = FALSE;
9767 cb.lastcapture = 0;
9768 cb.max_lookbehind = 0;
9769 cb.name_entry_size = 0;
9770 cb.name_table = NULL;
9771 cb.named_groups = named_groups;
9772 cb.named_group_list_size = NAMED_GROUP_LIST_SIZE;
9773 cb.names_found = 0;
9774 cb.open_caps = NULL;
9775 cb.parens_depth = 0;
9776 cb.parsed_pattern = stack_parsed_pattern;
9777 cb.req_varyopt = 0;
9778 cb.start_code = cworkspace;
9779 cb.start_pattern = pattern;
9780 cb.start_workspace = cworkspace;
9781 cb.workspace_size = COMPILE_WORK_SIZE;
9782 
9783 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
9784 references to help in deciding whether (.*) can be treated as anchored or not.
9785 */
9786 
9787 cb.top_backref = 0;
9788 cb.backref_map = 0;
9789 
9790 /* Escape sequences \1 to \9 are always back references, but as they are only
9791 two characters long, only two elements can be used in the parsed_pattern
9792 vector. The first contains the reference, and we'd like to use the second to
9793 record the offset in the pattern, so that forward references to non-existent
9794 groups can be diagnosed later with an offset. However, on 64-bit systems,
9795 PCRE2_SIZE won't fit. Instead, we have a vector of offsets for the first
9796 occurrence of \1 to \9, indexed by the second parsed_pattern value. All other
9797 references have enough space for the offset to be put into the parsed pattern.
9798 */
9799 
9800 for (i = 0; i < 10; i++) cb.small_ref_offset[i] = PCRE2_UNSET;
9801 
9802 
9803 /* --------------- Start looking at the pattern --------------- */
9804 
9805 /* Unless PCRE2_LITERAL is set, check for global one-time option settings at
9806 the start of the pattern, and remember the offset to the actual regex. With
9807 valgrind support, make the terminator of a zero-terminated pattern
9808 inaccessible. This catches bugs that would otherwise only show up for
9809 non-zero-terminated patterns. */
9810 
9811 #ifdef SUPPORT_VALGRIND
9812 if (zero_terminated) VALGRIND_MAKE_MEM_NOACCESS(pattern + patlen, CU2BYTES(1));
9813 #endif
9814 
9815 ptr = pattern;
9816 skipatstart = 0;
9817 
9818 if ((options & PCRE2_LITERAL) == 0)
9819   {
9820   while (patlen - skipatstart >= 2 &&
9821          ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
9822          ptr[skipatstart+1] == CHAR_ASTERISK)
9823     {
9824     for (i = 0; i < sizeof(pso_list)/sizeof(pso); i++)
9825       {
9826       uint32_t c, pp;
9827       pso *p = pso_list + i;
9828 
9829       if (patlen - skipatstart - 2 >= p->length &&
9830           PRIV(strncmp_c8)(ptr + skipatstart + 2, (char *)(p->name),
9831             p->length) == 0)
9832         {
9833         skipatstart += p->length + 2;
9834         switch(p->type)
9835           {
9836           case PSO_OPT:
9837           cb.external_options |= p->value;
9838           break;
9839 
9840           case PSO_FLG:
9841           setflags |= p->value;
9842           break;
9843 
9844           case PSO_NL:
9845           newline = p->value;
9846           setflags |= PCRE2_NL_SET;
9847           break;
9848 
9849           case PSO_BSR:
9850           bsr = p->value;
9851           setflags |= PCRE2_BSR_SET;
9852           break;
9853 
9854           case PSO_LIMM:
9855           case PSO_LIMD:
9856           case PSO_LIMH:
9857           c = 0;
9858           pp = skipatstart;
9859           if (!IS_DIGIT(ptr[pp]))
9860             {
9861             errorcode = ERR60;
9862             ptr += pp;
9863             goto HAD_EARLY_ERROR;
9864             }
9865           while (IS_DIGIT(ptr[pp]))
9866             {
9867             if (c > UINT32_MAX / 10 - 1) break;   /* Integer overflow */
9868             c = c*10 + (ptr[pp++] - CHAR_0);
9869             }
9870           if (ptr[pp++] != CHAR_RIGHT_PARENTHESIS)
9871             {
9872             errorcode = ERR60;
9873             ptr += pp;
9874             goto HAD_EARLY_ERROR;
9875             }
9876           if (p->type == PSO_LIMH) limit_heap = c;
9877             else if (p->type == PSO_LIMM) limit_match = c;
9878             else limit_depth = c;
9879           skipatstart += pp - skipatstart;
9880           break;
9881           }
9882         break;   /* Out of the table scan loop */
9883         }
9884       }
9885     if (i >= sizeof(pso_list)/sizeof(pso)) break;   /* Out of pso loop */
9886     }
9887   }
9888 
9889 /* End of pattern-start options; advance to start of real regex. */
9890 
9891 ptr += skipatstart;
9892 
9893 /* Can't support UTF or UCP if PCRE2 was built without Unicode support. */
9894 
9895 #ifndef SUPPORT_UNICODE
9896 if ((cb.external_options & (PCRE2_UTF|PCRE2_UCP)) != 0)
9897   {
9898   errorcode = ERR32;
9899   goto HAD_EARLY_ERROR;
9900   }
9901 #endif
9902 
9903 /* Check UTF. We have the original options in 'options', with that value as
9904 modified by (*UTF) etc in cb->external_options. The extra option
9905 PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is not permitted in UTF-16 mode because the
9906 surrogate code points cannot be represented in UTF-16. */
9907 
9908 utf = (cb.external_options & PCRE2_UTF) != 0;
9909 if (utf)
9910   {
9911   if ((options & PCRE2_NEVER_UTF) != 0)
9912     {
9913     errorcode = ERR74;
9914     goto HAD_EARLY_ERROR;
9915     }
9916   if ((options & PCRE2_NO_UTF_CHECK) == 0 &&
9917        (errorcode = PRIV(valid_utf)(pattern, patlen, erroroffset)) != 0)
9918     goto HAD_ERROR;  /* Offset was set by valid_utf() */
9919 
9920 #if PCRE2_CODE_UNIT_WIDTH == 16
9921   if ((ccontext->extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) != 0)
9922     {
9923     errorcode = ERR91;
9924     goto HAD_EARLY_ERROR;
9925     }
9926 #endif
9927   }
9928 
9929 /* Check UCP lockout. */
9930 
9931 ucp = (cb.external_options & PCRE2_UCP) != 0;
9932 if (ucp && (cb.external_options & PCRE2_NEVER_UCP) != 0)
9933   {
9934   errorcode = ERR75;
9935   goto HAD_EARLY_ERROR;
9936   }
9937 
9938 /* Process the BSR setting. */
9939 
9940 if (bsr == 0) bsr = ccontext->bsr_convention;
9941 
9942 /* Process the newline setting. */
9943 
9944 if (newline == 0) newline = ccontext->newline_convention;
9945 cb.nltype = NLTYPE_FIXED;
9946 switch(newline)
9947   {
9948   case PCRE2_NEWLINE_CR:
9949   cb.nllen = 1;
9950   cb.nl[0] = CHAR_CR;
9951   break;
9952 
9953   case PCRE2_NEWLINE_LF:
9954   cb.nllen = 1;
9955   cb.nl[0] = CHAR_NL;
9956   break;
9957 
9958   case PCRE2_NEWLINE_NUL:
9959   cb.nllen = 1;
9960   cb.nl[0] = CHAR_NUL;
9961   break;
9962 
9963   case PCRE2_NEWLINE_CRLF:
9964   cb.nllen = 2;
9965   cb.nl[0] = CHAR_CR;
9966   cb.nl[1] = CHAR_NL;
9967   break;
9968 
9969   case PCRE2_NEWLINE_ANY:
9970   cb.nltype = NLTYPE_ANY;
9971   break;
9972 
9973   case PCRE2_NEWLINE_ANYCRLF:
9974   cb.nltype = NLTYPE_ANYCRLF;
9975   break;
9976 
9977   default:
9978   errorcode = ERR56;
9979   goto HAD_EARLY_ERROR;
9980   }
9981 
9982 /* Pre-scan the pattern to do two things: (1) Discover the named groups and
9983 their numerical equivalents, so that this information is always available for
9984 the remaining processing. (2) At the same time, parse the pattern and put a
9985 processed version into the parsed_pattern vector. This has escapes interpreted
9986 and comments removed (amongst other things).
9987 
9988 In all but one case, when PCRE2_AUTO_CALLOUT is not set, the number of unsigned
9989 32-bit ints in the parsed pattern is bounded by the length of the pattern plus
9990 one (for the terminator) plus four if PCRE2_EXTRA_WORD or PCRE2_EXTRA_LINE is
9991 set. The exceptional case is when running in 32-bit, non-UTF mode, when literal
9992 characters greater than META_END (0x80000000) have to be coded as two units. In
9993 this case, therefore, we scan the pattern to check for such values. */
9994 
9995 #if PCRE2_CODE_UNIT_WIDTH == 32
9996 if (!utf)
9997   {
9998   PCRE2_SPTR p;
9999   for (p = ptr; p < cb.end_pattern; p++) if (*p >= META_END) big32count++;
10000   }
10001 #endif
10002 
10003 /* Ensure that the parsed pattern buffer is big enough. When PCRE2_AUTO_CALLOUT
10004 is set we have to assume a numerical callout (4 elements) for each character
10005 plus one at the end. This is overkill, but memory is plentiful these days. For
10006 many smaller patterns the vector on the stack (which was set up above) can be
10007 used. */
10008 
10009 parsed_size_needed = patlen - skipatstart + big32count;
10010 
10011 if ((ccontext->extra_options &
10012      (PCRE2_EXTRA_MATCH_WORD|PCRE2_EXTRA_MATCH_LINE)) != 0)
10013   parsed_size_needed += 4;
10014 
10015 if ((options & PCRE2_AUTO_CALLOUT) != 0)
10016   parsed_size_needed = (parsed_size_needed + 1) * 5;
10017 
10018 if (parsed_size_needed >= PARSED_PATTERN_DEFAULT_SIZE)
10019   {
10020   uint32_t *heap_parsed_pattern = ccontext->memctl.malloc(
10021     (parsed_size_needed + 1) * sizeof(uint32_t), ccontext->memctl.memory_data);
10022   if (heap_parsed_pattern == NULL)
10023     {
10024     *errorptr = ERR21;
10025     goto EXIT;
10026     }
10027   cb.parsed_pattern = heap_parsed_pattern;
10028   }
10029 cb.parsed_pattern_end = cb.parsed_pattern + parsed_size_needed + 1;
10030 
10031 /* Do the parsing scan. */
10032 
10033 errorcode = parse_regex(ptr, cb.external_options, &has_lookbehind, &cb);
10034 if (errorcode != 0) goto HAD_CB_ERROR;
10035 
10036 /* Workspace is needed to remember information about numbered groups: whether a
10037 group can match an empty string and what its fixed length is. This is done to
10038 avoid the possibility of recursive references causing very long compile times
10039 when checking these features. Unnumbered groups do not have this exposure since
10040 they cannot be referenced. We use an indexed vector for this purpose. If there
10041 are sufficiently few groups, the default vector on the stack, as set up above,
10042 can be used. Otherwise we have to get/free a special vector. The vector must be
10043 initialized to zero. */
10044 
10045 if (cb.bracount >= GROUPINFO_DEFAULT_SIZE)
10046   {
10047   cb.groupinfo = ccontext->memctl.malloc(
10048     (cb.bracount + 1)*sizeof(uint32_t), ccontext->memctl.memory_data);
10049   if (cb.groupinfo == NULL)
10050     {
10051     errorcode = ERR21;
10052     cb.erroroffset = 0;
10053     goto HAD_CB_ERROR;
10054     }
10055   }
10056 memset(cb.groupinfo, 0, (cb.bracount + 1) * sizeof(uint32_t));
10057 
10058 /* If there were any lookbehinds, scan the parsed pattern to figure out their
10059 lengths. */
10060 
10061 if (has_lookbehind)
10062   {
10063   errorcode = check_lookbehinds(cb.parsed_pattern, NULL, NULL, &cb);
10064   if (errorcode != 0) goto HAD_CB_ERROR;
10065   }
10066 
10067 /* For debugging, there is a function that shows the parsed data vector. */
10068 
10069 #ifdef DEBUG_SHOW_PARSED
10070 fprintf(stderr, "+++ Pre-scan complete:\n");
10071 show_parsed(&cb);
10072 #endif
10073 
10074 /* For debugging capturing information this code can be enabled. */
10075 
10076 #ifdef DEBUG_SHOW_CAPTURES
10077   {
10078   named_group *ng = cb.named_groups;
10079   fprintf(stderr, "+++Captures: %d\n", cb.bracount);
10080   for (i = 0; i < cb.names_found; i++, ng++)
10081     {
10082     fprintf(stderr, "+++%3d %.*s\n", ng->number, ng->length, ng->name);
10083     }
10084   }
10085 #endif
10086 
10087 /* Pretend to compile the pattern while actually just accumulating the amount
10088 of memory required in the 'length' variable. This behaviour is triggered by
10089 passing a non-NULL final argument to compile_regex(). We pass a block of
10090 workspace (cworkspace) for it to compile parts of the pattern into; the
10091 compiled code is discarded when it is no longer needed, so hopefully this
10092 workspace will never overflow, though there is a test for its doing so.
10093 
10094 On error, errorcode will be set non-zero, so we don't need to look at the
10095 result of the function. The initial options have been put into the cb block,
10096 but we still have to pass a separate options variable (the first argument)
10097 because the options may change as the pattern is processed. */
10098 
10099 cb.erroroffset = patlen;   /* For any subsequent errors that do not set it */
10100 pptr = cb.parsed_pattern;
10101 code = cworkspace;
10102 *code = OP_BRA;
10103 
10104 (void)compile_regex(cb.external_options, &code, &pptr, &errorcode, 0, &firstcu,
10105    &firstcuflags, &reqcu, &reqcuflags, NULL, &cb, &length);
10106 
10107 if (errorcode != 0) goto HAD_CB_ERROR;  /* Offset is in cb.erroroffset */
10108 
10109 /* This should be caught in compile_regex(), but just in case... */
10110 
10111 if (length > MAX_PATTERN_SIZE)
10112   {
10113   errorcode = ERR20;
10114   goto HAD_CB_ERROR;
10115   }
10116 
10117 /* Compute the size of, and then get and initialize, the data block for storing
10118 the compiled pattern and names table. Integer overflow should no longer be
10119 possible because nowadays we limit the maximum value of cb.names_found and
10120 cb.name_entry_size. */
10121 
10122 re_blocksize = sizeof(pcre2_real_code) +
10123   CU2BYTES(length +
10124   (PCRE2_SIZE)cb.names_found * (PCRE2_SIZE)cb.name_entry_size);
10125 re = (pcre2_real_code *)
10126   ccontext->memctl.malloc(re_blocksize, ccontext->memctl.memory_data);
10127 if (re == NULL)
10128   {
10129   errorcode = ERR21;
10130   goto HAD_CB_ERROR;
10131   }
10132 
10133 /* The compiler may put padding at the end of the pcre2_real_code structure in
10134 order to round it up to a multiple of 4 or 8 bytes. This means that when a
10135 compiled pattern is copied (for example, when serialized) undefined bytes are
10136 read, and this annoys debuggers such as valgrind. To avoid this, we explicitly
10137 write to the last 8 bytes of the structure before setting the fields. */
10138 
10139 memset((char *)re + sizeof(pcre2_real_code) - 8, 0, 8);
10140 re->memctl = ccontext->memctl;
10141 re->tables = tables;
10142 re->executable_jit = NULL;
10143 memset(re->start_bitmap, 0, 32 * sizeof(uint8_t));
10144 re->blocksize = re_blocksize;
10145 re->magic_number = MAGIC_NUMBER;
10146 re->compile_options = options;
10147 re->overall_options = cb.external_options;
10148 re->extra_options = ccontext->extra_options;
10149 re->flags = PCRE2_CODE_UNIT_WIDTH/8 | cb.external_flags | setflags;
10150 re->limit_heap = limit_heap;
10151 re->limit_match = limit_match;
10152 re->limit_depth = limit_depth;
10153 re->first_codeunit = 0;
10154 re->last_codeunit = 0;
10155 re->bsr_convention = bsr;
10156 re->newline_convention = newline;
10157 re->max_lookbehind = 0;
10158 re->minlength = 0;
10159 re->top_bracket = 0;
10160 re->top_backref = 0;
10161 re->name_entry_size = cb.name_entry_size;
10162 re->name_count = cb.names_found;
10163 
10164 /* The basic block is immediately followed by the name table, and the compiled
10165 code follows after that. */
10166 
10167 codestart = (PCRE2_SPTR)((uint8_t *)re + sizeof(pcre2_real_code)) +
10168   re->name_entry_size * re->name_count;
10169 
10170 /* Update the compile data block for the actual compile. The starting points of
10171 the name/number translation table and of the code are passed around in the
10172 compile data block. The start/end pattern and initial options are already set
10173 from the pre-compile phase, as is the name_entry_size field. */
10174 
10175 cb.parens_depth = 0;
10176 cb.assert_depth = 0;
10177 cb.lastcapture = 0;
10178 cb.name_table = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code));
10179 cb.start_code = codestart;
10180 cb.req_varyopt = 0;
10181 cb.had_accept = FALSE;
10182 cb.had_pruneorskip = FALSE;
10183 cb.open_caps = NULL;
10184 
10185 /* If any named groups were found, create the name/number table from the list
10186 created in the pre-pass. */
10187 
10188 if (cb.names_found > 0)
10189   {
10190   named_group *ng = cb.named_groups;
10191   for (i = 0; i < cb.names_found; i++, ng++)
10192     add_name_to_table(&cb, ng->name, ng->length, ng->number, i);
10193   }
10194 
10195 /* Set up a starting, non-extracting bracket, then compile the expression. On
10196 error, errorcode will be set non-zero, so we don't need to look at the result
10197 of the function here. */
10198 
10199 pptr = cb.parsed_pattern;
10200 code = (PCRE2_UCHAR *)codestart;
10201 *code = OP_BRA;
10202 regexrc = compile_regex(re->overall_options, &code, &pptr, &errorcode, 0,
10203   &firstcu, &firstcuflags, &reqcu, &reqcuflags, NULL, &cb, NULL);
10204 if (regexrc < 0) re->flags |= PCRE2_MATCH_EMPTY;
10205 re->top_bracket = cb.bracount;
10206 re->top_backref = cb.top_backref;
10207 re->max_lookbehind = cb.max_lookbehind;
10208 
10209 if (cb.had_accept)
10210   {
10211   reqcu = 0;                     /* Must disable after (*ACCEPT) */
10212   reqcuflags = REQ_NONE;
10213   re->flags |= PCRE2_HASACCEPT;  /* Disables minimum length */
10214   }
10215 
10216 /* Fill in the final opcode and check for disastrous overflow. If no overflow,
10217 but the estimated length exceeds the really used length, adjust the value of
10218 re->blocksize, and if valgrind support is configured, mark the extra allocated
10219 memory as unaddressable, so that any out-of-bound reads can be detected. */
10220 
10221 *code++ = OP_END;
10222 usedlength = code - codestart;
10223 if (usedlength > length) errorcode = ERR23; else
10224   {
10225   re->blocksize -= CU2BYTES(length - usedlength);
10226 #ifdef SUPPORT_VALGRIND
10227   VALGRIND_MAKE_MEM_NOACCESS(code, CU2BYTES(length - usedlength));
10228 #endif
10229   }
10230 
10231 /* Scan the pattern for recursion/subroutine calls and convert the group
10232 numbers into offsets. Maintain a small cache so that repeated groups containing
10233 recursions are efficiently handled. */
10234 
10235 #define RSCAN_CACHE_SIZE 8
10236 
10237 if (errorcode == 0 && cb.had_recurse)
10238   {
10239   PCRE2_UCHAR *rcode;
10240   PCRE2_SPTR rgroup;
10241   unsigned int ccount = 0;
10242   int start = RSCAN_CACHE_SIZE;
10243   recurse_cache rc[RSCAN_CACHE_SIZE];
10244 
10245   for (rcode = (PCRE2_UCHAR *)find_recurse(codestart, utf);
10246        rcode != NULL;
10247        rcode = (PCRE2_UCHAR *)find_recurse(rcode + 1 + LINK_SIZE, utf))
10248     {
10249     int p, groupnumber;
10250 
10251     groupnumber = (int)GET(rcode, 1);
10252     if (groupnumber == 0) rgroup = codestart; else
10253       {
10254       PCRE2_SPTR search_from = codestart;
10255       rgroup = NULL;
10256       for (i = 0, p = start; i < ccount; i++, p = (p + 1) & 7)
10257         {
10258         if (groupnumber == rc[p].groupnumber)
10259           {
10260           rgroup = rc[p].group;
10261           break;
10262           }
10263 
10264         /* Group n+1 must always start to the right of group n, so we can save
10265         search time below when the new group number is greater than any of the
10266         previously found groups. */
10267 
10268         if (groupnumber > rc[p].groupnumber) search_from = rc[p].group;
10269         }
10270 
10271       if (rgroup == NULL)
10272         {
10273         rgroup = PRIV(find_bracket)(search_from, utf, groupnumber);
10274         if (rgroup == NULL)
10275           {
10276           errorcode = ERR53;
10277           break;
10278           }
10279         if (--start < 0) start = RSCAN_CACHE_SIZE - 1;
10280         rc[start].groupnumber = groupnumber;
10281         rc[start].group = rgroup;
10282         if (ccount < RSCAN_CACHE_SIZE) ccount++;
10283         }
10284       }
10285 
10286     PUT(rcode, 1, rgroup - codestart);
10287     }
10288   }
10289 
10290 /* In rare debugging situations we sometimes need to look at the compiled code
10291 at this stage. */
10292 
10293 #ifdef DEBUG_CALL_PRINTINT
10294 pcre2_printint(re, stderr, TRUE);
10295 fprintf(stderr, "Length=%lu Used=%lu\n", length, usedlength);
10296 #endif
10297 
10298 /* Unless disabled, check whether any single character iterators can be
10299 auto-possessified. The function overwrites the appropriate opcode values, so
10300 the type of the pointer must be cast. NOTE: the intermediate variable "temp" is
10301 used in this code because at least one compiler gives a warning about loss of
10302 "const" attribute if the cast (PCRE2_UCHAR *)codestart is used directly in the
10303 function call. */
10304 
10305 if (errorcode == 0 && (re->overall_options & PCRE2_NO_AUTO_POSSESS) == 0)
10306   {
10307   PCRE2_UCHAR *temp = (PCRE2_UCHAR *)codestart;
10308   if (PRIV(auto_possessify)(temp, &cb) != 0) errorcode = ERR80;
10309   }
10310 
10311 /* Failed to compile, or error while post-processing. */
10312 
10313 if (errorcode != 0) goto HAD_CB_ERROR;
10314 
10315 /* Successful compile. If the anchored option was not passed, set it if
10316 we can determine that the pattern is anchored by virtue of ^ characters or \A
10317 or anything else, such as starting with non-atomic .* when DOTALL is set and
10318 there are no occurrences of *PRUNE or *SKIP (though there is an option to
10319 disable this case). */
10320 
10321 if ((re->overall_options & PCRE2_ANCHORED) == 0 &&
10322      is_anchored(codestart, 0, &cb, 0, FALSE))
10323   re->overall_options |= PCRE2_ANCHORED;
10324 
10325 /* Set up the first code unit or startline flag, the required code unit, and
10326 then study the pattern. This code need not be obeyed if PCRE2_NO_START_OPTIMIZE
10327 is set, as the data it would create will not be used. Note that a first code
10328 unit (but not the startline flag) is useful for anchored patterns because it
10329 can still give a quick "no match" and also avoid searching for a last code
10330 unit. */
10331 
10332 if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0)
10333   {
10334   int minminlength = 0;  /* For minimal minlength from first/required CU */
10335 
10336   /* If we do not have a first code unit, see if there is one that is asserted
10337   (these are not saved during the compile because they can cause conflicts with
10338   actual literals that follow). */
10339 
10340   if (firstcuflags < 0)
10341     firstcu = find_firstassertedcu(codestart, &firstcuflags, 0);
10342 
10343   /* Save the data for a first code unit. The existence of one means the
10344   minimum length must be at least 1. */
10345 
10346   if (firstcuflags >= 0)
10347     {
10348     re->first_codeunit = firstcu;
10349     re->flags |= PCRE2_FIRSTSET;
10350     minminlength++;
10351 
10352     /* Handle caseless first code units. */
10353 
10354     if ((firstcuflags & REQ_CASELESS) != 0)
10355       {
10356       if (firstcu < 128 || (!utf && !ucp && firstcu < 255))
10357         {
10358         if (cb.fcc[firstcu] != firstcu) re->flags |= PCRE2_FIRSTCASELESS;
10359         }
10360 
10361       /* The first code unit is >= 128 in UTF or UCP mode, or >= 255 otherwise.
10362       In 8-bit UTF mode, code units in the range 128-255 are introductory code
10363       units and cannot have another case, but if UCP is set they may do. */
10364 
10365 #ifdef SUPPORT_UNICODE
10366 #if PCRE2_CODE_UNIT_WIDTH == 8
10367       else if (ucp && !utf && UCD_OTHERCASE(firstcu) != firstcu)
10368         re->flags |= PCRE2_FIRSTCASELESS;
10369 #else
10370       else if ((utf || ucp) && firstcu <= MAX_UTF_CODE_POINT &&
10371                UCD_OTHERCASE(firstcu) != firstcu)
10372         re->flags |= PCRE2_FIRSTCASELESS;
10373 #endif
10374 #endif  /* SUPPORT_UNICODE */
10375       }
10376     }
10377 
10378   /* When there is no first code unit, for non-anchored patterns, see if we can
10379   set the PCRE2_STARTLINE flag. This is helpful for multiline matches when all
10380   branches start with ^ and also when all branches start with non-atomic .* for
10381   non-DOTALL matches when *PRUNE and *SKIP are not present. (There is an option
10382   that disables this case.) */
10383 
10384   else if ((re->overall_options & PCRE2_ANCHORED) == 0 &&
10385            is_startline(codestart, 0, &cb, 0, FALSE))
10386     re->flags |= PCRE2_STARTLINE;
10387 
10388   /* Handle the "required code unit", if one is set. In the UTF case we can
10389   increment the minimum minimum length only if we are sure this really is a
10390   different character and not a non-starting code unit of the first character,
10391   because the minimum length count is in characters, not code units. */
10392 
10393   if (reqcuflags >= 0)
10394     {
10395 #if PCRE2_CODE_UNIT_WIDTH == 16
10396     if ((re->overall_options & PCRE2_UTF) == 0 ||   /* Not UTF */
10397         firstcuflags < 0 ||                         /* First not set */
10398         (firstcu & 0xf800) != 0xd800 ||             /* First not surrogate */
10399         (reqcu & 0xfc00) != 0xdc00)                 /* Req not low surrogate */
10400 #elif PCRE2_CODE_UNIT_WIDTH == 8
10401     if ((re->overall_options & PCRE2_UTF) == 0 ||   /* Not UTF */
10402         firstcuflags < 0 ||                         /* First not set */
10403         (firstcu & 0x80) == 0 ||                    /* First is ASCII */
10404         (reqcu & 0x80) == 0)                        /* Req is ASCII */
10405 #endif
10406       {
10407       minminlength++;
10408       }
10409 
10410     /* In the case of an anchored pattern, set up the value only if it follows
10411     a variable length item in the pattern. */
10412 
10413     if ((re->overall_options & PCRE2_ANCHORED) == 0 ||
10414         (reqcuflags & REQ_VARY) != 0)
10415       {
10416       re->last_codeunit = reqcu;
10417       re->flags |= PCRE2_LASTSET;
10418 
10419       /* Handle caseless required code units as for first code units (above). */
10420 
10421       if ((reqcuflags & REQ_CASELESS) != 0)
10422         {
10423         if (reqcu < 128 || (!utf && !ucp && reqcu < 255))
10424           {
10425           if (cb.fcc[reqcu] != reqcu) re->flags |= PCRE2_LASTCASELESS;
10426           }
10427 #ifdef SUPPORT_UNICODE
10428 #if PCRE2_CODE_UNIT_WIDTH == 8
10429       else if (ucp && !utf && UCD_OTHERCASE(reqcu) != reqcu)
10430         re->flags |= PCRE2_LASTCASELESS;
10431 #else
10432       else if ((utf || ucp) && reqcu <= MAX_UTF_CODE_POINT &&
10433                UCD_OTHERCASE(reqcu) != reqcu)
10434         re->flags |= PCRE2_LASTCASELESS;
10435 #endif
10436 #endif  /* SUPPORT_UNICODE */
10437         }
10438       }
10439     }
10440 
10441   /* Study the compiled pattern to set up information such as a bitmap of
10442   starting code units and a minimum matching length. */
10443 
10444   if (PRIV(study)(re) != 0)
10445     {
10446     errorcode = ERR31;
10447     goto HAD_CB_ERROR;
10448     }
10449 
10450   /* If study() set a bitmap of starting code units, it implies a minimum
10451   length of at least one. */
10452 
10453   if ((re->flags & PCRE2_FIRSTMAPSET) != 0 && minminlength == 0)
10454     minminlength = 1;
10455 
10456   /* If the minimum length set (or not set) by study() is less than the minimum
10457   implied by required code units, override it. */
10458 
10459   if (re->minlength < minminlength) re->minlength = minminlength;
10460   }   /* End of start-of-match optimizations. */
10461 
10462 /* Control ends up here in all cases. When running under valgrind, make a
10463 pattern's terminating zero defined again. If memory was obtained for the parsed
10464 version of the pattern, free it before returning. Also free the list of named
10465 groups if a larger one had to be obtained, and likewise the group information
10466 vector. */
10467 
10468 EXIT:
10469 #ifdef SUPPORT_VALGRIND
10470 if (zero_terminated) VALGRIND_MAKE_MEM_DEFINED(pattern + patlen, CU2BYTES(1));
10471 #endif
10472 if (cb.parsed_pattern != stack_parsed_pattern)
10473   ccontext->memctl.free(cb.parsed_pattern, ccontext->memctl.memory_data);
10474 if (cb.named_group_list_size > NAMED_GROUP_LIST_SIZE)
10475   ccontext->memctl.free((void *)cb.named_groups, ccontext->memctl.memory_data);
10476 if (cb.groupinfo != stack_groupinfo)
10477   ccontext->memctl.free((void *)cb.groupinfo, ccontext->memctl.memory_data);
10478 return re;    /* Will be NULL after an error */
10479 
10480 /* Errors discovered in parse_regex() set the offset value in the compile
10481 block. Errors discovered before it is called must compute it from the ptr
10482 value. After parse_regex() is called, the offset in the compile block is set to
10483 the end of the pattern, but certain errors in compile_regex() may reset it if
10484 an offset is available in the parsed pattern. */
10485 
10486 HAD_CB_ERROR:
10487 ptr = pattern + cb.erroroffset;
10488 
10489 HAD_EARLY_ERROR:
10490 *erroroffset = ptr - pattern;
10491 
10492 HAD_ERROR:
10493 *errorptr = errorcode;
10494 pcre2_code_free(re);
10495 re = NULL;
10496 goto EXIT;
10497 }
10498 
10499 /* End of pcre2_compile.c */
10500